pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -2,154 +2,192 @@ from __future__ import annotations
2
2
 
3
3
  import abc
4
4
  import builtins
5
+ import datetime
5
6
  import json
6
7
  import logging
8
+ from keyword import iskeyword as is_python_keyword
7
9
  from pathlib import Path
8
- from typing import _GenericAlias # type: ignore[attr-defined]
9
- from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Sequence, Union, overload
10
+ from typing import TYPE_CHECKING, Any, Iterable, Literal
10
11
  from uuid import UUID
11
12
 
12
13
  import pandas as pd
13
14
  import sqlalchemy as sql
15
+ from typing_extensions import overload
14
16
 
15
17
  import pixeltable as pxt
16
- import pixeltable.catalog as catalog
17
- import pixeltable.env as env
18
- import pixeltable.exceptions as excs
19
- import pixeltable.exprs as exprs
20
- import pixeltable.index as index
21
- import pixeltable.metadata.schema as schema
22
- import pixeltable.type_system as ts
18
+ from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
19
+ from pixeltable.catalog.table_metadata import (
20
+ ColumnMetadata,
21
+ EmbeddingIndexParams,
22
+ IndexMetadata,
23
+ TableMetadata,
24
+ VersionMetadata,
25
+ )
26
+ from pixeltable.metadata import schema
27
+ from pixeltable.metadata.utils import MetadataUtils
28
+ from pixeltable.utils.object_stores import ObjectOps
23
29
 
24
30
  from ..exprs import ColumnRef
25
31
  from ..utils.description_helper import DescriptionHelper
26
32
  from ..utils.filecache import FileCache
27
33
  from .column import Column
28
- from .globals import _ROWID_COLUMN_NAME, MediaValidation, UpdateStatus, is_system_column_name, is_valid_identifier
34
+ from .globals import (
35
+ _ROWID_COLUMN_NAME,
36
+ IfExistsParam,
37
+ IfNotExistsParam,
38
+ MediaValidation,
39
+ is_system_column_name,
40
+ is_valid_identifier,
41
+ )
29
42
  from .schema_object import SchemaObject
30
- from .table_version import TableVersion
43
+ from .table_version_handle import TableVersionHandle
31
44
  from .table_version_path import TableVersionPath
45
+ from .update_status import UpdateStatus
46
+
47
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
48
+
32
49
 
33
50
  if TYPE_CHECKING:
34
51
  import torch.utils.data
52
+
35
53
  import pixeltable.plan
54
+ from pixeltable.globals import TableDataSource
55
+
36
56
 
37
57
  _logger = logging.getLogger('pixeltable')
38
58
 
59
+
39
60
  class Table(SchemaObject):
40
61
  """
41
62
  A handle to a table, view, or snapshot. This class is the primary interface through which table operations
42
63
  (queries, insertions, updates, etc.) are performed in Pixeltable.
64
+
65
+ Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
66
+ FileCache.emit_eviction_warnings() at the end of the operation.
43
67
  """
44
- # Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
45
- # FileCache.emit_eviction_warnings() at the end of the operation.
68
+
69
+ # the chain of TableVersions needed to run queries and supply metadata (eg, schema)
70
+ _tbl_version_path: TableVersionPath
71
+
72
+ # the physical TableVersion backing this Table; None for pure snapshots
73
+ _tbl_version: TableVersionHandle | None
46
74
 
47
75
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
48
76
  super().__init__(id, name, dir_id)
49
- self._is_dropped = False
50
- self.__tbl_version_path = tbl_version_path
51
- self.__query_scope = self.QueryScope(self)
77
+ self._tbl_version_path = tbl_version_path
78
+ self._tbl_version = None
52
79
 
53
- class QueryScope:
54
- __table: 'Table'
55
- _queries: dict[str, pxt.func.QueryTemplateFunction]
80
+ def _move(self, new_name: str, new_dir_id: UUID) -> None:
81
+ old_name = self._name
82
+ old_dir_id = self._dir_id
56
83
 
57
- def __init__(self, table: 'Table') -> None:
58
- self.__table = table
59
- self._queries = {}
84
+ cat = catalog.Catalog.get()
60
85
 
61
- def __getattr__(self, name: str) -> pxt.func.QueryTemplateFunction:
62
- if name in self._queries:
63
- return self._queries[name]
64
- raise AttributeError(f'Table {self.__table._name!r} has no query with that name: {name!r}')
86
+ @cat.register_undo_action
87
+ def _() -> None:
88
+ # TODO: We should really be invalidating the Table instance and forcing a reload.
89
+ self._name = old_name
90
+ self._dir_id = old_dir_id
65
91
 
66
- def _move(self, new_name: str, new_dir_id: UUID) -> None:
67
- self._check_is_dropped()
68
92
  super()._move(new_name, new_dir_id)
69
- with env.Env.get().engine.begin() as conn:
70
- stmt = sql.text((
71
- f"UPDATE {schema.Table.__table__} "
72
- f"SET {schema.Table.dir_id.name} = :new_dir_id, "
73
- f" {schema.Table.md.name}['name'] = :new_name "
74
- f"WHERE {schema.Table.id.name} = :id"))
75
- conn.execute(stmt, {'new_dir_id': new_dir_id, 'new_name': json.dumps(new_name), 'id': self._id})
76
-
77
- def get_metadata(self) -> dict[str, Any]:
93
+ conn = env.Env.get().conn
94
+ stmt = sql.text(
95
+ (
96
+ f'UPDATE {schema.Table.__table__} '
97
+ f'SET {schema.Table.dir_id.name} = :new_dir_id, '
98
+ f" {schema.Table.md.name} = jsonb_set({schema.Table.md.name}, '{{name}}', (:new_name)::jsonb) "
99
+ f'WHERE {schema.Table.id.name} = :id'
100
+ )
101
+ )
102
+ conn.execute(stmt, {'new_dir_id': new_dir_id, 'new_name': json.dumps(new_name), 'id': self._id})
103
+
104
+ # this is duplicated from SchemaObject so that our API docs show the docstring for Table
105
+ def get_metadata(self) -> 'TableMetadata':
78
106
  """
79
107
  Retrieves metadata associated with this table.
80
108
 
81
109
  Returns:
82
- A dictionary containing the metadata, in the following format:
83
-
84
- ```python
85
- {
86
- 'base': None, # If this is a view or snapshot, will contain the name of its base table
87
- 'schema': {
88
- 'col1': StringType(),
89
- 'col2': IntType(),
90
- },
91
- 'version': 22,
92
- 'schema_version': 1,
93
- 'comment': '',
94
- 'num_retained_versions': 10,
95
- 'is_view': False,
96
- 'is_snapshot': False,
97
- 'media_validation': 'on_write',
98
- }
99
- ```
110
+ A [TableMetadata][pixeltable.TableMetadata] instance containing this table's metadata.
100
111
  """
101
- self._check_is_dropped()
102
- md = super().get_metadata()
103
- md['base'] = self._base._path if self._base is not None else None
104
- md['schema'] = self._schema
105
- md['version'] = self._version
106
- md['schema_version'] = self._tbl_version.schema_version
107
- md['comment'] = self._comment
108
- md['num_retained_versions'] = self._num_retained_versions
109
- md['media_validation'] = self._media_validation.name.lower()
110
- return md
111
-
112
- @property
113
- def _version(self) -> int:
114
- """Return the version of this table. Used by tests to ascertain version changes."""
115
- return self._tbl_version.version
116
-
117
- @property
118
- def _tbl_version(self) -> TableVersion:
119
- """Return TableVersion for just this table."""
120
- return self._tbl_version_path.tbl_version
112
+ from pixeltable.catalog import retry_loop
113
+
114
+ @retry_loop(for_write=False)
115
+ def op() -> 'TableMetadata':
116
+ return self._get_metadata()
117
+
118
+ return op()
119
+
120
+ def _get_metadata(self) -> TableMetadata:
121
+ tvp = self._tbl_version_path
122
+ tv = tvp.tbl_version.get()
123
+ columns = tvp.columns()
124
+ column_info: dict[str, ColumnMetadata] = {}
125
+ for col in columns:
126
+ column_info[col.name] = ColumnMetadata(
127
+ name=col.name,
128
+ type_=col.col_type._to_str(as_schema=True),
129
+ version_added=col.schema_version_add,
130
+ is_stored=col.is_stored,
131
+ is_primary_key=col.is_pk,
132
+ media_validation=col.media_validation.name.lower() if col.media_validation is not None else None, # type: ignore[typeddict-item]
133
+ computed_with=col.value_expr.display_str(inline=False) if col.value_expr is not None else None,
134
+ defined_in=col.get_tbl().name,
135
+ )
121
136
 
122
- @property
123
- def _tbl_version_path(self) -> TableVersionPath:
124
- """Return TableVersionPath for just this table."""
125
- self._check_is_dropped()
126
- return self.__tbl_version_path
137
+ indices = tv.idxs_by_name.values()
138
+ index_info: dict[str, IndexMetadata] = {}
139
+ for info in indices:
140
+ if isinstance(info.idx, index.EmbeddingIndex):
141
+ col_ref = ColumnRef(info.col)
142
+ embedding = info.idx.embeddings[info.col.col_type._type](col_ref)
143
+ index_info[info.name] = IndexMetadata(
144
+ name=info.name,
145
+ columns=[info.col.name],
146
+ index_type='embedding',
147
+ parameters=EmbeddingIndexParams(
148
+ metric=info.idx.metric.name.lower(), # type: ignore[typeddict-item]
149
+ embedding=str(embedding),
150
+ embedding_functions=[str(fn) for fn in info.idx.embeddings.values()],
151
+ ),
152
+ )
127
153
 
128
- def __hash__(self) -> int:
129
- return hash(self._tbl_version.id)
154
+ return TableMetadata(
155
+ name=self._name,
156
+ path=self._path(),
157
+ columns=column_info,
158
+ indices=index_info,
159
+ is_replica=tv.is_replica,
160
+ is_view=False,
161
+ is_snapshot=False,
162
+ version=self._get_version(),
163
+ version_created=datetime.datetime.fromtimestamp(tv.created_at, tz=datetime.timezone.utc),
164
+ schema_version=tvp.schema_version(),
165
+ comment=self._get_comment(),
166
+ media_validation=self._get_media_validation().name.lower(), # type: ignore[typeddict-item]
167
+ base=None,
168
+ )
130
169
 
131
- def _check_is_dropped(self) -> None:
132
- if self._is_dropped:
133
- raise excs.Error(f'{self._display_name()} {self._name} has been dropped')
170
+ def _get_version(self) -> int:
171
+ """Return the version of this table. Used by tests to ascertain version changes."""
172
+ return self._tbl_version_path.version()
134
173
 
135
- def __getattr__(self, name: str) -> 'pxt.exprs.ColumnRef':
136
- """Return a ColumnRef for the given name.
137
- """
138
- return self._tbl_version_path.get_column_ref(name)
174
+ def _get_pxt_uri(self) -> str | None:
175
+ with catalog.Catalog.get().begin_xact(tbl_id=self._id):
176
+ return catalog.Catalog.get().get_additional_md(self._id).get('pxt_uri')
139
177
 
140
- @overload
141
- def __getitem__(self, name: str) -> 'pxt.exprs.ColumnRef': ...
178
+ def __hash__(self) -> int:
179
+ return hash(self._tbl_version_path.tbl_id)
142
180
 
143
- @overload
144
- def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> 'pxt.DataFrame': ...
181
+ def __getattr__(self, name: str) -> 'exprs.ColumnRef':
182
+ """Return a ColumnRef for the given name."""
183
+ col = self._tbl_version_path.get_column(name)
184
+ if col is None:
185
+ raise AttributeError(f'Unknown column: {name}')
186
+ return ColumnRef(col, reference_tbl=self._tbl_version_path)
145
187
 
146
- def __getitem__(self, index):
147
- """Return a ColumnRef or QueryTemplateFunction for the given name, or a DataFrame for the given slice.
148
- """
149
- if isinstance(index, str):
150
- return getattr(self, index)
151
- else:
152
- return self._df()[index]
188
+ def __getitem__(self, name: str) -> 'exprs.ColumnRef':
189
+ """Return a ColumnRef for the given name."""
190
+ return getattr(self, name)
153
191
 
154
192
  def list_views(self, *, recursive: bool = True) -> list[str]:
155
193
  """
@@ -162,130 +200,160 @@ class Table(SchemaObject):
162
200
  Returns:
163
201
  A list of view paths.
164
202
  """
165
- self._check_is_dropped()
166
- return [t._path for t in self._get_views(recursive=recursive)]
203
+ from pixeltable.catalog import retry_loop
167
204
 
168
- def _get_views(self, *, recursive: bool = True) -> list['Table']:
169
- dependents = catalog.Catalog.get().tbl_dependents[self._id]
205
+ # we need retry_loop() here, because we end up loading Tables for the views
206
+ @retry_loop(tbl=self._tbl_version_path, for_write=False)
207
+ def op() -> list[str]:
208
+ return [t._path() for t in self._get_views(recursive=recursive)]
209
+
210
+ return op()
211
+
212
+ def _get_views(self, *, recursive: bool = True, mutable_only: bool = False) -> list['Table']:
213
+ cat = catalog.Catalog.get()
214
+ view_ids = cat.get_view_ids(self._id)
215
+ views = [cat.get_table_by_id(id) for id in view_ids]
216
+ if mutable_only:
217
+ views = [t for t in views if t._tbl_version_path.is_mutable()]
170
218
  if recursive:
171
- return dependents + [t for view in dependents for t in view._get_views(recursive=True)]
172
- else:
173
- return dependents
219
+ views.extend(t for view in views for t in view._get_views(recursive=True, mutable_only=mutable_only))
220
+ return views
221
+
222
+ def select(self, *items: Any, **named_items: Any) -> 'pxt.Query':
223
+ """Select columns or expressions from this table.
174
224
 
175
- def _df(self) -> 'pxt.dataframe.DataFrame':
176
- """Return a DataFrame for this table.
225
+ See [`Query.select`][pixeltable.Query.select] for more details.
177
226
  """
178
- # local import: avoid circular imports
227
+ from pixeltable.catalog import Catalog
179
228
  from pixeltable.plan import FromClause
180
- return pxt.DataFrame(FromClause(tbls=[self._tbl_version_path]))
181
229
 
182
- @property
183
- def queries(self) -> 'Table.QueryScope':
184
- return self.__query_scope
230
+ query = pxt.Query(FromClause(tbls=[self._tbl_version_path]))
231
+ if len(items) == 0 and len(named_items) == 0:
232
+ return query # Select(*); no further processing is necessary
185
233
 
186
- def select(self, *items: Any, **named_items: Any) -> 'pxt.DataFrame':
187
- """Return a [`DataFrame`][pixeltable.DataFrame] for this table."""
188
- return self._df().select(*items, **named_items)
234
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
235
+ return query.select(*items, **named_items)
189
236
 
190
- def where(self, pred: 'exprs.Expr') -> 'pxt.DataFrame':
191
- """Return a [`DataFrame`][pixeltable.DataFrame] for this table."""
192
- return self._df().where(pred)
237
+ def where(self, pred: 'exprs.Expr') -> 'pxt.Query':
238
+ """Filter rows from this table based on the expression.
239
+
240
+ See [`Query.where`][pixeltable.Query.where] for more details.
241
+ """
242
+ from pixeltable.catalog import Catalog
243
+
244
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
245
+ return self.select().where(pred)
193
246
 
194
247
  def join(
195
- self, other: 'Table', *, on: Optional['exprs.Expr'] = None,
196
- how: 'pixeltable.plan.JoinType.LiteralType' = 'inner'
197
- ) -> 'pxt.DataFrame':
198
- """Return a [`DataFrame`][pixeltable.DataFrame] for this table."""
199
- return self._df().join(other, on=on, how=how)
248
+ self, other: 'Table', *, on: 'exprs.Expr' | None = None, how: 'pixeltable.plan.JoinType.LiteralType' = 'inner'
249
+ ) -> 'pxt.Query':
250
+ """Join this table with another table."""
251
+ from pixeltable.catalog import Catalog
200
252
 
201
- def order_by(self, *items: 'exprs.Expr', asc: bool = True) -> 'pxt.DataFrame':
202
- """Return a [`DataFrame`][pixeltable.DataFrame] for this table."""
203
- return self._df().order_by(*items, asc=asc)
253
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
254
+ return self.select().join(other, on=on, how=how)
204
255
 
205
- def group_by(self, *items: 'exprs.Expr') -> 'pxt.DataFrame':
206
- """Return a [`DataFrame`][pixeltable.DataFrame] for this table."""
207
- return self._df().group_by(*items)
256
+ def order_by(self, *items: 'exprs.Expr', asc: bool = True) -> 'pxt.Query':
257
+ """Order the rows of this table based on the expression.
208
258
 
209
- def limit(self, n: int) -> 'pxt.DataFrame':
210
- return self._df().limit(n)
259
+ See [`Query.order_by`][pixeltable.Query.order_by] for more details.
260
+ """
261
+ from pixeltable.catalog import Catalog
211
262
 
212
- def collect(self) -> 'pxt.dataframe.DataFrameResultSet':
213
- """Return rows from this table."""
214
- return self._df().collect()
263
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
264
+ return self.select().order_by(*items, asc=asc)
215
265
 
216
- def show(
217
- self, *args, **kwargs
218
- ) -> 'pxt.dataframe.DataFrameResultSet':
219
- """Return rows from this table.
266
+ def group_by(self, *items: 'exprs.Expr') -> 'pxt.Query':
267
+ """Group the rows of this table based on the expression.
268
+
269
+ See [`Query.group_by`][pixeltable.Query.group_by] for more details.
220
270
  """
221
- return self._df().show(*args, **kwargs)
271
+ from pixeltable.catalog import Catalog
272
+
273
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
274
+ return self.select().group_by(*items)
275
+
276
+ def distinct(self) -> 'pxt.Query':
277
+ """Remove duplicate rows from table."""
278
+ return self.select().distinct()
279
+
280
+ def limit(self, n: int) -> 'pxt.Query':
281
+ return self.select().limit(n)
282
+
283
+ def sample(
284
+ self,
285
+ n: int | None = None,
286
+ n_per_stratum: int | None = None,
287
+ fraction: float | None = None,
288
+ seed: int | None = None,
289
+ stratify_by: Any = None,
290
+ ) -> pxt.Query:
291
+ """Choose a shuffled sample of rows
292
+
293
+ See [`Query.sample`][pixeltable.Query.sample] for more details.
294
+ """
295
+ return self.select().sample(
296
+ n=n, n_per_stratum=n_per_stratum, fraction=fraction, seed=seed, stratify_by=stratify_by
297
+ )
298
+
299
+ def collect(self) -> 'pxt._query.ResultSet':
300
+ """Return rows from this table."""
301
+ return self.select().collect()
302
+
303
+ def show(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
304
+ """Return rows from this table."""
305
+ return self.select().show(*args, **kwargs)
222
306
 
223
- def head(
224
- self, *args, **kwargs
225
- ) -> 'pxt.dataframe.DataFrameResultSet':
307
+ def head(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
226
308
  """Return the first n rows inserted into this table."""
227
- return self._df().head(*args, **kwargs)
309
+ return self.select().head(*args, **kwargs)
228
310
 
229
- def tail(
230
- self, *args, **kwargs
231
- ) -> 'pxt.dataframe.DataFrameResultSet':
311
+ def tail(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
232
312
  """Return the last n rows inserted into this table."""
233
- return self._df().tail(*args, **kwargs)
313
+ return self.select().tail(*args, **kwargs)
234
314
 
235
315
  def count(self) -> int:
236
316
  """Return the number of rows in this table."""
237
- return self._df().count()
317
+ return self.select().count()
238
318
 
239
- @property
240
319
  def columns(self) -> list[str]:
241
- """Return the names of the columns in this table. """
320
+ """Return the names of the columns in this table."""
242
321
  cols = self._tbl_version_path.columns()
243
322
  return [c.name for c in cols]
244
323
 
245
- @property
246
- def _schema(self) -> dict[str, ts.ColumnType]:
324
+ def _get_schema(self) -> dict[str, ts.ColumnType]:
247
325
  """Return the schema (column names and column types) of this table."""
248
326
  return {c.name: c.col_type for c in self._tbl_version_path.columns()}
249
327
 
250
- @property
251
- def _query_names(self) -> list[str]:
252
- """Return the names of the registered queries for this table."""
253
- return list(self.__query_scope._queries.keys())
328
+ def get_base_table(self) -> 'Table' | None:
329
+ return self._get_base_table()
254
330
 
255
- @property
256
- def _base(self) -> Optional['Table']:
257
- """
258
- The base table of this `Table`. If this table is a view, returns the `Table`
259
- from which it was derived. Otherwise, returns `None`.
260
- """
261
- if self._tbl_version_path.base is None:
262
- return None
263
- base_id = self._tbl_version_path.base.tbl_version.id
264
- return catalog.Catalog.get().tbls[base_id]
331
+ @abc.abstractmethod
332
+ def _get_base_table(self) -> 'Table' | None:
333
+ """The base's Table instance. Requires a transaction context"""
265
334
 
266
- @property
267
- def _bases(self) -> list['Table']:
268
- """
269
- The ancestor list of bases of this table, starting with its immediate base.
270
- """
271
- bases = []
272
- base = self._base
335
+ def _get_base_tables(self) -> list['Table']:
336
+ """The ancestor list of bases of this table, starting with its immediate base. Requires a transaction context"""
337
+ bases: list[Table] = []
338
+ base = self._get_base_table()
273
339
  while base is not None:
274
340
  bases.append(base)
275
- base = base._base
341
+ base = base._get_base_table()
276
342
  return bases
277
343
 
278
344
  @property
279
- def _comment(self) -> str:
280
- return self._tbl_version.comment
345
+ @abc.abstractmethod
346
+ def _effective_base_versions(self) -> list[int | None]:
347
+ """The effective versions of the ancestor bases, starting with its immediate base."""
281
348
 
282
- @property
283
- def _num_retained_versions(self):
284
- return self._tbl_version.num_retained_versions
349
+ def _get_comment(self) -> str:
350
+ return self._tbl_version_path.comment()
285
351
 
286
- @property
287
- def _media_validation(self) -> MediaValidation:
288
- return self._tbl_version.media_validation
352
+ def _get_num_retained_versions(self) -> int:
353
+ return self._tbl_version_path.num_retained_versions()
354
+
355
+ def _get_media_validation(self) -> MediaValidation:
356
+ return self._tbl_version_path.media_validation()
289
357
 
290
358
  def __repr__(self) -> str:
291
359
  return self._descriptors().to_string()
@@ -297,74 +365,56 @@ class Table(SchemaObject):
297
365
  """
298
366
  Constructs a list of descriptors for this table that can be pretty-printed.
299
367
  """
300
- helper = DescriptionHelper()
301
- helper.append(self._title_descriptor())
302
- helper.append(self._col_descriptor())
303
- idxs = self._index_descriptor()
304
- if not idxs.empty:
305
- helper.append(idxs)
306
- stores = self._external_store_descriptor()
307
- if not stores.empty:
308
- helper.append(stores)
309
- if self._comment:
310
- helper.append(f'COMMENT: {self._comment}')
311
- return helper
312
-
313
- def _title_descriptor(self) -> str:
314
- title: str
315
- if self._base is None:
316
- title = f'Table\n{self._path!r}'
317
- else:
318
- title = f'View\n{self._path!r}'
319
- title += f'\n(of {self.__bases_to_desc()})'
320
- return title
321
-
322
- def _col_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
368
+ from pixeltable.catalog import Catalog
369
+
370
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
371
+ helper = DescriptionHelper()
372
+ helper.append(self._table_descriptor())
373
+ helper.append(self._col_descriptor())
374
+ idxs = self._index_descriptor()
375
+ if not idxs.empty:
376
+ helper.append(idxs)
377
+ stores = self._external_store_descriptor()
378
+ if not stores.empty:
379
+ helper.append(stores)
380
+ if self._get_comment():
381
+ helper.append(f'COMMENT: {self._get_comment()}')
382
+ return helper
383
+
384
+ def _col_descriptor(self, columns: list[str] | None = None) -> pd.DataFrame:
323
385
  return pd.DataFrame(
324
386
  {
325
387
  'Column Name': col.name,
326
388
  'Type': col.col_type._to_str(as_schema=True),
327
- 'Computed With': col.value_expr.display_str(inline=False) if col.value_expr is not None else ''
389
+ 'Computed With': col.value_expr.display_str(inline=False) if col.value_expr is not None else '',
328
390
  }
329
- for col in self.__tbl_version_path.columns()
391
+ for col in self._tbl_version_path.columns()
330
392
  if columns is None or col.name in columns
331
393
  )
332
394
 
333
- def __bases_to_desc(self) -> str:
334
- bases = self._bases
335
- assert len(bases) >= 1
336
- if len(bases) <= 2:
337
- return ', '.join(repr(b._path) for b in bases)
338
- else:
339
- return f'{bases[0]._path!r}, ..., {bases[-1]._path!r}'
340
-
341
- def _index_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
395
+ def _index_descriptor(self, columns: list[str] | None = None) -> pd.DataFrame:
342
396
  from pixeltable import index
343
397
 
398
+ if self._tbl_version is None:
399
+ return pd.DataFrame([])
344
400
  pd_rows = []
345
- for name, info in self._tbl_version.idxs_by_name.items():
401
+ for name, info in self._tbl_version.get().idxs_by_name.items():
346
402
  if isinstance(info.idx, index.EmbeddingIndex) and (columns is None or info.col.name in columns):
347
- display_embed = info.idx.string_embed if info.col.col_type.is_string_type() else info.idx.image_embed
348
- if info.idx.string_embed is not None and info.idx.image_embed is not None:
349
- embed_str = f'{display_embed} (+1)'
350
- else:
351
- embed_str = str(display_embed)
403
+ col_ref = ColumnRef(info.col)
404
+ embedding = info.idx.embeddings[info.col.col_type._type](col_ref)
352
405
  row = {
353
406
  'Index Name': name,
354
407
  'Column': info.col.name,
355
408
  'Metric': str(info.idx.metric.name.lower()),
356
- 'Embedding': embed_str,
409
+ 'Embedding': str(embedding),
357
410
  }
358
411
  pd_rows.append(row)
359
412
  return pd.DataFrame(pd_rows)
360
413
 
361
414
  def _external_store_descriptor(self) -> pd.DataFrame:
362
415
  pd_rows = []
363
- for name, store in self._tbl_version.external_stores.items():
364
- row = {
365
- 'External Store': name,
366
- 'Type': type(store).__name__,
367
- }
416
+ for name, store in self._tbl_version_path.tbl_version.get().external_stores.items():
417
+ row = {'External Store': name, 'Type': type(store).__name__}
368
418
  pd_rows.append(row)
369
419
  return pd.DataFrame(pd_rows)
370
420
 
@@ -372,77 +422,103 @@ class Table(SchemaObject):
372
422
  """
373
423
  Print the table schema.
374
424
  """
375
- self._check_is_dropped()
376
425
  if getattr(builtins, '__IPYTHON__', False):
377
- from IPython.display import display
378
- display(self._repr_html_())
426
+ from IPython.display import Markdown, display
427
+
428
+ display(Markdown(self._repr_html_()))
379
429
  else:
380
430
  print(repr(self))
381
431
 
382
- def _drop(self) -> None:
383
- cat = catalog.Catalog.get()
384
- # verify all dependents are deleted by now
385
- for dep in cat.tbl_dependents[self._id]:
386
- assert dep._is_dropped
387
- self._check_is_dropped()
388
- self._tbl_version.drop()
389
- self._is_dropped = True
390
- # update catalog
391
- cat = catalog.Catalog.get()
392
- del cat.tbls[self._id]
393
-
394
432
  # TODO Factor this out into a separate module.
395
433
  # The return type is unresolvable, but torch can't be imported since it's an optional dependency.
396
- def to_pytorch_dataset(self, image_format : str = 'pt') -> 'torch.utils.data.IterableDataset':
434
+ def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
397
435
  """Return a PyTorch Dataset for this table.
398
- See DataFrame.to_pytorch_dataset()
436
+ See Query.to_pytorch_dataset()
399
437
  """
400
- return self._df().to_pytorch_dataset(image_format=image_format)
438
+ return self.select().to_pytorch_dataset(image_format=image_format)
401
439
 
402
440
  def to_coco_dataset(self) -> Path:
403
441
  """Return the path to a COCO json file for this table.
404
- See DataFrame.to_coco_dataset()
405
- """
406
- return self._df().to_coco_dataset()
407
-
408
- def __setitem__(self, col_name: str, spec: Union[ts.ColumnType, exprs.Expr]) -> None:
442
+ See Query.to_coco_dataset()
409
443
  """
410
- Adds a column to the table. This is an alternate syntax for `add_column()`; the meaning of
444
+ return self.select().to_coco_dataset()
411
445
 
412
- >>> tbl['new_col'] = pxt.Int
413
-
414
- is exactly equivalent to
446
+ def _column_has_dependents(self, col: Column) -> bool:
447
+ """Returns True if the column has dependents, False otherwise."""
448
+ assert col is not None
449
+ assert col.name in self._get_schema()
450
+ cat = catalog.Catalog.get()
451
+ if any(c.name is not None for c in cat.get_column_dependents(col.get_tbl().id, col.id)):
452
+ return True
453
+ assert self._tbl_version is not None
454
+ return any(
455
+ col in store.get_local_columns()
456
+ for view in (self, *self._get_views(recursive=True))
457
+ for store in view._tbl_version.get().external_stores.values()
458
+ )
415
459
 
416
- >>> tbl.add_column(new_col=pxt.Int)
460
+ def _ignore_or_drop_existing_columns(self, new_col_names: list[str], if_exists: IfExistsParam) -> list[str]:
461
+ """Check and handle existing columns in the new column specification based on the if_exists parameter.
417
462
 
418
- For details, see the documentation for [`add_column()`][pixeltable.catalog.Table.add_column].
463
+ If `if_exists='ignore'`, returns a list of existing columns, if any, in `new_col_names`.
419
464
  """
420
- self._check_is_dropped()
421
- if not isinstance(col_name, str):
422
- raise excs.Error(f'Column name must be a string, got {type(col_name)}')
423
- if not isinstance(spec, (ts.ColumnType, exprs.Expr, type, _GenericAlias)):
424
- raise excs.Error(f'Column spec must be a ColumnType, Expr, or type, got {type(spec)}')
425
- self.add_column(stored=None, print_stats=False, on_error='abort', **{col_name: spec})
465
+ assert self._tbl_version is not None
466
+ existing_col_names = set(self._get_schema().keys())
467
+ cols_to_ignore = []
468
+ for new_col_name in new_col_names:
469
+ if new_col_name in existing_col_names:
470
+ if if_exists == IfExistsParam.ERROR:
471
+ raise excs.Error(f'Duplicate column name: {new_col_name}')
472
+ elif if_exists == IfExistsParam.IGNORE:
473
+ cols_to_ignore.append(new_col_name)
474
+ elif if_exists in (IfExistsParam.REPLACE, IfExistsParam.REPLACE_FORCE):
475
+ if new_col_name not in self._tbl_version.get().cols_by_name:
476
+ # for views, it is possible that the existing column
477
+ # is a base table column; in that case, we should not
478
+ # drop/replace that column. Continue to raise error.
479
+ raise excs.Error(f'Column {new_col_name!r} is a base table column. Cannot replace it.')
480
+ col = self._tbl_version.get().cols_by_name[new_col_name]
481
+ # cannot drop a column with dependents; so reject
482
+ # replace directive if column has dependents.
483
+ if self._column_has_dependents(col):
484
+ raise excs.Error(
485
+ f'Column {new_col_name!r} already exists and has dependents. '
486
+ f'Cannot {if_exists.name.lower()} it.'
487
+ )
488
+ self.drop_column(new_col_name)
489
+ assert new_col_name not in self._tbl_version.get().cols_by_name
490
+ return cols_to_ignore
426
491
 
427
492
  def add_columns(
428
493
  self,
429
- schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]]
494
+ schema: dict[str, ts.ColumnType | builtins.type | _GenericAlias],
495
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
430
496
  ) -> UpdateStatus:
431
497
  """
432
- Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed columns,
433
- use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
498
+ Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
499
+ columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
434
500
 
435
- The format of the `schema` argument is identical to the format of the schema in a call to
436
- [`create_table()`][pixeltable.globals.create_table].
501
+ The format of the `schema` argument is a dict mapping column names to their types.
437
502
 
438
503
  Args:
439
504
  schema: A dictionary mapping column names to types.
505
+ if_exists: Determines the behavior if a column already exists. Must be one of the following:
506
+
507
+ - `'error'`: an exception will be raised.
508
+ - `'ignore'`: do nothing and return.
509
+ - `'replace' or 'replace_force'`: drop the existing column and add the new column, if it has no
510
+ dependents.
511
+
512
+ Note that the `if_exists` parameter is applied to all columns in the schema.
513
+ To apply different behaviors to different columns, please use
514
+ [`add_column()`][pixeltable.Table.add_column] for each column.
440
515
 
441
516
  Returns:
442
517
  Information about the execution status of the operation.
443
518
 
444
519
  Raises:
445
- Error: If any column name is invalid or already exists.
520
+ Error: If any column name is invalid, or already exists and `if_exists='error'`,
521
+ or `if_exists='replace*'` but the column has dependents or is a base table column.
446
522
 
447
523
  Examples:
448
524
  Add multiple columns to the table `my_table`:
@@ -454,50 +530,60 @@ class Table(SchemaObject):
454
530
  ... }
455
531
  ... tbl.add_columns(schema)
456
532
  """
457
- self._check_is_dropped()
458
- col_schema = {
459
- col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
460
- for col_name, spec in schema.items()
461
- }
462
- new_cols = self._create_columns(col_schema)
463
- for new_col in new_cols:
464
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
465
- status = self._tbl_version.add_columns(new_cols, print_stats=False, on_error='abort')
466
- FileCache.get().emit_eviction_warnings()
467
- return status
468
-
469
- # TODO: add_column() still supports computed columns for backward-compatibility. In the future, computed columns
470
- # will be supported only through add_computed_column(). At that point, we can remove the `stored`,
471
- # `print_stats`, and `on_error` parameters, and change the method body to simply call self.add_columns(kwargs),
472
- # simplifying the code. For the time being, there's some obvious code duplication.
533
+ from pixeltable.catalog import Catalog
534
+
535
+ # lock_mutable_tree=True: we might end up having to drop existing columns, which requires locking the tree
536
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
537
+ self.__check_mutable('add columns to')
538
+ col_schema = {
539
+ col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
540
+ for col_name, spec in schema.items()
541
+ }
542
+
543
+ # handle existing columns based on if_exists parameter
544
+ cols_to_ignore = self._ignore_or_drop_existing_columns(
545
+ list(col_schema.keys()), IfExistsParam.validated(if_exists, 'if_exists')
546
+ )
547
+ # if all columns to be added already exist and user asked to ignore
548
+ # existing columns, there's nothing to do.
549
+ for cname in cols_to_ignore:
550
+ assert cname in col_schema
551
+ del col_schema[cname]
552
+ result = UpdateStatus()
553
+ if len(col_schema) == 0:
554
+ return result
555
+ new_cols = self._create_columns(col_schema)
556
+ for new_col in new_cols:
557
+ self._verify_column(new_col)
558
+ assert self._tbl_version is not None
559
+ result += self._tbl_version.get().add_columns(new_cols, print_stats=False, on_error='abort')
560
+ FileCache.get().emit_eviction_warnings()
561
+ return result
562
+
473
563
  def add_column(
474
564
  self,
475
565
  *,
476
- stored: Optional[bool] = None,
477
- print_stats: bool = False,
478
- on_error: Literal['abort', 'ignore'] = 'abort',
479
- **kwargs: Union[ts.ColumnType, builtins.type, _GenericAlias, exprs.Expr]
566
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
567
+ **kwargs: ts.ColumnType | builtins.type | _GenericAlias | exprs.Expr,
480
568
  ) -> UpdateStatus:
481
569
  """
482
- Adds a column to the table.
570
+ Adds an ordinary (non-computed) column to the table.
483
571
 
484
572
  Args:
485
573
  kwargs: Exactly one keyword argument of the form `col_name=col_type`.
486
- stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
487
- print_stats: If `True`, print execution metrics during evaluation.
488
- on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
489
- row.
574
+ if_exists: Determines the behavior if the column already exists. Must be one of the following:
490
575
 
491
- - `'abort'`: an exception will be raised and the column will not be added.
492
- - `'ignore'`: execution will continue and the column will be added. Any rows
493
- with errors will have a `None` value for the column, with information about the error stored in the
494
- corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
576
+ - `'error'`: an exception will be raised.
577
+ - `'ignore'`: do nothing and return.
578
+ - `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has
579
+ no dependents.
495
580
 
496
581
  Returns:
497
582
  Information about the execution status of the operation.
498
583
 
499
584
  Raises:
500
- Error: If the column name is invalid or already exists.
585
+ Error: If the column name is invalid, or already exists and `if_exists='error'`,
586
+ or `if_exists='replace*'` but the column has dependents or is a base table column.
501
587
 
502
588
  Examples:
503
589
  Add an int column:
@@ -506,52 +592,59 @@ class Table(SchemaObject):
506
592
 
507
593
  Alternatively, this can also be expressed as:
508
594
 
509
- >>> tbl['new_col'] = pxt.Int
595
+ >>> tbl.add_columns({'new_col': pxt.Int})
510
596
  """
511
- self._check_is_dropped()
512
597
  # verify kwargs and construct column schema dict
513
598
  if len(kwargs) != 1:
514
599
  raise excs.Error(
515
- f'add_column() requires exactly one keyword argument of the form "col_name=col_type"; '
516
- f'got {len(kwargs)} instead ({", ".join(list(kwargs.keys()))})'
600
+ f'add_column() requires exactly one keyword argument of the form `col_name=col_type`; '
601
+ f'got {len(kwargs)} arguments instead ({", ".join(kwargs.keys())})'
517
602
  )
518
- col_name, spec = next(iter(kwargs.items()))
519
- if not is_valid_identifier(col_name):
520
- raise excs.Error(f'Invalid column name: {col_name!r}')
521
-
522
- col_schema: dict[str, Any] = {}
523
- if isinstance(spec, (ts.ColumnType, builtins.type, _GenericAlias)):
524
- col_schema['type'] = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
525
- else:
526
- col_schema['value'] = spec
527
- if stored is not None:
528
- col_schema['stored'] = stored
529
-
530
- new_col = self._create_columns({col_name: col_schema})[0]
531
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
532
- status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
533
- FileCache.get().emit_eviction_warnings()
534
- return status
603
+ col_type = next(iter(kwargs.values()))
604
+ if not isinstance(col_type, (ts.ColumnType, type, _GenericAlias)):
605
+ raise excs.Error(
606
+ 'The argument to add_column() must be a type; did you intend to use add_computed_column() instead?'
607
+ )
608
+ return self.add_columns(kwargs, if_exists=if_exists)
535
609
 
536
610
  def add_computed_column(
537
611
  self,
538
612
  *,
539
- stored: Optional[bool] = None,
613
+ stored: bool | None = None,
614
+ destination: str | Path | None = None,
540
615
  print_stats: bool = False,
541
616
  on_error: Literal['abort', 'ignore'] = 'abort',
542
- **kwargs: exprs.Expr
617
+ if_exists: Literal['error', 'ignore', 'replace'] = 'error',
618
+ **kwargs: exprs.Expr,
543
619
  ) -> UpdateStatus:
544
620
  """
545
621
  Adds a computed column to the table.
546
622
 
547
623
  Args:
548
624
  kwargs: Exactly one keyword argument of the form `col_name=expression`.
625
+ stored: Whether the column is materialized and stored or computed on demand.
626
+ destination: An object store reference for persisting computed files.
627
+ print_stats: If `True`, print execution metrics during evaluation.
628
+ on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
629
+ row.
630
+
631
+ - `'abort'`: an exception will be raised and the column will not be added.
632
+ - `'ignore'`: execution will continue and the column will be added. Any rows
633
+ with errors will have a `None` value for the column, with information about the error stored in the
634
+ corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
635
+ if_exists: Determines the behavior if the column already exists. Must be one of the following:
636
+
637
+ - `'error'`: an exception will be raised.
638
+ - `'ignore'`: do nothing and return.
639
+ - `'replace'`: drop the existing column and add the new column, if it has
640
+ no dependents.
549
641
 
550
642
  Returns:
551
643
  Information about the execution status of the operation.
552
644
 
553
645
  Raises:
554
- Error: If the column name is invalid or already exists.
646
+ Error: If the column name is invalid or already exists and `if_exists='error'`,
647
+ or `if_exists='replace*'` but the column has dependents or is a base table column.
555
648
 
556
649
  Examples:
557
650
  For a table with an image column `frame`, add an image column `rotated` that rotates the image by
@@ -563,25 +656,53 @@ class Table(SchemaObject):
563
656
 
564
657
  >>> tbl.add_computed_column(rotated=tbl.frame.rotate(90), stored=False)
565
658
  """
566
- self._check_is_dropped()
567
- if len(kwargs) != 1:
568
- raise excs.Error(
569
- f'add_computed_column() requires exactly one keyword argument of the form "column-name=type|value-expression"; '
570
- f'got {len(kwargs)} arguments instead ({", ".join(list(kwargs.keys()))})'
571
- )
572
- col_name, spec = next(iter(kwargs.items()))
573
- if not is_valid_identifier(col_name):
574
- raise excs.Error(f'Invalid column name: {col_name!r}')
575
-
576
- col_schema: dict[str, Any] = {'value': spec}
577
- if stored is not None:
578
- col_schema['stored'] = stored
659
+ from pixeltable.catalog import Catalog
579
660
 
580
- new_col = self._create_columns({col_name: col_schema})[0]
581
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
582
- status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
583
- FileCache.get().emit_eviction_warnings()
584
- return status
661
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
662
+ self.__check_mutable('add columns to')
663
+ if len(kwargs) != 1:
664
+ raise excs.Error(
665
+ f'add_computed_column() requires exactly one keyword argument of the form '
666
+ '`col_name=col_type` or `col_name=expression`; '
667
+ f'got {len(kwargs)} arguments instead ({", ".join(kwargs.keys())})'
668
+ )
669
+ col_name, spec = next(iter(kwargs.items()))
670
+ if not is_valid_identifier(col_name):
671
+ raise excs.Error(f'Invalid column name: {col_name}')
672
+
673
+ col_schema: dict[str, Any] = {'value': spec}
674
+ if stored is not None:
675
+ col_schema['stored'] = stored
676
+
677
+ if destination is not None:
678
+ col_schema['destination'] = destination
679
+
680
+ # Raise an error if the column expression refers to a column error property
681
+ if isinstance(spec, exprs.Expr):
682
+ for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
683
+ if e.is_cellmd_prop():
684
+ raise excs.Error(
685
+ f'Use of a reference to the {e.prop.name.lower()!r} property of another column '
686
+ f'is not allowed in a computed column.'
687
+ )
688
+
689
+ # handle existing columns based on if_exists parameter
690
+ cols_to_ignore = self._ignore_or_drop_existing_columns(
691
+ [col_name], IfExistsParam.validated(if_exists, 'if_exists')
692
+ )
693
+ # if the column to add already exists and user asked to ignore
694
+ # existing column, there's nothing to do.
695
+ result = UpdateStatus()
696
+ if len(cols_to_ignore) != 0:
697
+ assert cols_to_ignore[0] == col_name
698
+ return result
699
+
700
+ new_col = self._create_columns({col_name: col_schema})[0]
701
+ self._verify_column(new_col)
702
+ assert self._tbl_version is not None
703
+ result += self._tbl_version.get().add_columns([new_col], print_stats=print_stats, on_error=on_error)
704
+ FileCache.get().emit_eviction_warnings()
705
+ return result
585
706
 
586
707
  @classmethod
587
708
  def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
@@ -591,118 +712,132 @@ class Table(SchemaObject):
591
712
  (on account of containing Python Callables or Exprs).
592
713
  """
593
714
  assert isinstance(spec, dict)
594
- valid_keys = {'type', 'value', 'stored', 'media_validation'}
595
- for k in spec.keys():
715
+ valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
716
+ for k in spec:
596
717
  if k not in valid_keys:
597
- raise excs.Error(f'Column {name}: invalid key {k!r}')
718
+ raise excs.Error(f'Column {name!r}: invalid key {k!r}')
598
719
 
599
720
  if 'type' not in spec and 'value' not in spec:
600
- raise excs.Error(f"Column {name}: 'type' or 'value' must be specified")
721
+ raise excs.Error(f"Column {name!r}: 'type' or 'value' must be specified")
601
722
 
602
- if 'type' in spec:
603
- if not isinstance(spec['type'], (ts.ColumnType, type, _GenericAlias)):
604
- raise excs.Error(f'Column {name}: "type" must be a type or ColumnType, got {spec["type"]}')
723
+ if 'type' in spec and not isinstance(spec['type'], (ts.ColumnType, type, _GenericAlias)):
724
+ raise excs.Error(f"Column {name!r}: 'type' must be a type or ColumnType; got {spec['type']}")
605
725
 
606
726
  if 'value' in spec:
607
727
  value_expr = exprs.Expr.from_object(spec['value'])
608
728
  if value_expr is None:
609
- raise excs.Error(f'Column {name}: value must be a Pixeltable expression.')
729
+ raise excs.Error(f"Column {name!r}: 'value' must be a Pixeltable expression.")
610
730
  if 'type' in spec:
611
- raise excs.Error(f"Column {name}: 'type' is redundant if 'value' is specified")
731
+ raise excs.Error(f"Column {name!r}: 'type' is redundant if 'value' is specified")
612
732
 
613
733
  if 'media_validation' in spec:
614
- _ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name}: media_validation')
734
+ _ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name!r}: media_validation')
615
735
 
616
736
  if 'stored' in spec and not isinstance(spec['stored'], bool):
617
- raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
737
+ raise excs.Error(f"Column {name!r}: 'stored' must be a bool; got {spec['stored']}")
738
+
739
+ d = spec.get('destination')
740
+ if d is not None and not isinstance(d, (str, Path)):
741
+ raise excs.Error(f'Column {name!r}: `destination` must be a string or path; got {d}')
618
742
 
619
743
  @classmethod
620
744
  def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
621
745
  """Construct list of Columns, given schema"""
622
746
  columns: list[Column] = []
623
747
  for name, spec in schema.items():
624
- col_type: Optional[ts.ColumnType] = None
625
- value_expr: Optional[exprs.Expr] = None
626
- primary_key: Optional[bool] = None
627
- media_validation: Optional[catalog.MediaValidation] = None
748
+ col_type: ts.ColumnType | None = None
749
+ value_expr: exprs.Expr | None = None
750
+ primary_key: bool = False
751
+ media_validation: catalog.MediaValidation | None = None
628
752
  stored = True
753
+ destination: str | None = None
629
754
 
630
755
  if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
631
756
  col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
632
757
  elif isinstance(spec, exprs.Expr):
633
758
  # create copy so we can modify it
634
759
  value_expr = spec.copy()
760
+ value_expr.bind_rel_paths()
635
761
  elif isinstance(spec, dict):
636
762
  cls._validate_column_spec(name, spec)
637
763
  if 'type' in spec:
638
764
  col_type = ts.ColumnType.normalize_type(
639
- spec['type'], nullable_default=True, allow_builtin_types=False)
765
+ spec['type'], nullable_default=True, allow_builtin_types=False
766
+ )
640
767
  value_expr = spec.get('value')
641
768
  if value_expr is not None and isinstance(value_expr, exprs.Expr):
642
769
  # create copy so we can modify it
643
770
  value_expr = value_expr.copy()
771
+ value_expr.bind_rel_paths()
644
772
  stored = spec.get('stored', True)
645
- primary_key = spec.get('primary_key')
773
+ primary_key = spec.get('primary_key', False)
646
774
  media_validation_str = spec.get('media_validation')
647
775
  media_validation = (
648
- catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None
649
- else None
776
+ catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
650
777
  )
778
+ destination = spec.get('destination')
651
779
  else:
652
780
  raise excs.Error(f'Invalid value for column {name!r}')
653
781
 
654
782
  column = Column(
655
- name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key,
656
- media_validation=media_validation)
783
+ name,
784
+ col_type=col_type,
785
+ computed_with=value_expr,
786
+ stored=stored,
787
+ is_pk=primary_key,
788
+ media_validation=media_validation,
789
+ destination=destination,
790
+ )
791
+ # Validate the column's resolved_destination. This will ensure that if the column uses a default (global)
792
+ # media destination, it gets validated at this time.
793
+ ObjectOps.validate_destination(column.destination, column.name)
657
794
  columns.append(column)
795
+
658
796
  return columns
659
797
 
660
798
  @classmethod
661
- def _verify_column(
662
- cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
663
- ) -> None:
799
+ def validate_column_name(cls, name: str) -> None:
800
+ """Check that a name is usable as a pixeltable column name"""
801
+ if is_system_column_name(name) or is_python_keyword(name):
802
+ raise excs.Error(f'{name!r} is a reserved name in Pixeltable; please choose a different column name.')
803
+ if not is_valid_identifier(name):
804
+ raise excs.Error(f'Invalid column name: {name}')
805
+
806
+ @classmethod
807
+ def _verify_column(cls, col: Column) -> None:
664
808
  """Check integrity of user-supplied Column and supply defaults"""
665
- if is_system_column_name(col.name):
666
- raise excs.Error(f'{col.name!r} is a reserved name in Pixeltable; please choose a different column name.')
667
- if not is_valid_identifier(col.name):
668
- raise excs.Error(f"Invalid column name: {col.name!r}")
669
- if col.name in existing_column_names:
670
- raise excs.Error(f'Duplicate column name: {col.name!r}')
671
- if existing_query_names is not None and col.name in existing_query_names:
672
- raise excs.Error(f'Column name conflicts with a registered query: {col.name!r}')
673
- if col.stored is False and not (col.is_computed and col.col_type.is_image_type()):
674
- raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed image columns')
809
+ cls.validate_column_name(col.name)
810
+ if col.stored is False and not col.is_computed:
811
+ raise excs.Error(f'Column {col.name!r}: `stored={col.stored}` only applies to computed columns')
675
812
  if col.stored is False and col.has_window_fn_call():
676
- raise excs.Error((
677
- f'Column {col.name!r}: stored={col.stored} is not valid for image columns computed with a streaming '
678
- f'function'))
813
+ raise excs.Error(
814
+ (
815
+ f'Column {col.name!r}: `stored={col.stored}` is not valid for image columns computed with a '
816
+ f'streaming function'
817
+ )
818
+ )
819
+ if col._explicit_destination is not None and not (col.stored and col.is_computed):
820
+ raise excs.Error(f'Column {col.name!r}: `destination` property only applies to stored computed columns')
679
821
 
680
822
  @classmethod
681
823
  def _verify_schema(cls, schema: list[Column]) -> None:
682
824
  """Check integrity of user-supplied schema and set defaults"""
683
- column_names: set[str] = set()
684
825
  for col in schema:
685
- cls._verify_column(col, column_names)
686
- column_names.add(col.name)
687
-
688
- def __check_column_name_exists(self, column_name: str, include_bases: bool = False) -> None:
689
- col = self._tbl_version_path.get_column(column_name, include_bases)
690
- if col is None:
691
- raise excs.Error(f'Column {column_name!r} unknown')
692
-
693
- def __check_column_ref_exists(self, col_ref: ColumnRef, include_bases: bool = False) -> None:
694
- exists = self._tbl_version_path.has_column(col_ref.col, include_bases)
695
- if not exists:
696
- raise excs.Error(f'Unknown column: {col_ref.col.qualified_name}')
826
+ cls._verify_column(col)
697
827
 
698
- def drop_column(self, column: Union[str, ColumnRef]) -> None:
828
+ def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
699
829
  """Drop a column from the table.
700
830
 
701
831
  Args:
702
832
  column: The name or reference of the column to drop.
833
+ if_not_exists: Directive for handling a non-existent column. Must be one of the following:
834
+
835
+ - `'error'`: raise an error if the column does not exist.
836
+ - `'ignore'`: do nothing if the column does not exist.
703
837
 
704
838
  Raises:
705
- Error: If the column does not exist or if it is referenced by a dependent computed column.
839
+ Error: If the column does not exist and `if_exists='error'`,
840
+ or if it is referenced by a dependent computed column.
706
841
 
707
842
  Examples:
708
843
  Drop the column `col` from the table `my_table` by column name:
@@ -714,42 +849,96 @@ class Table(SchemaObject):
714
849
 
715
850
  >>> tbl = pxt.get_table('my_table')
716
851
  ... tbl.drop_column(tbl.col)
852
+
853
+ Drop the column `col` from the table `my_table` if it exists, otherwise do nothing:
854
+
855
+ >>> tbl = pxt.get_table('my_table')
856
 + ... tbl.drop_column(tbl.col, if_not_exists='ignore')
717
857
  """
718
- self._check_is_dropped()
719
- col: Column = None
720
- if isinstance(column, str):
721
- self.__check_column_name_exists(column)
722
- col = self._tbl_version.cols_by_name[column]
723
- else:
724
- self.__check_column_ref_exists(column)
725
- col = column.col
858
+ from pixeltable.catalog import Catalog
726
859
 
727
- dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
728
- if len(dependent_user_cols) > 0:
729
- raise excs.Error(
730
- f'Cannot drop column `{col.name}` because the following columns depend on it:\n'
731
- f'{", ".join(c.name for c in dependent_user_cols)}'
732
- )
860
+ cat = Catalog.get()
733
861
 
734
- # See if this column has a dependent store. We need to look through all stores in all
735
- # (transitive) views of this table.
736
- dependent_stores = [
737
- (view, store)
738
- for view in [self] + self._get_views(recursive=True)
739
- for store in view._tbl_version.external_stores.values()
740
- if col in store.get_local_columns()
741
- ]
742
- if len(dependent_stores) > 0:
743
- dependent_store_names = [
744
- store.name if view._id == self._id else f'{store.name} (in view `{view._name}`)'
745
- for view, store in dependent_stores
862
+ # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
863
+ with cat.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
864
+ self.__check_mutable('drop columns from')
865
+ col: Column = None
866
+ if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
867
+
868
+ if isinstance(column, str):
869
+ col = self._tbl_version_path.get_column(column)
870
+ if col is None:
871
+ if if_not_exists_ == IfNotExistsParam.ERROR:
872
+ raise excs.Error(f'Unknown column: {column}')
873
+ assert if_not_exists_ == IfNotExistsParam.IGNORE
874
+ return
875
+ if col.get_tbl().id != self._tbl_version_path.tbl_id:
876
+ raise excs.Error(f'Cannot drop base table column {col.name!r}')
877
+ col = self._tbl_version.get().cols_by_name[column]
878
+ else:
879
+ exists = self._tbl_version_path.has_column(column.col)
880
+ if not exists:
881
+ if if_not_exists_ == IfNotExistsParam.ERROR:
882
+ raise excs.Error(f'Unknown column: {column.col.qualified_name}')
883
+ assert if_not_exists_ == IfNotExistsParam.IGNORE
884
+ return
885
+ col = column.col
886
+ if col.get_tbl().id != self._tbl_version_path.tbl_id:
887
+ raise excs.Error(f'Cannot drop base table column {col.name!r}')
888
+
889
+ dependent_user_cols = [c for c in cat.get_column_dependents(col.get_tbl().id, col.id) if c.name is not None]
890
+ if len(dependent_user_cols) > 0:
891
+ raise excs.Error(
892
+ f'Cannot drop column {col.name!r} because the following columns depend on it:\n'
893
+ f'{", ".join(c.name for c in dependent_user_cols)}'
894
+ )
895
+
896
+ views = self._get_views(recursive=True, mutable_only=True)
897
+
898
+ # See if any view predicates depend on this column
899
+ dependent_views: list[tuple[Table, exprs.Expr]] = []
900
+ for view in views:
901
+ if view._tbl_version is not None:
902
+ predicate = view._tbl_version.get().predicate
903
+ if predicate is not None:
904
+ for predicate_col in exprs.Expr.get_refd_column_ids(predicate.as_dict()):
905
+ if predicate_col.tbl_id == col.get_tbl().id and predicate_col.col_id == col.id:
906
+ dependent_views.append((view, predicate))
907
+
908
+ if len(dependent_views) > 0:
909
+ dependent_views_str = '\n'.join(
910
+ f'view: {view._path()}, predicate: {predicate}' for view, predicate in dependent_views
911
+ )
912
+ raise excs.Error(
913
+ f'Cannot drop column {col.name!r} because the following views depend on it:\n{dependent_views_str}'
914
+ )
915
+
916
+ # See if this column has a dependent store. We need to look through all stores in all
917
+ # (transitive) views of this table.
918
+ col_handle = col.handle
919
+ dependent_stores = [
920
+ (view, store)
921
+ for view in (self, *views)
922
+ for store in view._tbl_version.get().external_stores.values()
923
+ if col_handle in store.get_local_columns()
746
924
  ]
747
- raise excs.Error(
748
- f'Cannot drop column `{col.name}` because the following external stores depend on it:\n'
749
- f'{", ".join(dependent_store_names)}'
750
- )
925
+ if len(dependent_stores) > 0:
926
+ dependent_store_names = [
927
+ store.name if view._id == self._id else f'{store.name} (in view {view._name!r})'
928
+ for view, store in dependent_stores
929
+ ]
930
+ raise excs.Error(
931
+ f'Cannot drop column {col.name!r} because the following external stores depend on it:\n'
932
+ f'{", ".join(dependent_store_names)}'
933
+ )
934
+ all_columns = self.columns()
935
+ if len(all_columns) == 1 and col.name == all_columns[0]:
936
+ raise excs.Error(
937
+ f'Cannot drop column {col.name!r} because it is the last remaining column in this table.'
938
+ f' Tables must have at least one column.'
939
+ )
751
940
 
752
- self._tbl_version.drop_column(col)
941
+ self._tbl_version.get().drop_column(col)
753
942
 
754
943
  def rename_column(self, old_name: str, new_name: str) -> None:
755
944
  """Rename a column.
@@ -767,89 +956,164 @@ class Table(SchemaObject):
767
956
  >>> tbl = pxt.get_table('my_table')
768
957
  ... tbl.rename_column('col1', 'col2')
769
958
  """
770
- self._tbl_version.rename_column(old_name, new_name)
959
+ from pixeltable.catalog import Catalog
960
+
961
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
962
+ self._tbl_version.get().rename_column(old_name, new_name)
963
+
964
+ def _list_index_info_for_test(self) -> list[dict[str, Any]]:
965
+ """
966
+ Returns list of all the indexes on this table. Used for testing.
967
+
968
+ Returns:
969
+ A list of index information, each containing the index's
970
+ id, name, and the name of the column it indexes.
971
+ """
972
+ index_info = []
973
+ for idx_name, idx in self._tbl_version.get().idxs_by_name.items():
974
+ index_info.append({'_id': idx.id, '_name': idx_name, '_column': idx.col.name})
975
+ return index_info
771
976
 
772
977
  def add_embedding_index(
773
- self, column: Union[str, ColumnRef], *, idx_name: Optional[str] = None,
774
- string_embed: Optional[pxt.Function] = None, image_embed: Optional[pxt.Function] = None,
775
- metric: str = 'cosine'
978
+ self,
979
+ column: str | ColumnRef,
980
+ *,
981
+ idx_name: str | None = None,
982
+ embedding: pxt.Function | None = None,
983
+ string_embed: pxt.Function | None = None,
984
+ image_embed: pxt.Function | None = None,
985
+ metric: Literal['cosine', 'ip', 'l2'] = 'cosine',
986
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
776
987
  ) -> None:
777
988
  """
778
- Add an embedding index to the table. Once the index is added, it will be automatically kept up to data as new
989
+ Add an embedding index to the table. Once the index is created, it will be automatically kept up-to-date as new
779
990
  rows are inserted into the table.
780
991
 
781
- Indices are currently supported only for `String` and `Image` columns. The index must specify, at
782
- minimum, an embedding of the appropriate type (string or image). It may optionally specify _both_ a string
783
- and image embedding (into the same vector space); in particular, this can be used to provide similarity search
784
- of text over an image column.
992
+ To add an embedding index, one must specify, at minimum, the column to be indexed and an embedding UDF.
993
+ Only `String` and `Image` columns are currently supported.
994
+
995
+ Examples:
996
+ Here's an example that uses a
997
+ [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
998
+
999
+ >>> from pixeltable.functions.huggingface import clip
1000
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
1001
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
1002
+
1003
+ Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:
1004
+
1005
+ >>> reference_img = PIL.Image.open('my_image.jpg')
1006
+ >>> sim = tbl.img.similarity(image=reference_img)
1007
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1008
+
1009
+ If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
1010
+ performed using any of its supported modalities. In our example, CLIP supports both text and images, so we
1011
+ can also search for images using a text description:
1012
+
1013
+ >>> sim = tbl.img.similarity(string='a picture of a train')
1014
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1015
+
1016
+ Audio and video lookups would look like this:
1017
+
1018
+ >>> sim = tbl.img.similarity(audio='/path/to/audio.flac')
1019
+ >>> sim = tbl.img.similarity(video='/path/to/video.mp4')
785
1020
 
786
1021
  Args:
787
- column: The name of, or reference to, the column to index; must be a `String` or `Image` column.
788
- idx_name: The name of index. If not specified, a name such as `'idx0'` will be generated automatically.
789
- If specified, the name must be unique for this table.
790
- string_embed: A function to embed text; required if the column is a `String` column.
791
- image_embed: A function to embed images; required if the column is an `Image` column.
792
- metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`;
793
- the default is `'cosine'`.
1022
+ column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
1023
+ idx_name: An optional name for the index. If not specified, a name such as `'idx0'` will be generated
1024
+ automatically. If specified, the name must be unique for this table and a valid pixeltable column name.
1025
+ embedding: The UDF to use for the embedding. Must be a UDF that accepts a single argument of type `String`
1026
+ or `Image` (as appropriate for the column being indexed) and returns a fixed-size 1-dimensional
1027
+ array of floats.
1028
+ string_embed: An optional UDF to use for the string embedding component of this index.
1029
+ Can be used in conjunction with `image_embed` to construct multimodal embeddings manually, by
1030
+ specifying different embedding functions for different data types.
1031
+ image_embed: An optional UDF to use for the image embedding component of this index.
1032
+ Can be used in conjunction with `string_embed` to construct multimodal embeddings manually, by
1033
+ specifying different embedding functions for different data types.
1034
+ metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`.
1035
+ The default is `'cosine'`.
1036
+ if_exists: Directive for handling an existing index with the same name. Must be one of the following:
1037
+
1038
+ - `'error'`: raise an error if an index with the same name already exists.
1039
+ - `'ignore'`: do nothing if an index with the same name already exists.
1040
+ - `'replace'` or `'replace_force'`: replace the existing index with the new one.
794
1041
 
795
1042
  Raises:
796
- Error: If an index with that name already exists for the table, or if the specified column does not exist.
1043
+ Error: If an index with the specified name already exists for the table and `if_exists='error'`, or if
1044
+ the specified column does not exist.
797
1045
 
798
1046
  Examples:
799
- Add an index to the `img` column of the table `my_table` by column name:
1047
+ Add an index to the `img` column of the table `my_table`:
800
1048
 
1049
+ >>> from pixeltable.functions.huggingface import clip
801
1050
  >>> tbl = pxt.get_table('my_table')
802
- ... tbl.add_embedding_index('img', image_embed=my_image_func)
1051
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
1052
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
803
1053
 
804
- Add an index to the `img` column of the table `my_table` by column reference:
805
- >>> tbl = pxt.get_table('my_table')
806
- ... tbl.add_embedding_index(tbl.img, image_embed=my_image_func)
1054
+ Alternatively, the `img` column may be specified by name:
1055
+
1056
+ >>> tbl.add_embedding_index('img', embedding=embedding_fn)
807
1057
 
808
- Add another index to the `img` column, using the inner product as the distance metric,
809
- and with a specific name; `string_embed` is also specified in order to search with text:
1058
+ Add a second index to the `img` column, using the inner product as the distance metric,
1059
+ and with a specific name:
810
1060
 
811
1061
  >>> tbl.add_embedding_index(
812
- ... 'img',
813
- ... idx_name='clip_idx',
814
- ... image_embed=my_image_func,
815
- ... string_embed=my_string_func,
1062
+ ... tbl.img,
1063
+ ... idx_name='ip_idx',
1064
+ ... embedding=embedding_fn,
816
1065
  ... metric='ip'
817
1066
  ... )
818
1067
 
819
- Alternatively:
1068
+ Add an index using separately specified string and image embeddings:
820
1069
 
821
1070
  >>> tbl.add_embedding_index(
822
1071
  ... tbl.img,
823
- ... idx_name='clip_idx',
824
- ... image_embed=my_image_func,
825
- ... string_embed=my_string_func,
826
- ... metric='ip'
1072
+ ... string_embed=string_embedding_fn,
1073
+ ... image_embed=image_embedding_fn
827
1074
  ... )
828
1075
  """
829
- if self._tbl_version_path.is_snapshot():
830
- raise excs.Error('Cannot add an index to a snapshot')
831
- col: Column
832
- if isinstance(column, str):
833
- self.__check_column_name_exists(column, include_bases=True)
834
- col = self._tbl_version_path.get_column(column, include_bases=True)
835
- else:
836
- self.__check_column_ref_exists(column, include_bases=True)
837
- col = column.col
838
-
839
- if idx_name is not None and idx_name in self._tbl_version.idxs_by_name:
840
- raise excs.Error(f'Duplicate index name: {idx_name}')
841
- from pixeltable.index import EmbeddingIndex
842
-
843
- # create the EmbeddingIndex instance to verify args
844
- idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
845
- status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
846
- # TODO: how to deal with exceptions here? drop the index and raise?
847
- FileCache.get().emit_eviction_warnings()
1076
+ from pixeltable.catalog import Catalog
1077
+
1078
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1079
+ self.__check_mutable('add an index to')
1080
+ col = self._resolve_column_parameter(column)
1081
+
1082
+ if idx_name is not None and idx_name in self._tbl_version.get().idxs_by_name:
1083
+ if_exists_ = IfExistsParam.validated(if_exists, 'if_exists')
1084
+ # An index with the same name already exists.
1085
+ # Handle it according to if_exists.
1086
+ if if_exists_ == IfExistsParam.ERROR:
1087
+ raise excs.Error(f'Duplicate index name: {idx_name}')
1088
+ if not isinstance(self._tbl_version.get().idxs_by_name[idx_name].idx, index.EmbeddingIndex):
1089
+ raise excs.Error(
1090
+ f'Index {idx_name!r} is not an embedding index. Cannot {if_exists_.name.lower()} it.'
1091
+ )
1092
+ if if_exists_ == IfExistsParam.IGNORE:
1093
+ return
1094
+ assert if_exists_ in (IfExistsParam.REPLACE, IfExistsParam.REPLACE_FORCE)
1095
+ self.drop_index(idx_name=idx_name)
1096
+ assert idx_name not in self._tbl_version.get().idxs_by_name
1097
+ from pixeltable.index import EmbeddingIndex
1098
+
1099
+ # idx_name must be a valid pixeltable column name
1100
+ if idx_name is not None:
1101
+ Table.validate_column_name(idx_name)
1102
+
1103
+ # validate EmbeddingIndex args
1104
+ idx = EmbeddingIndex(metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed)
1105
+ _ = idx.create_value_expr(col)
1106
+ _ = self._tbl_version.get().add_index(col, idx_name=idx_name, idx=idx)
1107
+ # TODO: how to deal with exceptions here? drop the index and raise?
1108
+ FileCache.get().emit_eviction_warnings()
848
1109
 
849
1110
  def drop_embedding_index(
850
- self, *,
851
- column: Union[str, ColumnRef, None] = None,
852
- idx_name: Optional[str] = None) -> None:
1111
+ self,
1112
+ *,
1113
+ column: str | ColumnRef | None = None,
1114
+ idx_name: str | None = None,
1115
+ if_not_exists: Literal['error', 'ignore'] = 'error',
1116
+ ) -> None:
853
1117
  """
854
1118
  Drop an embedding index from the table. Either a column name or an index name (but not both) must be
855
1119
  specified. If a column name or reference is specified, it must be a column containing exactly one
@@ -859,11 +1123,20 @@ class Table(SchemaObject):
859
1123
  column: The name of, or reference to, the column from which to drop the index.
860
1124
  The column must have only one embedding index.
861
1125
  idx_name: The name of the index to drop.
1126
+ if_not_exists: Directive for handling a non-existent index. Must be one of the following:
1127
+
1128
+ - `'error'`: raise an error if the index does not exist.
1129
+ - `'ignore'`: do nothing if the index does not exist.
1130
+
1131
+ Note that `if_not_exists` parameter is only applicable when an `idx_name` is specified
1132
+ and it does not exist, or when `column` is specified and it has no index.
1133
 + `if_not_exists` does not apply to a non-existent column.
862
1134
 
863
1135
  Raises:
864
1136
  Error: If `column` is specified, but the column does not exist, or it contains no embedding
865
- indices or multiple embedding indices.
866
- Error: If `idx_name` is specified, but the index does not exist or is not an embedding index.
1137
+ indices and `if_not_exists='error'`, or the column has multiple embedding indices.
1138
+ Error: If `idx_name` is specified, but the index is not an embedding index, or
1139
+ the index does not exist and `if_not_exists='error'`.
867
1140
 
868
1141
  Examples:
869
1142
  Drop the embedding index on the `img` column of the table `my_table` by column name:
@@ -880,25 +1153,46 @@ class Table(SchemaObject):
880
1153
  >>> tbl = pxt.get_table('my_table')
881
1154
  ... tbl.drop_embedding_index(idx_name='idx1')
882
1155
 
1156
+ Drop the embedding index `idx1` of the table `my_table` by index name, if it exists, otherwise do nothing:
1157
+ >>> tbl = pxt.get_table('my_table')
1158
+ ... tbl.drop_embedding_index(idx_name='idx1', if_not_exists='ignore')
883
1159
  """
1160
+ from pixeltable.catalog import Catalog
1161
+
884
1162
  if (column is None) == (idx_name is None):
885
1163
  raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
886
1164
 
1165
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1166
+ col: Column = None
1167
+ if idx_name is None:
1168
+ col = self._resolve_column_parameter(column)
1169
+ assert col is not None
1170
+
1171
+ self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex, if_not_exists=if_not_exists)
1172
+
1173
+ def _resolve_column_parameter(self, column: str | ColumnRef) -> Column:
1174
+ """Resolve a column parameter to a Column object"""
887
1175
  col: Column = None
888
- if idx_name is None:
889
- if isinstance(column, str):
890
- self.__check_column_name_exists(column, include_bases=True)
891
- col = self._tbl_version_path.get_column(column, include_bases=True)
892
- else:
893
- self.__check_column_ref_exists(column, include_bases=True)
894
- col = column.col
895
- assert col is not None
896
- self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex)
1176
+ if isinstance(column, str):
1177
+ col = self._tbl_version_path.get_column(column)
1178
+ if col is None:
1179
+ raise excs.Error(f'Unknown column: {column}')
1180
+ elif isinstance(column, ColumnRef):
1181
+ exists = self._tbl_version_path.has_column(column.col)
1182
+ if not exists:
1183
+ raise excs.Error(f'Unknown column: {column.col.qualified_name}')
1184
+ col = column.col
1185
+ else:
1186
+ raise excs.Error(f'Invalid column parameter type: {type(column)}')
1187
+ return col
897
1188
 
898
1189
  def drop_index(
899
- self, *,
900
- column: Union[str, ColumnRef, None] = None,
901
- idx_name: Optional[str] = None) -> None:
1190
+ self,
1191
+ *,
1192
+ column: str | ColumnRef | None = None,
1193
+ idx_name: str | None = None,
1194
+ if_not_exists: Literal['error', 'ignore'] = 'error',
1195
+ ) -> None:
902
1196
  """
903
1197
  Drop an index from the table. Either a column name or an index name (but not both) must be
904
1198
  specified. If a column name or reference is specified, it must be a column containing exactly one index;
@@ -908,6 +1202,14 @@ class Table(SchemaObject):
908
1202
  column: The name of, or reference to, the column from which to drop the index.
909
1203
  The column must have only one embedding index.
910
1204
  idx_name: The name of the index to drop.
1205
+ if_not_exists: Directive for handling a non-existent index. Must be one of the following:
1206
+
1207
+ - `'error'`: raise an error if the index does not exist.
1208
+ - `'ignore'`: do nothing if the index does not exist.
1209
+
1210
+ Note that `if_not_exists` parameter is only applicable when an `idx_name` is specified
1211
+ and it does not exist, or when `column` is specified and it has no index.
1212
 + `if_not_exists` does not apply to a non-existent column.
911
1213
 
912
1214
  Raises:
913
1215
  Error: If `column` is specified, but the column does not exist, or it contains no
@@ -929,76 +1231,105 @@ class Table(SchemaObject):
929
1231
  >>> tbl = pxt.get_table('my_table')
930
1232
  ... tbl.drop_index(idx_name='idx1')
931
1233
 
1234
+ Drop the index `idx1` of the table `my_table` by index name, if it exists, otherwise do nothing:
1235
+ >>> tbl = pxt.get_table('my_table')
1236
+ ... tbl.drop_index(idx_name='idx1', if_not_exists='ignore')
1237
+
932
1238
  """
1239
+ from pixeltable.catalog import Catalog
1240
+
933
1241
  if (column is None) == (idx_name is None):
934
1242
  raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
935
1243
 
936
- col: Column = None
937
- if idx_name is None:
938
- if isinstance(column, str):
939
- self.__check_column_name_exists(column, include_bases=True)
940
- col = self._tbl_version_path.get_column(column, include_bases=True)
941
- else:
942
- self.__check_column_ref_exists(column, include_bases=True)
943
- col = column.col
944
- assert col is not None
945
- self._drop_index(col=col, idx_name=idx_name)
1244
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
1245
+ col: Column = None
1246
+ if idx_name is None:
1247
+ col = self._resolve_column_parameter(column)
1248
+ assert col is not None
1249
+
1250
+ self._drop_index(col=col, idx_name=idx_name, if_not_exists=if_not_exists)
946
1251
 
947
1252
  def _drop_index(
948
- self, *, col: Optional[Column] = None,
949
- idx_name: Optional[str] = None,
950
- _idx_class: Optional[type[index.IndexBase]] = None
1253
+ self,
1254
+ *,
1255
+ col: Column | None = None,
1256
+ idx_name: str | None = None,
1257
+ _idx_class: type[index.IndexBase] | None = None,
1258
+ if_not_exists: Literal['error', 'ignore'] = 'error',
951
1259
  ) -> None:
952
- if self._tbl_version_path.is_snapshot():
953
- raise excs.Error('Cannot drop an index from a snapshot')
1260
+ from pixeltable.catalog import Catalog
1261
+
1262
+ self.__check_mutable('drop an index from')
954
1263
  assert (col is None) != (idx_name is None)
955
1264
 
956
1265
  if idx_name is not None:
957
- if idx_name not in self._tbl_version.idxs_by_name:
958
- raise excs.Error(f'Index {idx_name!r} does not exist')
959
- idx_id = self._tbl_version.idxs_by_name[idx_name].id
1266
+ if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
1267
+ if idx_name not in self._tbl_version.get().idxs_by_name:
1268
+ if if_not_exists_ == IfNotExistsParam.ERROR:
1269
+ raise excs.Error(f'Index {idx_name!r} does not exist')
1270
+ assert if_not_exists_ == IfNotExistsParam.IGNORE
1271
+ return
1272
+ idx_info = self._tbl_version.get().idxs_by_name[idx_name]
960
1273
  else:
961
- if col.tbl.id != self._tbl_version.id:
1274
+ if col.get_tbl().id != self._tbl_version.id:
962
1275
  raise excs.Error(
963
- f'Column {col.name!r}: cannot drop index from column that belongs to base ({col.tbl.name}!r)')
964
- idx_info = [info for info in self._tbl_version.idxs_by_name.values() if info.col.id == col.id]
1276
+ f'Column {col.name!r}: '
1277
+ f'cannot drop index from column that belongs to base table {col.get_tbl().name!r}'
1278
+ )
1279
+ idx_info_list = [info for info in self._tbl_version.get().idxs_by_name.values() if info.col.id == col.id]
965
1280
  if _idx_class is not None:
966
- idx_info = [info for info in idx_info if isinstance(info.idx, _idx_class)]
967
- if len(idx_info) == 0:
968
- raise excs.Error(f'Column {col.name!r} does not have an index')
969
- if len(idx_info) > 1:
970
- raise excs.Error(f"Column {col.name!r} has multiple indices; specify 'idx_name' instead")
971
- idx_id = idx_info[0].id
972
- self._tbl_version.drop_index(idx_id)
1281
+ idx_info_list = [info for info in idx_info_list if isinstance(info.idx, _idx_class)]
1282
+ if len(idx_info_list) == 0:
1283
+ if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
1284
+ if if_not_exists_ == IfNotExistsParam.ERROR:
1285
+ raise excs.Error(f'Column {col.name!r} does not have an index')
1286
+ assert if_not_exists_ == IfNotExistsParam.IGNORE
1287
+ return
1288
+ if len(idx_info_list) > 1:
1289
+ raise excs.Error(f'Column {col.name!r} has multiple indices; specify `idx_name` explicitly to drop one')
1290
+ idx_info = idx_info_list[0]
1291
+
1292
+ # Find out if anything depends on this index
1293
+ val_col = idx_info.val_col
1294
+ dependent_user_cols = [
1295
+ c for c in Catalog.get().get_column_dependents(val_col.get_tbl().id, val_col.id) if c.name is not None
1296
+ ]
1297
+ if len(dependent_user_cols) > 0:
1298
+ raise excs.Error(
1299
+ f'Cannot drop index {idx_info.name!r} because the following columns depend on it:\n'
1300
+ f'{", ".join(c.name for c in dependent_user_cols)}'
1301
+ )
1302
+ self._tbl_version.get().drop_index(idx_info.id)
973
1303
 
974
1304
  @overload
975
1305
  def insert(
976
1306
  self,
977
- rows: Iterable[dict[str, Any]],
1307
+ source: TableDataSource,
978
1308
  /,
979
1309
  *,
1310
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
1311
+ schema_overrides: dict[str, ts.ColumnType] | None = None,
1312
+ on_error: Literal['abort', 'ignore'] = 'abort',
980
1313
  print_stats: bool = False,
981
- on_error: Literal['abort', 'ignore'] = 'abort'
1314
+ **kwargs: Any,
982
1315
  ) -> UpdateStatus: ...
983
1316
 
984
1317
  @overload
985
1318
  def insert(
986
- self,
987
- *,
988
- print_stats: bool = False,
989
- on_error: Literal['abort', 'ignore'] = 'abort',
990
- **kwargs: Any
1319
+ self, /, *, on_error: Literal['abort', 'ignore'] = 'abort', print_stats: bool = False, **kwargs: Any
991
1320
  ) -> UpdateStatus: ...
992
1321
 
993
- @abc.abstractmethod # type: ignore[misc]
1322
+ @abc.abstractmethod
994
1323
  def insert(
995
1324
  self,
996
- rows: Optional[Iterable[dict[str, Any]]] = None,
1325
+ source: TableDataSource | None = None,
997
1326
  /,
998
1327
  *,
999
- print_stats: bool = False,
1328
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
1329
+ schema_overrides: dict[str, ts.ColumnType] | None = None,
1000
1330
  on_error: Literal['abort', 'ignore'] = 'abort',
1001
- **kwargs: Any
1331
+ print_stats: bool = False,
1332
+ **kwargs: Any,
1002
1333
  ) -> UpdateStatus:
1003
1334
  """Inserts rows into this table. There are two mutually exclusive call patterns:
1004
1335
 
@@ -1006,35 +1337,40 @@ class Table(SchemaObject):
1006
1337
 
1007
1338
  ```python
1008
1339
  insert(
1009
- rows: Iterable[dict[str, Any]],
1340
+ source: TableSourceDataType,
1010
1341
  /,
1011
1342
  *,
1343
+ on_error: Literal['abort', 'ignore'] = 'abort',
1012
1344
  print_stats: bool = False,
1013
- on_error: Literal['abort', 'ignore'] = 'abort'
1014
- )```
1345
+ **kwargs: Any,
1346
+ )
1347
+ ```
1015
1348
 
1016
1349
  To insert just a single row, you can use the more concise syntax:
1017
1350
 
1018
1351
  ```python
1019
1352
  insert(
1020
1353
  *,
1021
- print_stats: bool = False,
1022
1354
  on_error: Literal['abort', 'ignore'] = 'abort',
1355
+ print_stats: bool = False,
1023
1356
  **kwargs: Any
1024
- )```
1357
+ )
1358
+ ```
1025
1359
 
1026
1360
  Args:
1027
- rows: (if inserting multiple rows) A list of rows to insert, each of which is a dictionary mapping column
1028
- names to values.
1361
+ source: A data source from which data can be imported.
1029
1362
  kwargs: (if inserting a single row) Keyword-argument pairs representing column names and values.
1030
- print_stats: If `True`, print statistics about the cost of computed columns.
1363
+ (if inserting multiple rows) Additional keyword arguments are passed to the data source.
1364
+ source_format: A hint about the format of the source data
1365
+ schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
1031
1366
  on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
1032
1367
  invalid media file (such as a corrupt image) for one of the inserted rows.
1033
1368
 
1034
1369
  - If `on_error='abort'`, then an exception will be raised and the rows will not be inserted.
1035
1370
  - If `on_error='ignore'`, then execution will continue and the rows will be inserted. Any cells
1036
- with errors will have a `None` value for that cell, with information about the error stored in the
1037
- corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
1371
+ with errors will have a `None` value for that cell, with information about the error stored in the
1372
+ corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
1373
+ print_stats: If `True`, print statistics about the cost of computed columns.
1038
1374
 
1039
1375
  Returns:
1040
1376
  An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
@@ -1046,6 +1382,7 @@ class Table(SchemaObject):
1046
1382
  - The table has been dropped.
1047
1383
  - One of the rows being inserted does not conform to the table schema.
1048
1384
  - An error occurs during processing of computed columns, and `on_error='ignore'`.
1385
+ - An error occurs while importing data from a source, and `on_error='abort'`.
1049
1386
 
1050
1387
  Examples:
1051
1388
  Insert two rows into the table `my_table` with three int columns ``a``, ``b``, and ``c``.
@@ -1057,11 +1394,24 @@ class Table(SchemaObject):
1057
1394
  Insert a single row using the alternative syntax:
1058
1395
 
1059
1396
  >>> tbl.insert(a=3, b=3, c=3)
1397
+
1398
+ Insert rows from a CSV file:
1399
+
1400
+ >>> tbl.insert(source='path/to/file.csv')
1401
+
1402
+ Insert Pydantic model instances into a table with two `pxt.Int` columns `a` and `b`:
1403
+
1404
+ >>> class MyModel(pydantic.BaseModel):
1405
+ ... a: int
1406
+ ... b: int
1407
+ ...
1408
+ ... models = [MyModel(a=1, b=2), MyModel(a=3, b=4)]
1409
+ ... tbl.insert(models)
1060
1410
  """
1061
1411
  raise NotImplementedError
1062
1412
 
1063
1413
  def update(
1064
- self, value_spec: dict[str, Any], where: Optional['pxt.exprs.Expr'] = None, cascade: bool = True
1414
+ self, value_spec: dict[str, Any], where: 'exprs.Expr' | None = None, cascade: bool = True
1065
1415
  ) -> UpdateStatus:
1066
1416
  """Update rows in this table.
1067
1417
 
@@ -1070,6 +1420,9 @@ class Table(SchemaObject):
1070
1420
  where: a predicate to filter rows to update.
1071
1421
  cascade: if True, also update all computed columns that transitively depend on the updated columns.
1072
1422
 
1423
+ Returns:
1424
+ An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
1425
+
1073
1426
  Examples:
1074
1427
  Set column `int_col` to 1 for all rows:
1075
1428
 
@@ -1087,13 +1440,19 @@ class Table(SchemaObject):
1087
1440
 
1088
1441
  >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
1089
1442
  """
1090
- status = self._tbl_version.update(value_spec, where, cascade)
1091
- FileCache.get().emit_eviction_warnings()
1092
- return status
1443
+ from pixeltable.catalog import Catalog
1444
+
1445
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1446
+ self.__check_mutable('update')
1447
+ result = self._tbl_version.get().update(value_spec, where, cascade)
1448
+ FileCache.get().emit_eviction_warnings()
1449
+ return result
1093
1450
 
1094
1451
  def batch_update(
1095
- self, rows: Iterable[dict[str, Any]], cascade: bool = True,
1096
- if_not_exists: Literal['error', 'ignore', 'insert'] = 'error'
1452
+ self,
1453
+ rows: Iterable[dict[str, Any]],
1454
+ cascade: bool = True,
1455
+ if_not_exists: Literal['error', 'ignore', 'insert'] = 'error',
1097
1456
  ) -> UpdateStatus:
1098
1457
  """Update rows in this table.
1099
1458
 
@@ -1111,47 +1470,137 @@ class Table(SchemaObject):
1111
1470
  Update the `name` and `age` columns for the rows with ids 1 and 2 (assuming `id` is the primary key).
1112
1471
  If either row does not exist, this raises an error:
1113
1472
 
1114
- >>> tbl.update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
1473
+ >>> tbl.batch_update(
1474
+ ... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}]
1475
+ ... )
1115
1476
 
1116
1477
  Update the `name` and `age` columns for the row with `id` 1 (assuming `id` is the primary key) and insert
1117
1478
  the row with new `id` 3 (assuming this key does not exist):
1118
1479
 
1119
- >>> tbl.update(
1120
- [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
1121
- if_not_exists='insert')
1480
+ >>> tbl.batch_update(
1481
+ ... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
1482
+ ... if_not_exists='insert'
1483
+ ... )
1122
1484
  """
1123
- if self._tbl_version_path.is_snapshot():
1124
- raise excs.Error('Cannot update a snapshot')
1125
- rows = list(rows)
1126
-
1127
- row_updates: list[dict[Column, exprs.Expr]] = []
1128
- pk_col_names = set(c.name for c in self._tbl_version.primary_key_columns())
1129
-
1130
- # pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
1131
- has_rowid = _ROWID_COLUMN_NAME in rows[0]
1132
- rowids: list[tuple[int, ...]] = []
1133
- if len(pk_col_names) == 0 and not has_rowid:
1134
- raise excs.Error('Table must have primary key for batch update')
1135
-
1136
- for row_spec in rows:
1137
- col_vals = self._tbl_version._validate_update_spec(row_spec, allow_pk=not has_rowid, allow_exprs=False)
1138
- if has_rowid:
1139
- # we expect the _rowid column to be present for each row
1140
- assert _ROWID_COLUMN_NAME in row_spec
1141
- rowids.append(row_spec[_ROWID_COLUMN_NAME])
1142
- else:
1143
- col_names = set(col.name for col in col_vals.keys())
1144
- if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
1145
- missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
1146
- raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
1147
- row_updates.append(col_vals)
1148
- status = self._tbl_version.batch_update(
1149
- row_updates, rowids, error_if_not_exists=if_not_exists == 'error',
1150
- insert_if_not_exists=if_not_exists == 'insert', cascade=cascade)
1151
- FileCache.get().emit_eviction_warnings()
1152
- return status
1153
-
1154
- def delete(self, where: Optional['pxt.exprs.Expr'] = None) -> UpdateStatus:
1485
+ from pixeltable.catalog import Catalog
1486
+
1487
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1488
+ self.__check_mutable('update')
1489
+ rows = list(rows)
1490
+
1491
+ row_updates: list[dict[Column, exprs.Expr]] = []
1492
+ pk_col_names = {c.name for c in self._tbl_version.get().primary_key_columns()}
1493
+
1494
+ # pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
1495
+ has_rowid = _ROWID_COLUMN_NAME in rows[0]
1496
+ rowids: list[tuple[int, ...]] = []
1497
+ if len(pk_col_names) == 0 and not has_rowid:
1498
+ raise excs.Error('Table must have primary key for batch update')
1499
+
1500
+ for row_spec in rows:
1501
+ col_vals = self._tbl_version.get()._validate_update_spec(
1502
+ row_spec, allow_pk=not has_rowid, allow_exprs=False, allow_media=False
1503
+ )
1504
+ if has_rowid:
1505
+ # we expect the _rowid column to be present for each row
1506
+ assert _ROWID_COLUMN_NAME in row_spec
1507
+ rowids.append(row_spec[_ROWID_COLUMN_NAME])
1508
+ else:
1509
+ col_names = {col.name for col in col_vals}
1510
+ if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
1511
+ missing_cols = pk_col_names - {col.name for col in col_vals}
1512
+ raise excs.Error(
1513
+ f'Primary key column(s) {", ".join(repr(c) for c in missing_cols)} missing in {row_spec}'
1514
+ )
1515
+ row_updates.append(col_vals)
1516
+
1517
+ result = self._tbl_version.get().batch_update(
1518
+ row_updates,
1519
+ rowids,
1520
+ error_if_not_exists=if_not_exists == 'error',
1521
+ insert_if_not_exists=if_not_exists == 'insert',
1522
+ cascade=cascade,
1523
+ )
1524
+ FileCache.get().emit_eviction_warnings()
1525
+ return result
1526
+
1527
+ def recompute_columns(
1528
+ self,
1529
+ *columns: str | ColumnRef,
1530
+ where: 'exprs.Expr' | None = None,
1531
+ errors_only: bool = False,
1532
+ cascade: bool = True,
1533
+ ) -> UpdateStatus:
1534
+ """Recompute the values in one or more computed columns of this table.
1535
+
1536
+ Args:
1537
+ columns: The names or references of the computed columns to recompute.
1538
+ where: A predicate to filter rows to recompute.
1539
+ errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
1540
+ `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
1541
+ cascade: if True, also update all computed columns that transitively depend on the recomputed columns.
1542
+
1543
+ Examples:
1544
+ Recompute computed columns `c1` and `c2` for all rows in this table, and everything that transitively
1545
+ depends on them:
1546
+
1547
+ >>> tbl.recompute_columns('c1', 'c2')
1548
+
1549
+ Recompute computed column `c1` for all rows in this table, but don't recompute other columns that depend on
1550
+ it:
1551
+
1552
+ >>> tbl.recompute_columns(tbl.c1, tbl.c2, cascade=False)
1553
+
1554
+ Recompute column `c1` and its dependents, but only for rows with `c2` == 0:
1555
+
1556
+ >>> tbl.recompute_columns('c1', where=tbl.c2 == 0)
1557
+
1558
+ Recompute column `c1` and its dependents, but only for rows that have errors in it:
1559
+
1560
+ >>> tbl.recompute_columns('c1', errors_only=True)
1561
+ """
1562
+ from pixeltable.catalog import Catalog
1563
+
1564
+ cat = Catalog.get()
1565
+ # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
1566
+ with cat.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1567
+ self.__check_mutable('recompute columns of')
1568
+ if len(columns) == 0:
1569
+ raise excs.Error('At least one column must be specified to recompute')
1570
+ if errors_only and len(columns) > 1:
1571
+ raise excs.Error('Cannot use errors_only=True with multiple columns')
1572
+
1573
+ col_names: list[str] = []
1574
+ for column in columns:
1575
+ col_name: str
1576
+ col: Column
1577
+ if isinstance(column, str):
1578
+ col = self._tbl_version_path.get_column(column)
1579
+ if col is None:
1580
+ raise excs.Error(f'Unknown column: {column}')
1581
+ col_name = column
1582
+ else:
1583
+ assert isinstance(column, ColumnRef)
1584
+ col = column.col
1585
+ if not self._tbl_version_path.has_column(col):
1586
+ raise excs.Error(f'Unknown column: {col.name}')
1587
+ col_name = col.name
1588
+ if not col.is_computed:
1589
+ raise excs.Error(f'Column {col_name!r} is not a computed column')
1590
+ if col.get_tbl().id != self._tbl_version_path.tbl_id:
1591
+ raise excs.Error(f'Cannot recompute column of a base: {col_name}')
1592
+ col_names.append(col_name)
1593
+
1594
+ if where is not None and not where.is_bound_by([self._tbl_version_path]):
1595
+ raise excs.Error(f'`where` predicate ({where}) is not bound by {self._display_str()}')
1596
+
1597
+ result = self._tbl_version.get().recompute_columns(
1598
+ col_names, where=where, errors_only=errors_only, cascade=cascade
1599
+ )
1600
+ FileCache.get().emit_eviction_warnings()
1601
+ return result
1602
+
1603
+ def delete(self, where: 'exprs.Expr' | None = None) -> UpdateStatus:
1155
1604
  """Delete rows in this table.
1156
1605
 
1157
1606
  Args:
@@ -1174,69 +1623,96 @@ class Table(SchemaObject):
1174
1623
  .. warning::
1175
1624
  This operation is irreversible.
1176
1625
  """
1177
- if self._tbl_version_path.is_snapshot():
1178
- raise excs.Error('Cannot revert a snapshot')
1179
- self._tbl_version.revert()
1626
+ with catalog.Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1627
+ self.__check_mutable('revert')
1628
+ self._tbl_version.get().revert()
1629
+ # remove cached md in order to force a reload on the next operation
1630
+ self._tbl_version_path.clear_cached_md()
1180
1631
 
1181
- @overload
1182
- def query(self, py_fn: Callable) -> 'pxt.func.QueryTemplateFunction': ...
1632
+ def push(self) -> None:
1633
+ from pixeltable.share import push_replica
1634
+ from pixeltable.share.protocol import PxtUri
1183
1635
 
1184
- @overload
1185
- def query(
1186
- self, *, param_types: Optional[list[ts.ColumnType]] = None
1187
- ) -> Callable[[Callable], 'pxt.func.QueryTemplateFunction']: ...
1188
-
1189
- def query(self, *args: Any, **kwargs: Any) -> Any:
1190
- def make_query_template(
1191
- py_fn: Callable, param_types: Optional[list[ts.ColumnType]]
1192
- ) -> 'pxt.func.QueryTemplateFunction':
1193
- if py_fn.__module__ != '__main__' and py_fn.__name__.isidentifier():
1194
- # this is a named function in a module
1195
- function_path = f'{py_fn.__module__}.{py_fn.__qualname__}'
1196
- else:
1197
- function_path = None
1198
- query_name = py_fn.__name__
1199
- if query_name in self._schema.keys():
1200
- raise excs.Error(f'Query name {query_name!r} conflicts with existing column')
1201
- if query_name in self.__query_scope._queries and function_path is not None:
1202
- raise excs.Error(f'Duplicate query name: {query_name!r}')
1203
- query_fn = pxt.func.QueryTemplateFunction.create(
1204
- py_fn, param_types=param_types, path=function_path, name=query_name)
1205
- self.__query_scope._queries[query_name] = query_fn
1206
- return query_fn
1207
-
1208
- # TODO: verify that the inferred return type matches that of the template
1209
- # TODO: verify that the signature doesn't contain batched parameters
1210
-
1211
- if len(args) == 1:
1212
- assert len(kwargs) == 0 and callable(args[0])
1213
- return make_query_template(args[0], None)
1214
- else:
1215
- assert len(args) == 0 and len(kwargs) == 1 and 'param_types' in kwargs
1216
- return lambda py_fn: make_query_template(py_fn, kwargs['param_types'])
1636
+ pxt_uri = self._get_pxt_uri()
1637
+ tbl_version = self._tbl_version_path.tbl_version.get()
1638
+
1639
+ if tbl_version.is_replica:
1640
+ raise excs.Error(f'push(): Cannot push replica table {self._name!r}. (Did you mean `pull()`?)')
1641
+
1642
+ if pxt_uri is None:
1643
+ raise excs.Error(
1644
+ f'push(): Table {self._name!r} has not yet been published to Pixeltable Cloud. '
1645
+ 'To publish it, use `pxt.publish()` instead.'
1646
+ )
1647
+
1648
+ if isinstance(self, catalog.View) and self._is_anonymous_snapshot():
1649
+ raise excs.Error(
1650
+ f'push(): Cannot push specific-version table handle {tbl_version.versioned_name!r}. '
1651
+ 'To push the latest version instead:\n'
1652
+ f' t = pxt.get_table({self._name!r})\n'
1653
+ f' t.push()'
1654
+ )
1655
+
1656
+ if self._tbl_version is None:
1657
+ # Named snapshots never have new versions to push.
1658
+ env.Env.get().console_logger.info('push(): Everything up to date.')
1659
+ return
1660
+
1661
+ # Parse the pxt URI to extract org/db and create a UUID-based URI for pushing
1662
+ parsed_uri = PxtUri(uri=pxt_uri)
1663
+ uuid_uri_obj = PxtUri.from_components(org=parsed_uri.org, id=self._id, db=parsed_uri.db)
1664
+ uuid_uri = str(uuid_uri_obj)
1665
+
1666
+ push_replica(uuid_uri, self)
1667
+
1668
+ def pull(self) -> None:
1669
+ from pixeltable.share import pull_replica
1670
+ from pixeltable.share.protocol import PxtUri
1671
+
1672
+ pxt_uri = self._get_pxt_uri()
1673
+ tbl_version = self._tbl_version_path.tbl_version.get()
1674
+
1675
+ if not tbl_version.is_replica or pxt_uri is None:
1676
+ raise excs.Error(
1677
+ f'pull(): Table {self._name!r} is not a replica of a Pixeltable Cloud table (nothing to `pull()`).'
1678
+ )
1679
+
1680
+ if isinstance(self, catalog.View) and self._is_anonymous_snapshot():
1681
+ raise excs.Error(
1682
+ f'pull(): Cannot pull specific-version table handle {tbl_version.versioned_name!r}. '
1683
+ 'To pull the latest version instead:\n'
1684
+ f' t = pxt.get_table({self._name!r})\n'
1685
+ f' t.pull()'
1686
+ )
1687
+
1688
+ # Parse the pxt URI to extract org/db and create a UUID-based URI for pulling
1689
+ parsed_uri = PxtUri(uri=pxt_uri)
1690
+ uuid_uri_obj = PxtUri.from_components(org=parsed_uri.org, id=self._id, db=parsed_uri.db)
1691
+ uuid_uri = str(uuid_uri_obj)
1692
+
1693
+ pull_replica(self._path(), uuid_uri)
1217
1694
 
1218
- @property
1219
1695
  def external_stores(self) -> list[str]:
1220
- return list(self._tbl_version.external_stores.keys())
1696
+ return list(self._tbl_version.get().external_stores.keys())
1221
1697
 
1222
1698
  def _link_external_store(self, store: 'pxt.io.ExternalStore') -> None:
1223
1699
  """
1224
1700
  Links the specified `ExternalStore` to this table.
1225
1701
  """
1226
- if self._tbl_version.is_snapshot:
1227
- raise excs.Error(f'Table `{self._name}` is a snapshot, so it cannot be linked to an external store.')
1228
- if store.name in self.external_stores:
1229
- raise excs.Error(f'Table `{self._name}` already has an external store with that name: {store.name}')
1230
- _logger.info(f'Linking external store `{store.name}` to table `{self._name}`')
1231
- self._tbl_version.link_external_store(store)
1232
- print(f'Linked external store `{store.name}` to table `{self._name}`.')
1702
+ from pixeltable.catalog import Catalog
1703
+
1704
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
1705
+ self.__check_mutable('link an external store to')
1706
+ if store.name in self.external_stores():
1707
+ raise excs.Error(f'Table {self._name!r} already has an external store with that name: {store.name}')
1708
+ _logger.info(f'Linking external store {store.name!r} to table {self._name!r}.')
1709
+
1710
+ store.link(self._tbl_version.get()) # might call tbl_version.add_columns()
1711
+ self._tbl_version.get().link_external_store(store)
1712
+ env.Env.get().console_logger.info(f'Linked external store {store.name!r} to table {self._name!r}.')
1233
1713
 
1234
1714
  def unlink_external_stores(
1235
- self,
1236
- stores: Optional[str | list[str]] = None,
1237
- *,
1238
- delete_external_data: bool = False,
1239
- ignore_errors: bool = False
1715
+ self, stores: str | list[str] | None = None, *, delete_external_data: bool = False, ignore_errors: bool = False
1240
1716
  ) -> None:
1241
1717
  """
1242
1718
  Unlinks this table's external stores.
@@ -1249,31 +1725,37 @@ class Table(SchemaObject):
1249
1725
  delete_external_data (bool): If `True`, then the external data store will also be deleted. WARNING: This
1250
1726
  is a destructive operation that will delete data outside Pixeltable, and cannot be undone.
1251
1727
  """
1252
- self._check_is_dropped()
1253
- all_stores = self.external_stores
1254
-
1255
- if stores is None:
1256
- stores = all_stores
1257
- elif isinstance(stores, str):
1258
- stores = [stores]
1259
-
1260
- # Validation
1261
- if not ignore_errors:
1262
- for store in stores:
1263
- if store not in all_stores:
1264
- raise excs.Error(f'Table `{self._name}` has no external store with that name: {store}')
1265
-
1266
- for store in stores:
1267
- self._tbl_version.unlink_external_store(store, delete_external_data=delete_external_data)
1268
- print(f'Unlinked external store from table `{self._name}`: {store}')
1728
+ from pixeltable.catalog import Catalog
1729
+
1730
+ if not self._tbl_version_path.is_mutable():
1731
+ return
1732
+ with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
1733
+ all_stores = self.external_stores()
1734
+
1735
+ if stores is None:
1736
+ stores = all_stores
1737
+ elif isinstance(stores, str):
1738
+ stores = [stores]
1739
+
1740
+ # Validation
1741
+ if not ignore_errors:
1742
+ for store_name in stores:
1743
+ if store_name not in all_stores:
1744
+ raise excs.Error(f'Table {self._name!r} has no external store with that name: {store_name}')
1745
+
1746
+ for store_name in stores:
1747
+ store = self._tbl_version.get().external_stores[store_name]
1748
+ # get hold of the store's debug string before deleting it
1749
+ store_str = str(store)
1750
+ store.unlink(self._tbl_version.get()) # might call tbl_version.drop_columns()
1751
+ self._tbl_version.get().unlink_external_store(store)
1752
+ if delete_external_data and isinstance(store, pxt.io.external_store.Project):
1753
+ store.delete()
1754
+ env.Env.get().console_logger.info(f'Unlinked external store from table {self._name!r}: {store_str}')
1269
1755
 
1270
1756
  def sync(
1271
- self,
1272
- stores: Optional[str | list[str]] = None,
1273
- *,
1274
- export_data: bool = True,
1275
- import_data: bool = True
1276
- ) -> 'pxt.io.SyncStatus':
1757
+ self, stores: str | list[str] | None = None, *, export_data: bool = True, import_data: bool = True
1758
+ ) -> UpdateStatus:
1277
1759
  """
1278
1760
  Synchronizes this table with its linked external stores.
1279
1761
 
@@ -1283,28 +1765,139 @@ class Table(SchemaObject):
1283
1765
  export_data: If `True`, data from this table will be exported to the external stores during synchronization.
1284
1766
  import_data: If `True`, data from the external stores will be imported to this table during synchronization.
1285
1767
  """
1286
- self._check_is_dropped()
1287
- all_stores = self.external_stores
1768
+ from pixeltable.catalog import Catalog
1288
1769
 
1289
- if stores is None:
1290
- stores = all_stores
1291
- elif isinstance(stores, str):
1292
- stores = [stores]
1770
+ if not self._tbl_version_path.is_mutable():
1771
+ return UpdateStatus()
1772
+ # we lock the entire tree starting at the root base table in order to ensure that all synced columns can
1773
+ # have their updates propagated down the tree
1774
+ base_tv = self._tbl_version_path.get_tbl_versions()[-1]
1775
+ with Catalog.get().begin_xact(tbl=TableVersionPath(base_tv), for_write=True, lock_mutable_tree=True):
1776
+ all_stores = self.external_stores()
1293
1777
 
1294
- for store in stores:
1295
- if store not in all_stores:
1296
- raise excs.Error(f'Table `{self._name}` has no external store with that name: {store}')
1778
+ if stores is None:
1779
+ stores = all_stores
1780
+ elif isinstance(stores, str):
1781
+ stores = [stores]
1297
1782
 
1298
- sync_status = pxt.io.SyncStatus.empty()
1299
- for store in stores:
1300
- store_obj = self._tbl_version.external_stores[store]
1301
- store_sync_status = store_obj.sync(self, export_data=export_data, import_data=import_data)
1302
- sync_status = sync_status.combine(store_sync_status)
1783
+ for store in stores:
1784
+ if store not in all_stores:
1785
+ raise excs.Error(f'Table {self._name!r} has no external store with that name: {store}')
1786
+
1787
+ sync_status = UpdateStatus()
1788
+ for store in stores:
1789
+ store_obj = self._tbl_version.get().external_stores[store]
1790
+ store_sync_status = store_obj.sync(self, export_data=export_data, import_data=import_data)
1791
+ sync_status += store_sync_status
1303
1792
 
1304
1793
  return sync_status
1305
1794
 
1306
1795
  def __dir__(self) -> list[str]:
1307
- return list(super().__dir__()) + list(self._schema.keys()) + self._query_names
1796
+ return list(super().__dir__()) + list(self._get_schema().keys())
1308
1797
 
1309
1798
  def _ipython_key_completions_(self) -> list[str]:
1310
- return list(self._schema.keys()) + self._query_names
1799
+ return list(self._get_schema().keys())
1800
+
1801
+ def get_versions(self, n: int | None = None) -> list[VersionMetadata]:
1802
+ """
1803
+ Returns information about versions of this table, most recent first.
1804
+
1805
+ `get_versions()` is intended for programmatic access to version metadata; for human-readable
1806
+ output, use [`history()`][pixeltable.Table.history] instead.
1807
+
1808
+ Args:
1809
+ n: if specified, will return at most `n` versions
1810
+
1811
+ Returns:
1812
+ A list of [VersionMetadata][pixeltable.VersionMetadata] dictionaries, one per version retrieved, most
1813
+ recent first.
1814
+
1815
+ Examples:
1816
+ Retrieve metadata about all versions of the table `tbl`:
1817
+
1818
+ >>> tbl.get_versions()
1819
+
1820
+ Retrieve metadata about the most recent 5 versions of the table `tbl`:
1821
+
1822
+ >>> tbl.get_versions(n=5)
1823
+ """
1824
+ from pixeltable.catalog import Catalog
1825
+
1826
+ if n is None:
1827
+ n = 1_000_000_000
1828
+ if not isinstance(n, int) or n < 1:
1829
+ raise excs.Error(f'Invalid value for `n`: {n}')
1830
+
1831
+ # Retrieve the table history components from the catalog
1832
+ tbl_id = self._id
1833
+ # Collect an extra version, if available, to allow for computation of the first version's schema change
1834
+ vers_list = Catalog.get().collect_tbl_history(tbl_id, n + 1)
1835
+
1836
+ # Construct the metadata change description dictionary
1837
+ md_list = [(vers_md.version_md.version, vers_md.schema_version_md.columns) for vers_md in vers_list]
1838
+ md_dict = MetadataUtils._create_md_change_dict(md_list)
1839
+
1840
+ # Construct report lines
1841
+ if len(vers_list) > n:
1842
+ assert len(vers_list) == n + 1
1843
+ over_count = 1
1844
+ else:
1845
+ over_count = 0
1846
+
1847
+ metadata_dicts: list[VersionMetadata] = []
1848
+ for vers_md in vers_list[0 : len(vers_list) - over_count]:
1849
+ version = vers_md.version_md.version
1850
+ schema_change = md_dict.get(version, None)
1851
+ update_status = vers_md.version_md.update_status
1852
+ if update_status is None:
1853
+ update_status = UpdateStatus()
1854
+ change_type: Literal['schema', 'data'] = 'schema' if schema_change is not None else 'data'
1855
+ rcs = update_status.row_count_stats + update_status.cascade_row_count_stats
1856
+ metadata_dicts.append(
1857
+ VersionMetadata(
1858
+ version=version,
1859
+ created_at=datetime.datetime.fromtimestamp(vers_md.version_md.created_at, tz=datetime.timezone.utc),
1860
+ user=vers_md.version_md.user,
1861
+ change_type=change_type,
1862
+ inserts=rcs.ins_rows,
1863
+ updates=rcs.upd_rows,
1864
+ deletes=rcs.del_rows,
1865
+ errors=rcs.num_excs,
1866
+ computed=rcs.computed_values,
1867
+ schema_change=schema_change,
1868
+ )
1869
+ )
1870
+
1871
+ return metadata_dicts
1872
+
1873
+ def history(self, n: int | None = None) -> pd.DataFrame:
1874
+ """
1875
+ Returns a human-readable report about versions of this table.
1876
+
1877
+ `history()` is intended for human-readable output of version metadata; for programmatic access,
1878
+ use [`get_versions()`][pixeltable.Table.get_versions] instead.
1879
+
1880
+ Args:
1881
+ n: if specified, will return at most `n` versions
1882
+
1883
+ Returns:
1884
+ A report with information about each version, one per row, most recent first.
1885
+
1886
+ Examples:
1887
+ Report all versions of the table:
1888
+
1889
+ >>> tbl.history()
1890
+
1891
+ Report only the most recent 5 changes to the table:
1892
+
1893
+ >>> tbl.history(n=5)
1894
+ """
1895
+ versions = self.get_versions(n)
1896
+ assert len(versions) > 0
1897
+ return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
1898
+
1899
+ def __check_mutable(self, op_descr: str) -> None:
1900
+ if self._tbl_version_path.is_replica():
1901
+ raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
1902
+ if self._tbl_version_path.is_snapshot():
1903
+ raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')