pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/globals.py CHANGED
@@ -1,46 +1,94 @@
1
- import dataclasses
1
+ from __future__ import annotations
2
+
2
3
  import logging
3
- from typing import Any, Iterable, Optional, Union, Literal
4
- from uuid import UUID
4
+ import os
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, TypedDict, Union
5
7
 
6
8
  import pandas as pd
7
- import sqlalchemy as sql
9
+ import pydantic
8
10
  from pandas.io.formats.style import Styler
9
- from sqlalchemy.util.preloaded import orm
10
11
 
11
- import pixeltable.exceptions as excs
12
- import pixeltable.exprs as exprs
13
- from pixeltable import DataFrame, catalog, func
14
- from pixeltable.catalog import Catalog
15
- from pixeltable.dataframe import DataFrameResultSet
12
+ from pixeltable import Query, catalog, exceptions as excs, exprs, func, share, type_system as ts
13
+ from pixeltable.catalog import Catalog, TableVersionPath
14
+ from pixeltable.catalog.insertable_table import OnErrorParameter
15
+ from pixeltable.config import Config
16
16
  from pixeltable.env import Env
17
+ from pixeltable.io.table_data_conduit import QueryTableDataConduit, TableDataConduit
17
18
  from pixeltable.iterators import ComponentIterator
18
- from pixeltable.metadata import schema
19
- from pixeltable.utils.filecache import FileCache
19
+
20
+ if TYPE_CHECKING:
21
+ import datasets # type: ignore[import-untyped]
22
+
23
+ RowData = list[dict[str, Any]]
24
+ TableDataSource = Union[
25
+ str,
26
+ os.PathLike,
27
+ Path, # OS paths, filenames, URLs
28
+ Iterable[dict[str, Any]], # dictionaries of values
29
+ Iterable[pydantic.BaseModel], # Pydantic model instances
30
+ catalog.Table, # Pixeltable Table
31
+ Query, # Pixeltable Query
32
+ pd.DataFrame, # pandas DataFrame
33
+ datasets.Dataset,
34
+ datasets.DatasetDict, # Huggingface datasets
35
+ ]
36
+
20
37
 
21
38
  _logger = logging.getLogger('pixeltable')
22
39
 
23
40
 
24
- def init() -> None:
41
+ def init(config_overrides: dict[str, Any] | None = None) -> None:
25
42
  """Initializes the Pixeltable environment."""
43
+ if config_overrides is None:
44
+ config_overrides = {}
45
+ Config.init(config_overrides)
26
46
  _ = Catalog.get()
27
47
 
28
48
 
29
49
  def create_table(
30
- path_str: str,
31
- schema_or_df: Union[dict[str, Any], DataFrame],
50
+ path: str,
51
+ schema: dict[str, Any] | None = None,
32
52
  *,
33
- primary_key: Optional[Union[str, list[str]]] = None,
53
+ source: TableDataSource | None = None,
54
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
55
+ schema_overrides: dict[str, Any] | None = None,
56
+ create_default_idxs: bool = True,
57
+ on_error: Literal['abort', 'ignore'] = 'abort',
58
+ primary_key: str | list[str] | None = None,
34
59
  num_retained_versions: int = 10,
35
60
  comment: str = '',
36
- media_validation: Literal['on_read', 'on_write'] = 'on_write'
61
+ media_validation: Literal['on_read', 'on_write'] = 'on_write',
62
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
63
+ extra_args: dict[str, Any] | None = None, # Additional arguments to data source provider
37
64
  ) -> catalog.Table:
38
- """Create a new base table.
65
+ """Create a new base table. Exactly one of `schema` or `source` must be provided.
66
+
67
+ If a `schema` is provided, then an empty table will be created with the specified schema.
68
+
69
+ If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
70
+ contents of the specified data, and the data will be imported from the specified source into the new table. The
71
+ source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
39
72
 
40
73
  Args:
41
- path_str: Path to the table.
42
- schema_or_df: Either a dictionary that maps column names to column types, or a
43
- [`DataFrame`][pixeltable.DataFrame] whose contents and schema will be used to pre-populate the table.
74
+ path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
75
+ schema: Schema for the new table, mapping column names to Pixeltable types.
76
+ source: A data source (file, URL, Table, Query, or list of rows) to import from.
77
+ source_format: Must be used in conjunction with a `source`.
78
+ If specified, then the given format will be used to read the source data. (Otherwise,
79
+ Pixeltable will attempt to infer the format from the source data.)
80
+ schema_overrides: Must be used in conjunction with a `source`.
81
+ If specified, then columns in `schema_overrides` will be given the specified types.
82
+ (Pixeltable will attempt to infer the types of any columns not specified.)
83
+ create_default_idxs: If True, creates a B-tree index on every scalar and media column that is not computed,
84
+ except for boolean columns.
85
+ on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
86
+ invalid media file (such as a corrupt image) for one of the inserted rows.
87
+
88
+ - If `on_error='abort'`, then an exception will be raised and the rows will not be inserted.
89
+ - If `on_error='ignore'`, then execution will continue and the rows will be inserted. Any cells
90
+ with errors will have a `None` value for that cell, with information about the error stored in the
91
+ corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
44
92
  primary_key: An optional column name or list of column names to use as the primary key(s) of the
45
93
  table.
46
94
  num_retained_versions: Number of versions of the table to retain.
@@ -49,12 +97,28 @@ def create_table(
49
97
 
50
98
  - `'on_read'`: validate media files at query time
51
99
  - `'on_write'`: validate media files during insert/update operations
100
+ if_exists: Determines the behavior if a table already exists at the specified path location.
101
+
102
+ - `'error'`: raise an error
103
+ - `'ignore'`: do nothing and return the existing table handle
104
+ - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
105
+ raise an error if the existing table has views or snapshots
106
+ - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
107
+ extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
108
+ passed along to the source data provider.
52
109
 
53
110
  Returns:
54
- A handle to the newly created [`Table`][pixeltable.Table].
111
+ A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
112
+ Please note the schema of the existing table may not match the schema provided in the call.
55
113
 
56
114
  Raises:
57
- Error: if the path already exists or is invalid.
115
+ Error: if
116
+
117
+ - the path is invalid, or
118
+ - the path already exists and `if_exists='error'`, or
119
+ - the path already exists and is not a table, or
120
+ - an error occurs while attempting to create the table, or
121
+ - an error occurs while attempting to import data from the source.
58
122
 
59
123
  Examples:
60
124
  Create a table with an int and a string column:
@@ -66,164 +130,293 @@ def create_table(
66
130
 
67
131
  >>> tbl1 = pxt.get_table('orig_table')
68
132
  ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
69
- """
70
- path = catalog.Path(path_str)
71
- Catalog.get().paths.check_is_valid(path, expected=None)
72
- dir = Catalog.get().paths[path.parent]
73
-
74
- df: Optional[DataFrame] = None
75
- if isinstance(schema_or_df, dict):
76
- schema = schema_or_df
77
- elif isinstance(schema_or_df, DataFrame):
78
- df = schema_or_df
79
- schema = df.schema
80
- elif isinstance(schema_or_df, DataFrameResultSet):
81
- raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame. (Is there an extraneous call to `collect()`?)')
82
- else:
83
- raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
84
133
 
85
- if len(schema) == 0:
86
- raise excs.Error(f'Table schema is empty: `{path_str}`')
134
+ Create a table if it does not already exist, otherwise get the existing table:
135
+
136
+ >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
137
+
138
+ Create a table with an int and a float column, and replace any existing table:
139
+
140
+ >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.Float}, if_exists='replace')
141
+
142
+ Create a table from a CSV file:
143
+
144
+ >>> tbl = pxt.create_table('my_table', source='data.csv')
87
145
 
88
- if primary_key is None:
89
- primary_key = []
90
- elif isinstance(primary_key, str):
91
- primary_key = [primary_key]
146
+ Create a table with an auto-generated UUID primary key:
147
+
148
+ >>> tbl = pxt.create_table(
149
+ ... 'my_table',
150
+ ... schema={'id': pxt.functions.uuid.uuid4(), 'data': pxt.String},
151
+ ... primary_key=['id']
152
+ ... )
153
+ """
154
+ from pixeltable.io.table_data_conduit import UnkTableDataConduit
155
+ from pixeltable.io.utils import normalize_primary_key_parameter
156
+
157
+ if (schema is None) == (source is None):
158
+ raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
159
+
160
+ if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
161
+ raise excs.Error('`schema` must be a non-empty dictionary')
162
+
163
+ path_obj = catalog.Path.parse(path)
164
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
165
+ media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
166
+ primary_key: list[str] | None = normalize_primary_key_parameter(primary_key)
167
+ data_source: TableDataConduit | None = None
168
+ if source is not None:
169
+ if isinstance(source, str) and source.strip().startswith('pxt://'):
170
+ raise excs.Error(
171
+ 'create_table(): Creating a table directly from a cloud URI is not supported.'
172
+ ' Please replicate the table locally first using `pxt.replicate()`:\n'
173
+ "replica_tbl = pxt.replicate('pxt://path/to/remote_table', 'local_replica_name')\n"
174
+ "pxt.create_table('new_table_name', source=replica_tbl)"
175
+ )
176
+ tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
177
+ tds.check_source_format()
178
+ data_source = tds.specialize()
179
+ src_schema_overrides: dict[str, ts.ColumnType] = {}
180
+ if schema_overrides is not None:
181
+ for col_name, py_type in schema_overrides.items():
182
+ col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
183
+ if col_type is None:
184
+ raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
185
+ src_schema_overrides[col_name] = col_type
186
+ data_source.src_schema_overrides = src_schema_overrides
187
+ data_source.src_pk = primary_key
188
+ data_source.infer_schema()
189
+ schema = data_source.pxt_schema
190
+ primary_key = data_source.pxt_pk
191
+ is_direct_query = data_source.is_direct_query()
92
192
  else:
93
- if not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
94
- raise excs.Error('primary_key must be a single column name or a list of column names')
193
+ is_direct_query = False
194
+
195
+ if len(schema) == 0 or not isinstance(schema, dict):
196
+ raise excs.Error(
197
+ 'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
198
+ )
199
+
200
+ tbl, was_created = Catalog.get().create_table(
201
+ path_obj,
202
+ schema,
203
+ if_exists=if_exists_,
204
+ primary_key=primary_key,
205
+ comment=comment,
206
+ media_validation=media_validation_,
207
+ num_retained_versions=num_retained_versions,
208
+ create_default_idxs=create_default_idxs,
209
+ )
95
210
 
96
- tbl = catalog.InsertableTable._create(
97
- dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
98
- comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
99
- Catalog.get().paths[path] = tbl
211
+ # TODO: combine data loading with table creation into a single transaction
212
+ if was_created:
213
+ fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
214
+ if isinstance(data_source, QueryTableDataConduit):
215
+ query = data_source.pxt_query
216
+ with Catalog.get().begin_xact(tbl=tbl._tbl_version_path, for_write=True, lock_mutable_tree=True):
217
+ tbl._tbl_version.get().insert(None, query, fail_on_exception=fail_on_exception)
218
+ elif data_source is not None and not is_direct_query:
219
+ tbl.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
100
220
 
101
- _logger.info(f'Created table `{path_str}`.')
102
221
  return tbl
103
222
 
104
223
 
105
224
  def create_view(
106
- path_str: str,
107
- base: Union[catalog.Table, DataFrame],
225
+ path: str,
226
+ base: catalog.Table | Query,
108
227
  *,
109
- additional_columns: Optional[dict[str, Any]] = None,
228
+ additional_columns: dict[str, Any] | None = None,
110
229
  is_snapshot: bool = False,
111
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
230
+ create_default_idxs: bool = False,
231
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
112
232
  num_retained_versions: int = 10,
113
233
  comment: str = '',
114
234
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
115
- ignore_errors: bool = False,
116
- ) -> Optional[catalog.Table]:
235
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
236
+ ) -> catalog.Table | None:
117
237
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
118
238
 
119
239
  Args:
120
- path_str: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
240
+ path: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
121
241
  `dir1.my_view`.
122
- base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
242
+ base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`Query`][pixeltable.Query] to
123
243
  base the view on.
124
244
  additional_columns: If specified, will add these columns to the view once it is created. The format
125
- of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
245
+ of the `additional_columns` parameter is identical to the format of the `schema` parameter in
126
246
  [`create_table`][pixeltable.create_table].
127
247
  is_snapshot: Whether the view is a snapshot. Setting this to `True` is equivalent to calling
128
248
  [`create_snapshot`][pixeltable.create_snapshot].
249
+ create_default_idxs: Whether to create default indexes on the view's columns (the base's columns are excluded).
250
+ Cannot be `True` for snapshots.
129
251
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
130
252
  the base table.
131
253
  num_retained_versions: Number of versions of the view to retain.
132
254
  comment: Optional comment for the view.
133
- ignore_errors: if True, fail silently if the path already exists or is invalid.
255
+ media_validation: Media validation policy for the view.
256
+
257
+ - `'on_read'`: validate media files at query time
258
+ - `'on_write'`: validate media files during insert/update operations
259
+ if_exists: Directive regarding how to handle if the path already exists.
260
+ Must be one of the following:
261
+
262
+ - `'error'`: raise an error
263
+ - `'ignore'`: do nothing and return the existing view handle
264
+ - `'replace'`: if the existing view has no dependents, drop and replace it with a new one
265
+ - `'replace_force'`: drop the existing view and all its dependents, and create a new one
134
266
 
135
267
  Returns:
136
268
  A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
137
- exists or is invalid and `ignore_errors=True`, returns `None`.
269
+ exists and `if_exists='ignore'`, returns a handle to the existing view. Please note the schema
270
+ or the base of the existing view may not match those provided in the call.
138
271
 
139
272
  Raises:
140
- Error: if the path already exists or is invalid and `ignore_errors=False`.
273
+ Error: if
274
+
275
+ - the path is invalid, or
276
+ - the path already exists and `if_exists='error'`, or
277
+ - the path already exists and is not a view, or
278
+ - an error occurs while attempting to create the view.
141
279
 
142
280
  Examples:
143
281
  Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10:
144
282
 
145
283
  >>> tbl = pxt.get_table('my_table')
146
284
  ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
285
+
286
+ Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10,
287
+ but only if it does not already exist. Otherwise, get the existing view named `my_view`:
288
+
289
+ >>> tbl = pxt.get_table('my_table')
290
+ ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10), if_exists='ignore')
291
+
292
+ Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 100,
293
+ and replace any existing view named `my_view`:
294
+
295
+ >>> tbl = pxt.get_table('my_table')
296
+ ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
147
297
  """
148
- where: Optional[exprs.Expr] = None
298
+ if is_snapshot and create_default_idxs is True:
299
+ raise excs.Error('Cannot create default indexes on a snapshot')
300
+ tbl_version_path: TableVersionPath
301
+ select_list: list[tuple[exprs.Expr, str | None]] | None = None
302
+ where: exprs.Expr | None = None
149
303
  if isinstance(base, catalog.Table):
150
304
  tbl_version_path = base._tbl_version_path
151
- elif isinstance(base, DataFrame):
152
- base._validate_mutable('create_view')
153
- if len(base._from_clause.tbls) > 1:
154
- raise excs.Error('Cannot create a view of a join')
305
+ sample_clause = None
306
+ elif isinstance(base, Query):
307
+ base._validate_mutable_op_sequence('create_view', allow_select=True)
155
308
  tbl_version_path = base._from_clause.tbls[0]
156
309
  where = base.where_clause
310
+ sample_clause = base.sample_clause
311
+ select_list = base.select_list
312
+ if sample_clause is not None and not is_snapshot and not sample_clause.is_repeatable:
313
+ raise excs.Error('Non-snapshot views cannot be created with non-fractional or stratified sampling')
157
314
  else:
158
- raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
159
- assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
160
- path = catalog.Path(path_str)
161
- try:
162
- Catalog.get().paths.check_is_valid(path, expected=None)
163
- except Exception as e:
164
- if ignore_errors:
165
- return None
166
- else:
167
- raise e
168
- dir = Catalog.get().paths[path.parent]
315
+ raise excs.Error('`base` must be an instance of `Table` or `Query`')
316
+ assert isinstance(base, (catalog.Table, Query))
317
+
318
+ if tbl_version_path.is_replica():
319
+ raise excs.Error('Cannot create a view or snapshot on top of a replica')
320
+
321
+ path_obj = catalog.Path.parse(path)
322
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
323
+ media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
169
324
 
170
325
  if additional_columns is None:
171
326
  additional_columns = {}
172
- if iterator is None:
173
- iterator_class, iterator_args = None, None
174
327
  else:
175
- iterator_class, iterator_args = iterator
176
-
177
- view = catalog.View._create(
178
- dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
179
- is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
180
- num_retained_versions=num_retained_versions, comment=comment,
181
- media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
182
- Catalog.get().paths[path] = view
183
- _logger.info(f'Created view `{path_str}`.')
184
- FileCache.get().emit_eviction_warnings()
185
- return view
328
+ # additional columns should not be in the base table
329
+ for col_name in additional_columns:
330
+ if col_name in [c.name for c in tbl_version_path.columns()]:
331
+ raise excs.Error(
332
+ f'Column {col_name!r} already exists in the base table '
333
+ f'{tbl_version_path.get_column(col_name).get_tbl().name}.'
334
+ )
335
+
336
+ return Catalog.get().create_view(
337
+ path_obj,
338
+ tbl_version_path,
339
+ select_list=select_list,
340
+ where=where,
341
+ sample_clause=sample_clause,
342
+ additional_columns=additional_columns,
343
+ is_snapshot=is_snapshot,
344
+ create_default_idxs=create_default_idxs,
345
+ iterator=iterator,
346
+ num_retained_versions=num_retained_versions,
347
+ comment=comment,
348
+ media_validation=media_validation_,
349
+ if_exists=if_exists_,
350
+ )
186
351
 
187
352
 
188
353
  def create_snapshot(
189
354
  path_str: str,
190
- base: Union[catalog.Table, DataFrame],
355
+ base: catalog.Table | Query,
191
356
  *,
192
- additional_columns: Optional[dict[str, Any]] = None,
193
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
357
+ additional_columns: dict[str, Any] | None = None,
358
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
194
359
  num_retained_versions: int = 10,
195
360
  comment: str = '',
196
361
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
197
- ignore_errors: bool = False,
198
- ) -> Optional[catalog.Table]:
362
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
363
+ ) -> catalog.Table | None:
199
364
  """Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
200
365
 
201
366
  Args:
202
367
  path_str: A name for the snapshot; can be either a simple name such as `my_snapshot`, or a pathname such as
203
368
  `dir1.my_snapshot`.
204
- base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
369
+ base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`Query`][pixeltable.Query] to
205
370
  base the snapshot on.
206
371
  additional_columns: If specified, will add these columns to the snapshot once it is created. The format
207
- of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
372
+ of the `additional_columns` parameter is identical to the format of the `schema` parameter in
208
373
  [`create_table`][pixeltable.create_table].
209
374
  iterator: The iterator to use for this snapshot. If specified, then this snapshot will be a one-to-many view of
210
375
  the base table.
211
376
  num_retained_versions: Number of versions of the view to retain.
212
- comment: Optional comment for the view.
213
- ignore_errors: if True, fail silently if the path already exists or is invalid.
377
+ comment: Optional comment for the snapshot.
378
+ media_validation: Media validation policy for the snapshot.
379
+
380
+ - `'on_read'`: validate media files at query time
381
+ - `'on_write'`: validate media files during insert/update operations
382
+ if_exists: Directive regarding how to handle if the path already exists.
383
+ Must be one of the following:
384
+
385
+ - `'error'`: raise an error
386
+ - `'ignore'`: do nothing and return the existing snapshot handle
387
+ - `'replace'`: if the existing snapshot has no dependents, drop and replace it with a new one
388
+ - `'replace_force'`: drop the existing snapshot and all its dependents, and create a new one
214
389
 
215
390
  Returns:
216
- A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot. If the path already
217
- exists or is invalid and `ignore_errors=True`, returns `None`.
391
+ A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot.
392
+ Please note the schema or base of the existing snapshot may not match those provided in the call.
218
393
 
219
394
  Raises:
220
- Error: if the path already exists or is invalid and `ignore_errors=False`.
395
+ Error: if
396
+
397
+ - the path is invalid, or
398
+ - the path already exists and `if_exists='error'`, or
399
+ - the path already exists and is not a snapshot, or
400
+ - an error occurs while attempting to create the snapshot.
221
401
 
222
402
  Examples:
223
- Create a snapshot of `my_table`:
403
+ Create a snapshot `my_snapshot` of a table `my_table`:
224
404
 
225
405
  >>> tbl = pxt.get_table('my_table')
226
406
  ... snapshot = pxt.create_snapshot('my_snapshot', tbl)
407
+
408
+ Create a snapshot `my_snapshot` of a view `my_view` with additional int column `col3`,
409
+ if `my_snapshot` does not already exist:
410
+
411
+ >>> view = pxt.get_table('my_view')
412
+ ... snapshot = pxt.create_snapshot(
413
+ ... 'my_snapshot', view, additional_columns={'col3': pxt.Int}, if_exists='ignore'
414
+ ... )
415
+
416
+ Create a snapshot `my_snapshot` on a table `my_table`, and replace any existing snapshot named `my_snapshot`:
417
+
418
+ >>> tbl = pxt.get_table('my_table')
419
+ ... snapshot = pxt.create_snapshot('my_snapshot', tbl, if_exists='replace_force')
227
420
  """
228
421
  return create_view(
229
422
  path_str,
@@ -234,15 +427,71 @@ def create_snapshot(
234
427
  num_retained_versions=num_retained_versions,
235
428
  comment=comment,
236
429
  media_validation=media_validation,
237
- ignore_errors=ignore_errors,
430
+ if_exists=if_exists,
238
431
  )
239
432
 
240
433
 
241
- def get_table(path: str) -> catalog.Table:
434
+ def publish(
435
+ source: str | catalog.Table,
436
+ destination_uri: str,
437
+ bucket_name: str | None = None,
438
+ access: Literal['public', 'private'] = 'private',
439
+ ) -> None:
440
+ """
441
+ Publishes a replica of a local Pixeltable table to Pixeltable cloud. A given table can be published to at most one
442
+ URI per Pixeltable cloud database.
443
+
444
+ Args:
445
+ source: Path or table handle of the local table to be published.
446
+ destination_uri: Remote URI where the replica will be published, such as `'pxt://org_name/my_dir/my_table'`.
447
+ bucket_name: The name of the bucket to use to store replica's data. The bucket must be registered with
448
+ Pixeltable cloud. If no `bucket_name` is provided, the default storage bucket for the destination
449
+ database will be used.
450
+ access: Access control for the replica.
451
+
452
+ - `'public'`: Anyone can access this replica.
453
+ - `'private'`: Only the host organization can access.
454
+ """
455
+ if not destination_uri.startswith('pxt://'):
456
+ raise excs.Error("`destination_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
457
+
458
+ if isinstance(source, str):
459
+ source = get_table(source)
460
+
461
+ share.push_replica(destination_uri, source, bucket_name, access)
462
+
463
+
464
+ def replicate(remote_uri: str, local_path: str) -> catalog.Table:
465
+ """
466
+ Retrieve a replica from Pixeltable cloud as a local table. This will create a full local copy of the replica in a
467
+ way that preserves the table structure of the original source data. Once replicated, the local table can be
468
+ queried offline just as any other Pixeltable table.
469
+
470
+ Args:
471
+ remote_uri: Remote URI of the table to be replicated, such as `'pxt://org_name/my_dir/my_table'` or
472
+ `'pxt://org_name/my_dir/my_table:5'` (with version 5).
473
+ local_path: Local table path where the replica will be created, such as `'my_new_dir.my_new_tbl'`. It can be
474
+ the same or different from the cloud table name.
475
+
476
+ Returns:
477
+ A handle to the newly created local replica table.
478
+ """
479
+ if not remote_uri.startswith('pxt://'):
480
+ raise excs.Error("`remote_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
481
+
482
+ return share.pull_replica(local_path, remote_uri)
483
+
484
+
485
+ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') -> catalog.Table | None:
242
486
  """Get a handle to an existing table, view, or snapshot.
243
487
 
244
488
  Args:
245
489
  path: Path to the table.
490
+ if_not_exists: Directive regarding how to handle if the path does not exist.
491
+ Must be one of the following:
492
+
493
+ - `'error'`: raise an error
494
+ - `'ignore'`: do nothing and return `None`
246
495
 
247
496
  Returns:
248
497
  A handle to the [`Table`][pixeltable.Table].
@@ -262,20 +511,39 @@ def get_table(path: str) -> catalog.Table:
262
511
  Handles to views and snapshots are retrieved in the same way:
263
512
 
264
513
  >>> tbl = pxt.get_table('my_snapshot')
514
+
515
+ Get a handle to a specific version of a table:
516
+
517
+ >>> tbl = pxt.get_table('my_table:722')
265
518
  """
266
- p = catalog.Path(path)
267
- Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
268
- obj = Catalog.get().paths[p]
269
- assert isinstance(obj, catalog.Table)
270
- return obj
519
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
520
+ path_obj = catalog.Path.parse(path, allow_versioned_path=True)
521
+ tbl = Catalog.get().get_table(path_obj, if_not_exists_)
522
+ return tbl
271
523
 
272
524
 
273
- def move(path: str, new_path: str) -> None:
525
+ def move(
526
+ path: str,
527
+ new_path: str,
528
+ *,
529
+ if_exists: Literal['error', 'ignore'] = 'error',
530
+ if_not_exists: Literal['error', 'ignore'] = 'error',
531
+ ) -> None:
274
532
  """Move a schema object to a new directory and/or rename a schema object.
275
533
 
276
534
  Args:
277
535
  path: absolute path to the existing schema object.
278
536
  new_path: absolute new path for the schema object.
537
+ if_exists: Directive regarding how to handle if a schema object already exists at the new path.
538
+ Must be one of the following:
539
+
540
+ - `'error'`: raise an error
541
+ - `'ignore'`: do nothing and return
542
+ if_not_exists: Directive regarding how to handle if the source path does not exist.
543
+ Must be one of the following:
544
+
545
+ - `'error'`: raise an error
546
+ - `'ignore'`: do nothing and return
279
547
 
280
548
  Raises:
281
549
  Error: If path does not exist or new_path already exists.
@@ -289,26 +557,40 @@ def move(path: str, new_path: str) -> None:
289
557
 
290
558
  >>>> pxt.move('dir1.my_table', 'dir1.new_name')
291
559
  """
292
- p = catalog.Path(path)
293
- Catalog.get().paths.check_is_valid(p, expected=catalog.SchemaObject)
294
- new_p = catalog.Path(new_path)
295
- Catalog.get().paths.check_is_valid(new_p, expected=None)
296
- obj = Catalog.get().paths[p]
297
- Catalog.get().paths.move(p, new_p)
298
- new_dir = Catalog.get().paths[new_p.parent]
299
- obj._move(new_p.name, new_dir._id)
300
-
301
-
302
- def drop_table(table: Union[str, catalog.Table], force: bool = False, ignore_errors: bool = False) -> None:
303
- """Drop a table, view, or snapshot.
560
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
561
+ if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
562
+ raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
563
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
564
+ if path == new_path:
565
+ raise excs.Error('move(): source and destination cannot be identical')
566
+ path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
567
+ if path_obj.is_ancestor(new_path_obj):
568
+ raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
569
+ Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
570
+
571
+
572
+ def drop_table(
573
+ table: str | catalog.Table, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
574
+ ) -> None:
575
+ """Drop a table, view, snapshot, or replica.
304
576
 
305
577
  Args:
306
- table: Fully qualified name, or handle, of the table to be dropped.
578
+ table: Fully qualified name or table handle of the table to be dropped; or a remote URI of a cloud replica to
579
+ be deleted.
307
580
  force: If `True`, will also drop all views and sub-views of this table.
308
- ignore_errors: If `True`, return silently if the table does not exist (without throwing an exception).
581
+ if_not_exists: Directive regarding how to handle if the path does not exist.
582
+ Must be one of the following:
583
+
584
+ - `'error'`: raise an error
585
+ - `'ignore'`: do nothing and return
309
586
 
310
587
  Raises:
311
- Error: If the name does not exist or does not designate a table object, and `ignore_errors=False`.
588
+ Error: if the qualified name
589
+
590
+ - is invalid, or
591
+ - does not exist and `if_not_exists='error'`, or
592
+ - does not designate a table object, or
593
+ - designates a table object but has dependents and `force=False`.
312
594
 
313
595
  Examples:
314
596
  Drop a table by its fully qualified name:
@@ -318,34 +600,85 @@ def drop_table(table: Union[str, catalog.Table], force: bool = False, ignore_err
318
600
  >>> t = pxt.get_table('subdir.my_table')
319
601
  ... pxt.drop_table(t)
320
602
 
603
+ Drop a table if it exists, otherwise do nothing:
604
+ >>> pxt.drop_table('subdir.my_table', if_not_exists='ignore')
605
+
606
+ Drop a table and all its dependents:
607
+ >>> pxt.drop_table('subdir.my_table', force=True)
321
608
  """
322
- cat = Catalog.get()
323
- if isinstance(table, str):
324
- tbl_path_obj = catalog.Path(table)
325
- try:
326
- cat.paths.check_is_valid(tbl_path_obj, expected=catalog.Table)
327
- except Exception as e:
328
- if ignore_errors or force:
329
- _logger.info(f'Skipped table `{table}` (does not exist).')
330
- return
331
- else:
332
- raise e
333
- tbl = cat.paths[tbl_path_obj]
609
+ tbl_path: str
610
+ if isinstance(table, catalog.Table):
611
+ # if we're dropping a table by handle, we first need to get the current path, then drop the S lock on
612
+ # the Table record, and then get X locks in the correct order (first containing directory, then table)
613
+ with Catalog.get().begin_xact(for_write=False):
614
+ tbl_path = table._path()
334
615
  else:
335
- tbl = table
336
- tbl_path_obj = catalog.Path(tbl._path)
616
+ assert isinstance(table, str)
617
+ tbl_path = table
618
+
619
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
337
620
 
338
- assert isinstance(tbl, catalog.Table)
339
- if len(cat.tbl_dependents[tbl._id]) > 0:
340
- dependent_paths = [dep._path for dep in cat.tbl_dependents[tbl._id]]
621
+ if tbl_path.startswith('pxt://'):
622
+ # Remote table
341
623
  if force:
342
- for dependent_path in dependent_paths:
343
- drop_table(dependent_path, force=True)
624
+ raise excs.Error('Cannot use `force=True` with a cloud replica URI.')
625
+ # TODO: Handle if_not_exists properly
626
+ share.delete_replica(tbl_path)
627
+ else:
628
+ # Local table
629
+ path_obj = catalog.Path.parse(tbl_path)
630
+ Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
631
+
632
+
633
+ def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
634
+ """Get the contents of a Pixeltable directory.
635
+
636
+ Args:
637
+ dir_path: Path to the directory. Defaults to the root directory.
638
+ recursive: If `False`, returns only those tables and directories that are directly contained in specified
639
+ directory; if `True`, returns all tables and directories that are descendants of the specified directory,
640
+ recursively.
641
+
642
+ Returns:
643
+ A [`DirContents`][pixeltable.DirContents] object representing the contents of the specified directory.
644
+
645
+ Raises:
646
+ Error: If the path does not exist or does not designate a directory.
647
+
648
+ Examples:
649
+ Get contents of top-level directory:
650
+
651
+ >>> pxt.get_dir_contents()
652
+
653
+ Get contents of 'dir1':
654
+
655
+ >>> pxt.get_dir_contents('dir1')
656
+ """
657
+ path_obj = catalog.Path.parse(dir_path, allow_empty_path=True)
658
+ catalog_entries = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
659
+ dirs: list[str] = []
660
+ tables: list[str] = []
661
+ _assemble_dir_contents(dir_path, catalog_entries, dirs, tables)
662
+ dirs.sort()
663
+ tables.sort()
664
+ return DirContents(dirs=dirs, tables=tables)
665
+
666
+
667
+ def _assemble_dir_contents(
668
+ dir_path: str, catalog_entries: dict[str, Catalog.DirEntry], dirs: list[str], tables: list[str]
669
+ ) -> None:
670
+ for name, entry in catalog_entries.items():
671
+ if name.startswith('_'):
672
+ continue # Skip system paths
673
+ path = f'{dir_path}.{name}' if len(dir_path) > 0 else name
674
+ if entry.dir is not None:
675
+ dirs.append(path)
676
+ if entry.dir_entries is not None:
677
+ _assemble_dir_contents(path, entry.dir_entries, dirs, tables)
344
678
  else:
345
- raise excs.Error(f'Table {tbl._path} has dependents: {", ".join(dependent_paths)}')
346
- tbl._drop()
347
- del cat.paths[tbl_path_obj]
348
- _logger.info(f'Dropped table `{tbl._path}`.')
679
+ assert entry.table is not None
680
+ assert not entry.dir_entries
681
+ tables.append(path)
349
682
 
350
683
 
351
684
  def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
@@ -371,21 +704,42 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
371
704
 
372
705
  >>> pxt.list_tables('dir1')
373
706
  """
374
- assert dir_path is not None
375
- path = catalog.Path(dir_path, empty_is_valid=True)
376
- Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
377
- return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
707
+ return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
708
+
378
709
 
710
+ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
711
+ path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
712
+ contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
713
+ return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
379
714
 
380
- def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
715
+
716
+ def create_dir(
717
+ path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
718
+ ) -> catalog.Dir | None:
381
719
  """Create a directory.
382
720
 
383
721
  Args:
384
- path_str: Path to the directory.
385
- ignore_errors: if `True`, will return silently instead of throwing an exception if an error occurs.
722
+ path: Path to the directory.
723
+ if_exists: Directive regarding how to handle if the path already exists.
724
+ Must be one of the following:
725
+
726
+ - `'error'`: raise an error
727
+ - `'ignore'`: do nothing and return the existing directory handle
728
+ - `'replace'`: if the existing directory is empty, drop it and create a new one
729
+ - `'replace_force'`: drop the existing directory and all its children, and create a new one
730
+ parents: Create missing parent directories.
731
+
732
+ Returns:
733
+ A handle to the newly created directory, or to an already existing directory at the path when
734
+ `if_exists='ignore'`. Please note the existing directory may not be empty.
386
735
 
387
736
  Raises:
388
- Error: If the path already exists or the parent is not a directory, and `ignore_errors=False`.
737
+ Error: If
738
+
739
+ - the path is invalid, or
740
+ - the path already exists and `if_exists='error'`, or
741
+ - the path already exists and is not a directory, or
742
+ - an error occurs while attempting to create the directory.
389
743
 
390
744
  Examples:
391
745
  >>> pxt.create_dir('my_dir')
@@ -393,96 +747,155 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.D
393
747
  Create a subdirectory:
394
748
 
395
749
  >>> pxt.create_dir('my_dir.sub_dir')
750
+
751
+ Create a subdirectory only if it does not already exist, otherwise do nothing:
752
+
753
+ >>> pxt.create_dir('my_dir.sub_dir', if_exists='ignore')
754
+
755
+ Create a directory and replace if it already exists:
756
+
757
+ >>> pxt.create_dir('my_dir', if_exists='replace_force')
758
+
759
+ Create a subdirectory along with its ancestors:
760
+
761
+ >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
396
762
  """
397
- try:
398
- path = catalog.Path(path_str)
399
- Catalog.get().paths.check_is_valid(path, expected=None)
400
- parent = Catalog.get().paths[path.parent]
401
- assert parent is not None
402
- with orm.Session(Env.get().engine, future=True) as session:
403
- dir_md = schema.DirMd(name=path.name)
404
- dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
405
- session.add(dir_record)
406
- session.flush()
407
- assert dir_record.id is not None
408
- assert isinstance(dir_record.id, UUID)
409
- dir = catalog.Dir(dir_record.id, parent._id, path.name)
410
- Catalog.get().paths[path] = dir
411
- session.commit()
412
- _logger.info(f'Created directory `{path_str}`.')
413
- print(f'Created directory `{path_str}`.')
414
- return dir
415
- except excs.Error as e:
416
- if ignore_errors:
417
- return None
418
- else:
419
- raise e
763
+ path_obj = catalog.Path.parse(path)
764
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
765
+ return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
420
766
 
421
767
 
422
- def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
768
+ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
423
769
  """Remove a directory.
424
770
 
425
771
  Args:
426
- path_str: Name or path of the directory.
772
+ path: Name or path of the directory.
427
773
  force: If `True`, will also drop all tables and subdirectories of this directory, recursively, along
428
774
  with any views or snapshots that depend on any of the dropped tables.
429
- ignore_errors: if `True`, will return silently instead of throwing an exception if the directory
430
- does not exist.
775
+ if_not_exists: Directive regarding how to handle if the path does not exist.
776
+ Must be one of the following:
777
+
778
+ - `'error'`: raise an error
779
+ - `'ignore'`: do nothing and return
431
780
 
432
781
  Raises:
433
- Error: If the path does not exist or does not designate a directory, or if the directory is not empty.
782
+ Error: If the path
783
+
784
+ - is invalid, or
785
+ - does not exist and `if_not_exists='error'`, or
786
+ - is not designate a directory, or
787
+ - is a direcotory but is not empty and `force=False`.
434
788
 
435
789
  Examples:
790
+ Remove a directory, if it exists and is empty:
436
791
  >>> pxt.drop_dir('my_dir')
437
792
 
438
793
  Remove a subdirectory:
439
794
 
440
795
  >>> pxt.drop_dir('my_dir.sub_dir')
796
+
797
+ Remove an existing directory if it is empty, but do nothing if it does not exist:
798
+
799
+ >>> pxt.drop_dir('my_dir.sub_dir', if_not_exists='ignore')
800
+
801
+ Remove an existing directory and all its contents:
802
+
803
+ >>> pxt.drop_dir('my_dir', force=True)
804
+ """
805
+ path_obj = catalog.Path.parse(path) # validate format
806
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
807
+ Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
808
+
809
+
810
+ def ls(path: str = '') -> pd.DataFrame:
441
811
  """
812
+ List the contents of a Pixeltable directory.
813
+
814
+ This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
815
+ including various attributes such as version and base table, as appropriate.
816
+
817
+ To get a programmatic list of the directory's contents, use [get_dir_contents()][pixeltable.get_dir_contents]
818
+ instead.
819
+ """
820
+ from pixeltable.catalog import retry_loop
821
+ from pixeltable.metadata import schema
822
+
442
823
  cat = Catalog.get()
443
- path = catalog.Path(path_str)
444
-
445
- try:
446
- cat.paths.check_is_valid(path, expected=catalog.Dir)
447
- except Exception as e:
448
- if ignore_errors or force:
449
- _logger.info(f'Skipped directory `{path}` (does not exist).')
450
- return
451
- else:
452
- raise e
453
-
454
- children = cat.paths.get_children(path, child_type=None, recursive=True)
455
-
456
- if len(children) > 0 and not force:
457
- raise excs.Error(f'Directory `{path_str}` is not empty.')
458
-
459
- for child in children:
460
- assert isinstance(child, catalog.Path)
461
- # We need to check that the child is still in `cat.paths`, since it is possible it was
462
- # already deleted as a dependent of a preceding child in the iteration.
463
- try:
464
- obj = cat.paths[child]
465
- except excs.Error:
466
- continue
467
- if isinstance(obj, catalog.Dir):
468
- drop_dir(str(child), force=True)
469
- else:
470
- assert isinstance(obj, catalog.Table)
471
- assert not obj._is_dropped # else it should have been removed from `cat.paths` already
472
- drop_table(str(child), force=True)
824
+ path_obj = catalog.Path.parse(path, allow_empty_path=True)
825
+ dir_entries = cat.get_dir_contents(path_obj)
826
+
827
+ @retry_loop(for_write=False)
828
+ def op() -> list[list[str]]:
829
+ rows: list[list[str]] = []
830
+ for name, entry in dir_entries.items():
831
+ if name.startswith('_'):
832
+ continue
833
+ if entry.dir is not None:
834
+ kind = 'dir'
835
+ version = ''
836
+ base = ''
837
+ else:
838
+ assert entry.table is not None
839
+ assert isinstance(entry.table, schema.Table)
840
+ tbl = cat.get_table_by_id(entry.table.id)
841
+ md = tbl.get_metadata()
842
+ base = md['base'] or ''
843
+ if base.startswith('_'):
844
+ base = '<anonymous base table>'
845
+ if md['is_replica']:
846
+ kind = 'replica'
847
+ elif md['is_snapshot']:
848
+ kind = 'snapshot'
849
+ elif md['is_view']:
850
+ kind = 'view'
851
+ else:
852
+ kind = 'table'
853
+ version = '' if kind == 'snapshot' else str(md['version'])
854
+ rows.append([name, kind, version, base])
855
+ return rows
856
+
857
+ rows = op()
858
+
859
+ rows = sorted(rows, key=lambda x: x[0])
860
+ df = pd.DataFrame(
861
+ {
862
+ 'Name': [row[0] for row in rows],
863
+ 'Kind': [row[1] for row in rows],
864
+ 'Version': [row[2] for row in rows],
865
+ 'Base': [row[3] for row in rows],
866
+ },
867
+ index=([''] * len(rows)),
868
+ )
869
+ return df
870
+
871
+
872
+ def _extract_paths(
873
+ dir_entries: dict[str, Catalog.DirEntry], parent: catalog.Path, entry_type: type[catalog.SchemaObject] | None = None
874
+ ) -> list[catalog.Path]:
875
+ """Convert nested dir_entries structure to a flattened list of paths."""
876
+ matches: list[str]
877
+ if entry_type is None:
878
+ matches = list(dir_entries.keys())
879
+ elif entry_type is catalog.Dir:
880
+ matches = [name for name, entry in dir_entries.items() if entry.dir is not None]
881
+ else:
882
+ matches = [name for name, entry in dir_entries.items() if entry.table is not None]
883
+
884
+ # Filter out system paths
885
+ matches = [name for name in matches if catalog.is_valid_identifier(name)]
886
+ result = [parent.append(name) for name in matches]
473
887
 
474
- with Env.get().engine.begin() as conn:
475
- dir = Catalog.get().paths[path]
476
- conn.execute(sql.delete(schema.Dir.__table__).where(schema.Dir.id == dir._id))
477
- del Catalog.get().paths[path]
478
- _logger.info(f'Removed directory `{path_str}`.')
888
+ for name, entry in dir_entries.items():
889
+ if len(entry.dir_entries) > 0 and catalog.is_valid_identifier(name):
890
+ result.extend(_extract_paths(entry.dir_entries, parent=parent.append(name), entry_type=entry_type))
891
+ return result
479
892
 
480
893
 
481
- def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
894
+ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
482
895
  """List the directories in a directory.
483
896
 
484
897
  Args:
485
- path_str: Name or path of the directory.
898
+ path: Name or path of the directory.
486
899
  recursive: If `True`, lists all descendants of this directory recursively.
487
900
 
488
901
  Returns:
@@ -495,9 +908,10 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
495
908
  >>> cl.list_dirs('my_dir', recursive=True)
496
909
  ['my_dir', 'my_dir.sub_dir1']
497
910
  """
498
- path = catalog.Path(path_str, empty_is_valid=True)
499
- Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
500
- return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Dir, recursive=recursive)]
911
+ path_obj = catalog.Path.parse(path, allow_empty_path=True) # validate format
912
+ cat = Catalog.get()
913
+ contents = cat.get_dir_contents(path_obj, recursive=recursive)
914
+ return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
501
915
 
502
916
 
503
917
  def list_functions() -> Styler:
@@ -510,7 +924,9 @@ def list_functions() -> Styler:
510
924
  paths = ['.'.join(f.self_path.split('.')[:-1]) for f in functions]
511
925
  names = [f.name for f in functions]
512
926
  params = [
513
- ', '.join([param_name + ': ' + str(param_type) for param_name, param_type in f.signature.parameters.items()])
927
+ ', '.join(
928
+ [param_name + ': ' + str(param_type) for param_name, param_type in f.signatures[0].parameters.items()]
929
+ )
514
930
  for f in functions
515
931
  ]
516
932
  pd_df = pd.DataFrame(
@@ -518,21 +934,74 @@ def list_functions() -> Styler:
518
934
  'Path': paths,
519
935
  'Function Name': names,
520
936
  'Parameters': params,
521
- 'Return Type': [str(f.signature.get_return_type()) for f in functions],
937
+ 'Return Type': [str(f.signatures[0].get_return_type()) for f in functions],
522
938
  }
523
939
  )
524
940
  pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
525
- [dict(selector='th', props=[('text-align', 'center')])]
941
+ [{'selector': 'th', 'props': [('text-align', 'center')]}]
526
942
  ) # center-align headings
527
943
  return pd_df.hide(axis='index')
528
944
 
529
945
 
946
+ def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
947
+ """
948
+ Specifies a collection of UDFs to be used as LLM tools. Pixeltable allows any UDF to be used as an input into an
949
+ LLM tool-calling API. To use one or more UDFs as tools, wrap them in a `pxt.tools` call and pass the return value
950
+ to an LLM API.
951
+
952
+ The UDFs can be specified directly or wrapped inside a [pxt.tool()][pixeltable.tool] invocation. If a UDF is
953
+ specified directly, the tool name will be the (unqualified) UDF name, and the tool description will consist of the
954
+ entire contents of the UDF docstring. If a UDF is wrapped in a `pxt.tool()` invocation, then the name and/or
955
+ description may be customized.
956
+
957
+ Args:
958
+ args: The UDFs to use as tools.
959
+
960
+ Returns:
961
+ A `Tools` instance that can be passed to an LLM tool-calling API or invoked to generate tool results.
962
+
963
+ Examples:
964
+ Create a tools instance with a single UDF:
965
+
966
+ >>> tools = pxt.tools(stock_price)
967
+
968
+ Create a tools instance with several UDFs:
969
+
970
+ >>> tools = pxt.tools(stock_price, weather_quote)
971
+
972
+ Create a tools instance, some of whose UDFs have customized metadata:
973
+
974
+ >>> tools = pxt.tools(
975
+ ... stock_price,
976
+ ... pxt.tool(weather_quote, description='Returns information about the weather in a particular location.'),
977
+ ... pxt.tool(traffic_quote, name='traffic_conditions'),
978
+ ... )
979
+ """
980
+ return func.tools.Tools(tools=[arg if isinstance(arg, func.tools.Tool) else tool(arg) for arg in args])
981
+
982
+
983
+ def tool(fn: func.Function, name: str | None = None, description: str | None = None) -> func.tools.Tool:
984
+ """
985
+ Specifies a Pixeltable UDF to be used as an LLM tool with customizable metadata. See the documentation for
986
+ [pxt.tools()][pixeltable.tools] for more details.
987
+
988
+ Args:
989
+ fn: The UDF to use as a tool.
990
+ name: The name of the tool. If not specified, then the unqualified name of the UDF will be used by default.
991
+ description: The description of the tool. If not specified, then the entire contents of the UDF docstring
992
+ will be used by default.
993
+
994
+ Returns:
995
+ A `Tool` instance that can be passed to an LLM tool-calling API.
996
+ """
997
+ if isinstance(fn, func.AggregateFunction):
998
+ raise excs.Error('Aggregator UDFs cannot be used as tools')
999
+
1000
+ return func.tools.Tool(fn=fn, name=name, description=description)
1001
+
1002
+
530
1003
  def configure_logging(
531
- *,
532
- to_stdout: Optional[bool] = None,
533
- level: Optional[int] = None,
534
- add: Optional[str] = None,
535
- remove: Optional[str] = None,
1004
+ *, to_stdout: bool | None = None, level: int | None = None, add: str | None = None, remove: str | None = None
536
1005
  ) -> None:
537
1006
  """Configure logging.
538
1007
 
@@ -546,4 +1015,15 @@ def configure_logging(
546
1015
 
547
1016
 
548
1017
  def array(elements: Iterable) -> exprs.Expr:
549
- return exprs.InlineArray(elements)
1018
+ return exprs.Expr.from_array(elements)
1019
+
1020
+
1021
+ class DirContents(TypedDict):
1022
+ """
1023
+ Represents the contents of a Pixeltable directory.
1024
+ """
1025
+
1026
+ dirs: list[str]
1027
+ """List of directory paths contained in this directory."""
1028
+ tables: list[str]
1029
+ """List of table paths contained in this directory."""