pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -2,25 +2,28 @@ from __future__ import annotations
2
2
 
3
3
  import inspect
4
4
  import logging
5
- from typing import TYPE_CHECKING, Any, List, Literal, Optional
5
+ from typing import TYPE_CHECKING, Any, List, Literal
6
6
  from uuid import UUID
7
7
 
8
8
  import pixeltable.exceptions as excs
9
9
  import pixeltable.metadata.schema as md_schema
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import catalog, exprs, func
12
- from pixeltable.env import Env
13
12
  from pixeltable.iterators import ComponentIterator
14
13
 
15
14
  from .column import Column
16
- from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
15
+ from .globals import _POS_COLUMN_NAME, MediaValidation
17
16
  from .table import Table
18
- from .table_version import TableVersion
17
+ from .table_version import TableVersion, TableVersionKey, TableVersionMd
19
18
  from .table_version_handle import TableVersionHandle
20
19
  from .table_version_path import TableVersionPath
20
+ from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
21
+ from .update_status import UpdateStatus
21
22
 
22
23
  if TYPE_CHECKING:
24
+ from pixeltable.catalog.table import TableMetadata
23
25
  from pixeltable.globals import TableDataSource
26
+ from pixeltable.plan import SampleClause
24
27
 
25
28
  _logger = logging.getLogger('pixeltable')
26
29
 
@@ -37,21 +40,28 @@ class View(Table):
37
40
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath, snapshot_only: bool):
38
41
  super().__init__(id, dir_id, name, tbl_version_path)
39
42
  self._snapshot_only = snapshot_only
43
+ if not snapshot_only:
44
+ self._tbl_version = tbl_version_path.tbl_version
45
+
46
+ def _display_name(self) -> str:
47
+ if self._tbl_version_path.is_replica():
48
+ return 'replica'
49
+ if self._tbl_version_path.is_snapshot():
50
+ return 'snapshot'
51
+ if self._tbl_version_path.is_view():
52
+ return 'view'
53
+ return 'table'
40
54
 
41
55
  @classmethod
42
- def _display_name(cls) -> str:
43
- return 'view'
44
-
45
- @classmethod
46
- def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
56
+ def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, str | None]]) -> dict[str, dict]:
47
57
  """Returns a list of columns in the same format as the additional_columns parameter of View.create.
48
- The source is the list of expressions from a select() statement on a DataFrame.
58
+ The source is the list of expressions from a select() statement on a Query.
49
59
  If the column is a ColumnRef to a base table column, it is marked to not be stored.
50
60
  """
51
- from pixeltable.dataframe import DataFrame
61
+ from pixeltable._query import Query
52
62
 
53
63
  r: dict[str, dict] = {}
54
- exps, names = DataFrame._normalize_select_list([], select_list)
64
+ exps, names = Query._normalize_select_list([], select_list)
55
65
  for expr, name in zip(exps, names):
56
66
  stored = not isinstance(expr, exprs.ColumnRef)
57
67
  r[name] = {'value': expr, 'stored': stored}
@@ -63,16 +73,20 @@ class View(Table):
63
73
  dir_id: UUID,
64
74
  name: str,
65
75
  base: TableVersionPath,
66
- select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
76
+ select_list: list[tuple[exprs.Expr, str | None]] | None,
67
77
  additional_columns: dict[str, Any],
68
- predicate: Optional['exprs.Expr'],
78
+ predicate: 'exprs.Expr' | None,
79
+ sample_clause: 'SampleClause' | None,
69
80
  is_snapshot: bool,
81
+ create_default_idxs: bool,
70
82
  num_retained_versions: int,
71
83
  comment: str,
72
84
  media_validation: MediaValidation,
73
- iterator_cls: Optional[type[ComponentIterator]],
74
- iterator_args: Optional[dict],
75
- ) -> View:
85
+ iterator_cls: type[ComponentIterator] | None,
86
+ iterator_args: dict | None,
87
+ ) -> tuple[TableVersionMd, list[TableOp] | None]:
88
+ from pixeltable.plan import SampleClause
89
+
76
90
  # Convert select_list to more additional_columns if present
77
91
  include_base_columns: bool = select_list is None
78
92
  select_list_columns: List[Column] = []
@@ -84,12 +98,25 @@ class View(Table):
84
98
  columns = select_list_columns + columns_from_additional_columns
85
99
  cls._verify_schema(columns)
86
100
 
87
- # verify that filter can be evaluated in the context of the base
101
+ # verify that filters can be evaluated in the context of the base
88
102
  if predicate is not None:
89
103
  if not predicate.is_bound_by([base]):
90
- raise excs.Error(f'Filter cannot be computed in the context of the base {base.tbl_name()}')
104
+ raise excs.Error(f'View filter cannot be computed in the context of the base table {base.tbl_name()!r}')
91
105
  # create a copy that we can modify and store
92
106
  predicate = predicate.copy()
107
+ if sample_clause is not None:
108
+ # make sure that the sample clause can be computed in the context of the base
109
+ if sample_clause.stratify_exprs is not None and not all(
110
+ stratify_expr.is_bound_by([base]) for stratify_expr in sample_clause.stratify_exprs
111
+ ):
112
+ raise excs.Error(
113
+ f'View sample clause cannot be computed in the context of the base table {base.tbl_name()!r}'
114
+ )
115
+ # create a copy that we can modify and store
116
+ sc = sample_clause
117
+ sample_clause = SampleClause(
118
+ sc.version, sc.n, sc.n_per_stratum, sc.fraction, sc.seed, sc.stratify_exprs.copy()
119
+ )
93
120
 
94
121
  # same for value exprs
95
122
  for col in columns:
@@ -98,8 +125,8 @@ class View(Table):
98
125
  # make sure that the value can be computed in the context of the base
99
126
  if col.value_expr is not None and not col.value_expr.is_bound_by([base]):
100
127
  raise excs.Error(
101
- f'Column {col.name}: value expression cannot be computed in the context of the '
102
- f'base {base.tbl_name()}'
128
+ f'Column {col.name!r}: Value expression cannot be computed in the context of the '
129
+ f'base table {base.tbl_name()!r}'
103
130
  )
104
131
 
105
132
  if iterator_cls is not None:
@@ -126,18 +153,18 @@ class View(Table):
126
153
  sig = func.Signature(ts.InvalidType(), params)
127
154
 
128
155
  expr_args = {k: exprs.Expr.from_object(v) for k, v in bound_args.items()}
129
- sig.validate_args(expr_args, context=f'in iterator {iterator_cls.__name__!r}')
156
+ sig.validate_args(expr_args, context=f'in iterator of type `{iterator_cls.__name__}`')
130
157
  literal_args = {k: v.val if isinstance(v, exprs.Literal) else v for k, v in expr_args.items()}
131
158
 
132
159
  # prepend pos and output_schema columns to cols:
133
160
  # a component view exposes the pos column of its rowid;
134
161
  # we create that column here, so it gets assigned a column id;
135
162
  # stored=False: it is not stored separately (it's already stored as part of the rowid)
136
- iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
163
+ iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), is_iterator_col=True, stored=False)]
137
164
  output_dict, unstored_cols = iterator_cls.output_schema(**literal_args)
138
165
  iterator_cols.extend(
139
166
  [
140
- Column(col_name, col_type, stored=col_name not in unstored_cols)
167
+ Column(col_name, col_type, is_iterator_col=True, stored=col_name not in unstored_cols)
141
168
  for col_name, col_type in output_dict.items()
142
169
  ]
143
170
  )
@@ -146,11 +173,10 @@ class View(Table):
146
173
  for col in columns:
147
174
  if col.name in iterator_col_names:
148
175
  raise excs.Error(
149
- f'Duplicate name: column {col.name} is already present in the iterator output schema'
176
+ f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
150
177
  )
151
178
  columns = iterator_cols + columns
152
179
 
153
- session = Env.get().session
154
180
  from pixeltable.exprs import InlineDict
155
181
 
156
182
  iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
@@ -160,6 +186,8 @@ class View(Table):
160
186
  # if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
161
187
  if is_snapshot:
162
188
  predicate = predicate.retarget(base_version_path) if predicate is not None else None
189
+ if sample_clause is not None:
190
+ exprs.Expr.retarget_list(sample_clause.stratify_exprs, base_version_path)
163
191
  iterator_args_expr = (
164
192
  iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
165
193
  )
@@ -171,51 +199,43 @@ class View(Table):
171
199
  is_snapshot=is_snapshot,
172
200
  include_base_columns=include_base_columns,
173
201
  predicate=predicate.as_dict() if predicate is not None else None,
202
+ sample_clause=sample_clause.as_dict() if sample_clause is not None else None,
174
203
  base_versions=base_version_path.as_md(),
175
204
  iterator_class_fqn=iterator_class_fqn,
176
205
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
177
206
  )
178
207
 
179
- id, tbl_version = TableVersion.create(
180
- dir_id,
208
+ md = TableVersion.create_initial_md(
181
209
  name,
182
210
  columns,
183
211
  num_retained_versions,
184
212
  comment,
185
213
  media_validation=media_validation,
186
- # base_path=base_version_path,
187
214
  view_md=view_md,
215
+ create_default_idxs=create_default_idxs,
188
216
  )
189
- if tbl_version is None:
190
- # this is purely a snapshot: we use the base's tbl version path
191
- view = cls(id, dir_id, name, base_version_path, snapshot_only=True)
192
- _logger.info(f'created snapshot {name}')
217
+ if md.tbl_md.is_pure_snapshot:
218
+ # this is purely a snapshot: no store table to create or load
219
+ return md, None
193
220
  else:
194
- view = cls(
195
- id,
196
- dir_id,
197
- name,
198
- TableVersionPath(
199
- TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
221
+ tbl_id = md.tbl_md.tbl_id
222
+ key = TableVersionKey(UUID(tbl_id), 0 if is_snapshot else None, None)
223
+ view_path = TableVersionPath(TableVersionHandle(key), base=base_version_path)
224
+ ops = [
225
+ TableOp(
226
+ tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
200
227
  ),
201
- snapshot_only=False,
202
- )
203
- _logger.info(f'Created view `{name}`, id={tbl_version.id}')
204
-
205
- from pixeltable.plan import Planner
206
-
207
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
208
- num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
209
- Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
210
-
211
- session.commit()
212
- return view
228
+ TableOp(
229
+ tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
230
+ ),
231
+ ]
232
+ return md, ops
213
233
 
214
234
  @classmethod
215
235
  def _verify_column(cls, col: Column) -> None:
216
236
  # make sure that columns are nullable or have a default
217
237
  if not col.col_type.nullable and not col.is_computed:
218
- raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
238
+ raise excs.Error(f'Column {col.name!r}: Non-computed columns in views must be nullable')
219
239
  super()._verify_column(col)
220
240
 
221
241
  @classmethod
@@ -227,74 +247,98 @@ class View(Table):
227
247
  if tbl_version_path.is_snapshot():
228
248
  return tbl_version_path
229
249
  tbl_version = tbl_version_path.tbl_version.get()
230
- if not tbl_version.is_snapshot:
231
- # create and register snapshot version
232
- tbl_version = tbl_version.create_snapshot_copy()
233
- assert tbl_version.is_snapshot
250
+ assert not tbl_version.is_snapshot
234
251
 
235
252
  return TableVersionPath(
236
- TableVersionHandle(tbl_version.id, tbl_version.effective_version),
253
+ TableVersionHandle(TableVersionKey(tbl_version.id, tbl_version.version, None)),
237
254
  base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
238
255
  )
239
256
 
240
- def _drop(self) -> None:
241
- if self._snapshot_only:
242
- # there is not TableVersion to drop
243
- self._check_is_dropped()
244
- self.is_dropped = True
245
- catalog.Catalog.get().delete_tbl_md(self._id)
246
- else:
247
- super()._drop()
257
+ def _is_named_pure_snapshot(self) -> bool:
258
+ """
259
+ Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
260
+ """
261
+ return self._id != self._tbl_version_path.tbl_id
248
262
 
249
- def get_metadata(self) -> dict[str, Any]:
250
- md = super().get_metadata()
263
+ def _is_anonymous_snapshot(self) -> bool:
264
+ """
265
+ Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
266
+ """
267
+ return self._snapshot_only and self._id == self._tbl_version_path.tbl_id
268
+
269
+ def _get_metadata(self) -> 'TableMetadata':
270
+ md = super()._get_metadata()
251
271
  md['is_view'] = True
252
272
  md['is_snapshot'] = self._tbl_version_path.is_snapshot()
273
+ if self._is_anonymous_snapshot():
274
+ # Update name and path with version qualifiers.
275
+ md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
276
+ md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
277
+ base_tbl_id = self._base_tbl_id
278
+ if base_tbl_id is not None:
279
+ base_tbl = self._get_base_table()
280
+ base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
281
+ base_version = self._effective_base_versions[0]
282
+ md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
253
283
  return md
254
284
 
255
285
  def insert(
256
286
  self,
257
- source: Optional[TableDataSource] = None,
287
+ source: TableDataSource | None = None,
258
288
  /,
259
289
  *,
260
- source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
261
- schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
290
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
291
+ schema_overrides: dict[str, ts.ColumnType] | None = None,
262
292
  on_error: Literal['abort', 'ignore'] = 'abort',
263
293
  print_stats: bool = False,
264
294
  **kwargs: Any,
265
295
  ) -> UpdateStatus:
266
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
296
+ raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
267
297
 
268
- def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
269
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
298
+ def delete(self, where: exprs.Expr | None = None) -> UpdateStatus:
299
+ raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
270
300
 
271
301
  @property
272
- def _base_table(self) -> Optional['Table']:
273
- # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
274
- # for the snapshot itself)
275
- base_id = self._tbl_version.id if self._snapshot_only else self._tbl_version_path.base.tbl_version.id
276
- return catalog.Catalog.get().get_table_by_id(base_id)
302
+ def _base_tbl_id(self) -> UUID | None:
303
+ if self._tbl_version_path.tbl_id != self._id:
304
+ # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
305
+ # named pure snapshot.
306
+ return self._tbl_version_path.tbl_id
307
+ if self._tbl_version_path.base is None:
308
+ return None
309
+ return self._tbl_version_path.base.tbl_id
310
+
311
+ def _get_base_table(self) -> 'Table' | None:
312
+ """Returns None if there is no base table, or if the base table is hidden."""
313
+ base_tbl_id = self._base_tbl_id
314
+ if base_tbl_id is None:
315
+ return None
316
+ with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
317
+ return catalog.Catalog.get().get_table_by_id(base_tbl_id)
277
318
 
278
319
  @property
279
- def _effective_base_versions(self) -> list[Optional[int]]:
320
+ def _effective_base_versions(self) -> list[int | None]:
280
321
  effective_versions = [tv.effective_version for tv in self._tbl_version_path.get_tbl_versions()]
281
- if self._snapshot_only:
282
- return effective_versions
322
+ if self._snapshot_only and not self._is_anonymous_snapshot():
323
+ return effective_versions # Named pure snapshot
283
324
  else:
284
325
  return effective_versions[1:]
285
326
 
286
327
  def _table_descriptor(self) -> str:
287
- display_name = 'Snapshot' if self._snapshot_only else 'View'
288
- result = [f'{display_name} {self._path!r}']
328
+ result = [self._display_str()]
289
329
  bases_descrs: list[str] = []
290
- for base, effective_version in zip(self._base_tables, self._effective_base_versions):
330
+ for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
291
331
  if effective_version is None:
292
- bases_descrs.append(f'{base._path!r}')
332
+ bases_descrs.append(f'{base._path()!r}')
293
333
  else:
294
- base_descr = f'{base._path}:{effective_version}'
334
+ base_descr = f'{base._path()}:{effective_version}'
295
335
  bases_descrs.append(f'{base_descr!r}')
296
- result.append(f' (of {", ".join(bases_descrs)})')
297
-
298
- if self._tbl_version.get().predicate is not None:
299
- result.append(f'\nWhere: {self._tbl_version.get().predicate!s}')
336
+ if len(bases_descrs) > 0:
337
+ # bases_descrs can be empty in the case of a table-replica
338
+ result.append(f' (of {", ".join(bases_descrs)})')
339
+
340
+ if self._tbl_version_path.tbl_version.get().predicate is not None:
341
+ result.append(f'\nWhere: {self._tbl_version_path.tbl_version.get().predicate!s}')
342
+ if self._tbl_version_path.tbl_version.get().sample_clause is not None:
343
+ result.append(f'\nSample: {self._tbl_version.get().sample_clause!s}')
300
344
  return ''.join(result)
pixeltable/config.py CHANGED
@@ -4,11 +4,11 @@ import logging
4
4
  import os
5
5
  import shutil
6
6
  from pathlib import Path
7
- from typing import Any, ClassVar, Optional, TypeVar
7
+ from typing import Any, ClassVar, TypeVar
8
8
 
9
9
  import toml
10
10
 
11
- from pixeltable import exceptions as excs
11
+ from pixeltable import env, exceptions as excs
12
12
 
13
13
  _logger = logging.getLogger('pixeltable')
14
14
 
@@ -21,23 +21,30 @@ class Config:
21
21
  configuration values, which can be set in the config file or as environment variables.
22
22
  """
23
23
 
24
- __instance: ClassVar[Optional[Config]] = None
24
+ __instance: ClassVar[Config | None] = None
25
25
 
26
26
  __home: Path
27
27
  __config_file: Path
28
+ __config_overrides: dict[str, Any]
28
29
  __config_dict: dict[str, Any]
29
30
 
30
- def __init__(self) -> None:
31
+ def __init__(self, config_overrides: dict[str, Any]) -> None:
31
32
  assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
32
33
 
33
- self.__home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
34
+ for var in config_overrides:
35
+ if var not in KNOWN_CONFIG_OVERRIDES:
36
+ raise excs.Error(f'Unrecognized configuration variable: {var}')
37
+
38
+ self.__config_overrides = config_overrides
39
+
40
+ self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
34
41
  if self.__home.exists() and not self.__home.is_dir():
35
- raise RuntimeError(f'{self.__home} is not a directory')
42
+ raise excs.Error(f'Not a directory: {self.__home}')
36
43
  if not self.__home.exists():
37
44
  print(f'Creating a Pixeltable instance at: {self.__home}')
38
45
  self.__home.mkdir()
39
46
 
40
- self.__config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self.__home / 'config.toml')))
47
+ self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
41
48
 
42
49
  self.__config_dict: dict[str, Any]
43
50
  if os.path.isfile(self.__config_file):
@@ -46,6 +53,12 @@ class Config:
46
53
  self.__config_dict = toml.load(stream)
47
54
  except Exception as exc:
48
55
  raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
56
+ for section, section_dict in self.__config_dict.items():
57
+ if section not in KNOWN_CONFIG_OPTIONS:
58
+ raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
59
+ for key in section_dict:
60
+ if key not in KNOWN_CONFIG_OPTIONS[section]:
61
+ raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
49
62
  else:
50
63
  self.__config_dict = self.__create_default_config(self.__config_file)
51
64
  with open(self.__config_file, 'w', encoding='utf-8') as stream:
@@ -65,10 +78,22 @@ class Config:
65
78
 
66
79
  @classmethod
67
80
  def get(cls) -> Config:
68
- if cls.__instance is None:
69
- cls.__instance = cls()
81
+ cls.init({})
70
82
  return cls.__instance
71
83
 
84
+ @classmethod
85
+ def init(cls, config_overrides: dict[str, Any], reinit: bool = False) -> None:
86
+ if reinit:
87
+ cls.__instance = None
88
+ for cl in env._registered_clients.values():
89
+ cl.client_obj = None
90
+ if cls.__instance is None:
91
+ cls.__instance = cls(config_overrides)
92
+ elif len(config_overrides) > 0:
93
+ raise excs.Error(
94
+ 'Pixeltable has already been initialized; cannot specify new config values in the same session'
95
+ )
96
+
72
97
  @classmethod
73
98
  def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
74
99
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
@@ -76,28 +101,115 @@ class Config:
76
101
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
77
102
  return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
78
103
 
79
- def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
104
+ def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
105
+ override_var = f'{section}.{key}'
80
106
  env_var = f'{section.upper()}_{key.upper()}'
81
- if env_var in os.environ:
82
- value = os.environ[env_var]
83
- elif section in self.__config_dict and key in self.__config_dict[section]:
84
- value = self.__config_dict[section][key]
85
- else:
86
- return None
107
+ if override_var in self.__config_overrides:
108
+ return self.__config_overrides[override_var]
109
+ if env_var in os.environ and len(os.environ[env_var]) > 0:
110
+ return os.environ[env_var]
111
+ return default
112
+
113
+ def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> T | None:
114
+ value: Any = self.lookup_env(section, key) # Try to get from environment first
115
+ # Next try the config file
116
+ if value is None:
117
+ # Resolve nested section dicts
118
+ lookup_elems = [*section.split('.'), key]
119
+ value = self.__config_dict
120
+ for el in lookup_elems:
121
+ if isinstance(value, dict):
122
+ if el not in value:
123
+ return None
124
+ value = value[el]
125
+ else:
126
+ return None
127
+
128
+ if value is None:
129
+ return None # Not specified
87
130
 
88
131
  try:
132
+ if expected_type is bool and isinstance(value, str):
133
+ if value.lower() not in ('true', 'false'):
134
+ raise excs.Error(f"Invalid value for configuration parameter '{section}.{key}': {value}")
135
+ return value.lower() == 'true' # type: ignore[return-value]
89
136
  return expected_type(value) # type: ignore[call-arg]
90
- except ValueError as exc:
91
- raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
137
+ except (ValueError, TypeError) as exc:
138
+ raise excs.Error(f"Invalid value for configuration parameter '{section}.{key}': {value}") from exc
92
139
 
93
- def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
140
+ def get_string_value(self, key: str, section: str = 'pixeltable') -> str | None:
94
141
  return self.get_value(key, str, section)
95
142
 
96
- def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
143
+ def get_int_value(self, key: str, section: str = 'pixeltable') -> int | None:
97
144
  return self.get_value(key, int, section)
98
145
 
99
- def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
146
+ def get_float_value(self, key: str, section: str = 'pixeltable') -> float | None:
100
147
  return self.get_value(key, float, section)
101
148
 
102
- def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
149
+ def get_bool_value(self, key: str, section: str = 'pixeltable') -> bool | None:
103
150
  return self.get_value(key, bool, section)
151
+
152
+
153
+ KNOWN_CONFIG_OPTIONS = {
154
+ 'pixeltable': {
155
+ 'home': 'Path to the Pixeltable home directory',
156
+ 'config': 'Path to the Pixeltable config file',
157
+ 'pgdata': 'Path to the Pixeltable postgres data directory',
158
+ 'db': 'Postgres database name',
159
+ 'file_cache_size_g': 'Size of the file cache in GB',
160
+ 'time_zone': 'Default time zone for timestamps',
161
+ 'hide_warnings': 'Hide warnings from the console',
162
+ 'verbosity': 'Verbosity level for console output',
163
+ 'api_key': 'API key for Pixeltable cloud',
164
+ 'input_media_dest': 'Default destination URI for input media data',
165
+ 'output_media_dest': 'Default destination URI for output (computed) media data',
166
+ 'r2_profile': 'AWS config profile name used to access R2 storage',
167
+ 's3_profile': 'AWS config profile name used to access S3 storage',
168
+ 'b2_profile': 'AWS config profile name used to access Backblaze B2 storage',
169
+ 'tigris_profile': 'AWS config profile name used to access Tigris object storage',
170
+ },
171
+ 'anthropic': {'api_key': 'Anthropic API key'},
172
+ 'azure': {'storage_account_name': 'Azure storage account name', 'storage_account_key': 'Azure storage account key'},
173
+ 'bedrock': {'api_key': 'AWS Bedrock API key'},
174
+ 'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
175
+ 'fal': {'api_key': 'fal.ai API key', 'rate_limit': 'Rate limit for fal.ai API requests'},
176
+ 'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
177
+ 'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
178
+ 'hf': {'auth_token': 'Hugging Face access token'},
179
+ 'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
180
+ 'reve': {'api_key': 'Reve API key', 'rate_limit': 'Rate limit for Reve API requests (requests per minute)'},
181
+ 'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
182
+ 'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
183
+ 'mistral': {'api_key': 'Mistral API key', 'rate_limit': 'Rate limit for Mistral API requests'},
184
+ 'openai': {
185
+ 'api_key': 'OpenAI API key',
186
+ 'base_url': 'OpenAI API base URL',
187
+ 'api_version': 'API version if using Azure OpenAI',
188
+ 'rate_limits': 'Per-model rate limits for OpenAI API requests',
189
+ 'max_connections': 'Maximum number of concurrent OpenAI API connections that can be established',
190
+ 'max_keepalive_connections': 'Maximum number of keep-alive connections in the pool.'
191
+ ' Must not exceed max_connections.',
192
+ },
193
+ 'openrouter': {
194
+ 'api_key': 'OpenRouter API key',
195
+ 'site_url': 'Optional URL for your application (for OpenRouter analytics)',
196
+ 'app_name': 'Optional name for your application (for OpenRouter analytics)',
197
+ 'rate_limit': 'Rate limit for OpenRouter API requests',
198
+ },
199
+ 'replicate': {'api_token': 'Replicate API token'},
200
+ 'together': {
201
+ 'api_key': 'Together API key',
202
+ 'rate_limits': 'Per-model category rate limits for Together API requests',
203
+ },
204
+ 'twelvelabs': {'api_key': 'TwelveLabs API key', 'rate_limit': 'Rate limit for TwelveLabs API requests'},
205
+ 'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
206
+ 'voyage': {'api_key': 'Voyage AI API key', 'rate_limit': 'Rate limit for Voyage AI API requests'},
207
+ 'pypi': {'api_key': 'PyPI API key (for internal use only)'},
208
+ }
209
+
210
+
211
+ KNOWN_CONFIG_OVERRIDES = {
212
+ f'{section}.{key}': info
213
+ for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
214
+ for key, info in section_dict.items()
215
+ }