pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (78)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +358 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -16
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +2 -3
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/mistralai.py +16 -53
  46. pixeltable/functions/ollama.py +1 -1
  47. pixeltable/functions/openai.py +82 -165
  48. pixeltable/functions/string.py +212 -58
  49. pixeltable/functions/together.py +22 -80
  50. pixeltable/globals.py +10 -4
  51. pixeltable/index/base.py +5 -0
  52. pixeltable/index/btree.py +5 -0
  53. pixeltable/index/embedding_index.py +5 -0
  54. pixeltable/io/external_store.py +10 -31
  55. pixeltable/io/label_studio.py +5 -5
  56. pixeltable/io/parquet.py +2 -2
  57. pixeltable/io/table_data_conduit.py +1 -32
  58. pixeltable/metadata/__init__.py +11 -2
  59. pixeltable/metadata/converters/convert_13.py +2 -2
  60. pixeltable/metadata/converters/convert_30.py +6 -11
  61. pixeltable/metadata/converters/convert_35.py +9 -0
  62. pixeltable/metadata/converters/convert_36.py +38 -0
  63. pixeltable/metadata/converters/convert_37.py +15 -0
  64. pixeltable/metadata/converters/util.py +3 -9
  65. pixeltable/metadata/notes.py +3 -0
  66. pixeltable/metadata/schema.py +13 -1
  67. pixeltable/plan.py +135 -12
  68. pixeltable/share/packager.py +138 -14
  69. pixeltable/share/publish.py +2 -2
  70. pixeltable/store.py +19 -13
  71. pixeltable/type_system.py +30 -0
  72. pixeltable/utils/dbms.py +1 -1
  73. pixeltable/utils/formatter.py +64 -42
  74. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  75. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
  76. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  77. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  78. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -34,6 +34,10 @@ class TableVersionHandle:
34
34
  def __hash__(self) -> int:
35
35
  return hash((self.id, self.effective_version))
36
36
 
37
+ @property
38
+ def is_snapshot(self) -> bool:
39
+ return self.effective_version is not None
40
+
37
41
  @classmethod
38
42
  def create(cls, tbl_version: TableVersion) -> TableVersionHandle:
39
43
  return cls(tbl_version.id, tbl_version.effective_version, tbl_version)
@@ -41,8 +45,20 @@ class TableVersionHandle:
41
45
  def get(self) -> TableVersion:
42
46
  from .catalog import Catalog
43
47
 
44
- if self._tbl_version is None:
45
- self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
48
+ cat = Catalog.get()
49
+ if self._tbl_version is None or not self._tbl_version.is_validated:
50
+ if self.effective_version is not None and self._tbl_version is not None:
51
+ # this is a snapshot version; we need to make sure we refer to the instance cached
52
+ # in Catalog, in order to avoid mixing sa_tbl instances in the same transaction
53
+ # (which will lead to duplicates in the From clause generated in SqlNode.create_from_clause())
54
+ assert (self.id, self.effective_version) in cat._tbl_versions
55
+ self._tbl_version = cat._tbl_versions[self.id, self.effective_version]
56
+ self._tbl_version.is_validated = True
57
+ else:
58
+ self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
59
+ if self.effective_version is None:
60
+ tvs = list(Catalog.get()._tbl_versions.values())
61
+ assert self._tbl_version in tvs
46
62
  return self._tbl_version
47
63
 
48
64
  def as_dict(self) -> dict:
@@ -4,9 +4,12 @@ import logging
4
4
  from typing import Optional
5
5
  from uuid import UUID
6
6
 
7
+ from pixeltable.env import Env
7
8
  from pixeltable.metadata import schema
8
9
 
9
10
  from .column import Column
11
+ from .globals import MediaValidation
12
+ from .table_version import TableVersion
10
13
  from .table_version_handle import TableVersionHandle
11
14
 
12
15
  _logger = logging.getLogger('pixeltable')
@@ -22,15 +25,28 @@ class TableVersionPath:
22
25
 
23
26
  TableVersionPath contains all metadata needed to execute queries and updates against a particular version of a
24
27
  table/view.
28
+
29
+ TableVersionPath supplies metadata needed for query construction (eg, column names), for which it uses a
30
+ cached TableVersion instance.
31
+ - when running inside a transaction, this instance is guaranteed to be validated
32
+ - when running outside a transaction, we use an unvalidated instance in order to avoid repeated validation
33
+ on every metadata-related method call (the instance won't stay validated, because TableVersionHandle.get()
34
+ runs a local transaction, at the end of which the instance is again invalidated)
35
+ - supplying metadata from an unvalidated instance is okay, because it needs to get revalidated anyway when a
36
+ query actually runs (at which point there is a transaction context) - there is no guarantee that in between
37
+ constructing a DataFrame and executing it, the underlying table schema hasn't changed (eg, a concurrent process
38
+ could have dropped a column referenced in the query).
25
39
  """
26
40
 
27
41
  tbl_version: TableVersionHandle
28
42
  base: Optional[TableVersionPath]
43
+ _cached_tbl_version: Optional[TableVersion]
29
44
 
30
45
  def __init__(self, tbl_version: TableVersionHandle, base: Optional[TableVersionPath] = None):
31
46
  assert tbl_version is not None
32
47
  self.tbl_version = tbl_version
33
48
  self.base = base
49
+ self._cached_tbl_version = None
34
50
 
35
51
  @classmethod
36
52
  def from_md(cls, path: schema.TableVersionPath) -> TableVersionPath:
@@ -47,17 +63,46 @@ class TableVersionPath:
47
63
  result.extend(self.base.as_md())
48
64
  return result
49
65
 
66
+ def refresh_cached_md(self) -> None:
67
+ from pixeltable.catalog import Catalog
68
+
69
+ if Env.get().in_xact:
70
+ # when we're running inside a transaction, we need to make sure to supply current metadata;
71
+ # mixing stale metadata with current metadata leads to query construction failures
72
+ # (multiple sqlalchemy Table instances for the same underlying table create corrupted From clauses)
73
+ if self._cached_tbl_version is not None and self._cached_tbl_version.is_validated:
74
+ # nothing to refresh
75
+ return
76
+ elif self._cached_tbl_version is not None:
77
+ return
78
+
79
+ with Catalog.get().begin_xact(for_write=False):
80
+ self._cached_tbl_version = self.tbl_version.get()
81
+
82
+ def clear_cached_md(self) -> None:
83
+ self._cached_tbl_version = None
84
+ if self.base is not None:
85
+ self.base.clear_cached_md()
86
+
87
+ @property
50
88
  def tbl_id(self) -> UUID:
51
89
  """Return the id of the table/view that this path represents"""
52
90
  return self.tbl_version.id
53
91
 
54
92
  def version(self) -> int:
55
93
  """Return the version of the table/view that this path represents"""
56
- return self.tbl_version.get().version
94
+ self.refresh_cached_md()
95
+ return self._cached_tbl_version.version
96
+
97
+ def schema_version(self) -> int:
98
+ """Return the version of the table/view that this path represents"""
99
+ self.refresh_cached_md()
100
+ return self._cached_tbl_version.schema_version
57
101
 
58
102
  def tbl_name(self) -> str:
59
103
  """Return the name of the table/view that this path represents"""
60
- return self.tbl_version.get().name
104
+ self.refresh_cached_md()
105
+ return self._cached_tbl_version.name
61
106
 
62
107
  def path_len(self) -> int:
63
108
  """Return the length of the path"""
@@ -65,18 +110,39 @@ class TableVersionPath:
65
110
 
66
111
  def is_snapshot(self) -> bool:
67
112
  """Return True if this is a path of snapshot versions"""
68
- if not self.tbl_version.get().is_snapshot:
69
- return False
70
- return self.base.is_snapshot() if self.base is not None else True
113
+ return self.tbl_version.is_snapshot
71
114
 
72
115
  def is_view(self) -> bool:
73
- return self.tbl_version.get().is_view
116
+ self.refresh_cached_md()
117
+ return self._cached_tbl_version.is_view
74
118
 
75
119
  def is_component_view(self) -> bool:
76
- return self.tbl_version.get().is_component_view
120
+ self.refresh_cached_md()
121
+ return self._cached_tbl_version.is_component_view
122
+
123
+ def is_replica(self) -> bool:
124
+ self.refresh_cached_md()
125
+ return self._cached_tbl_version.is_replica
126
+
127
+ def is_mutable(self) -> bool:
128
+ self.refresh_cached_md()
129
+ return self._cached_tbl_version.is_mutable
77
130
 
78
131
  def is_insertable(self) -> bool:
79
- return self.tbl_version.get().is_insertable()
132
+ self.refresh_cached_md()
133
+ return self._cached_tbl_version.is_insertable
134
+
135
+ def comment(self) -> str:
136
+ self.refresh_cached_md()
137
+ return self._cached_tbl_version.comment
138
+
139
+ def num_retained_versions(self) -> int:
140
+ self.refresh_cached_md()
141
+ return self._cached_tbl_version.num_retained_versions
142
+
143
+ def media_validation(self) -> MediaValidation:
144
+ self.refresh_cached_md()
145
+ return self._cached_tbl_version.media_validation
80
146
 
81
147
  def get_tbl_versions(self) -> list[TableVersionHandle]:
82
148
  """Return all tbl versions"""
@@ -100,11 +166,12 @@ class TableVersionPath:
100
166
 
101
167
  def columns(self) -> list[Column]:
102
168
  """Return all user columns visible in this tbl version path, including columns from bases"""
103
- result = list(self.tbl_version.get().cols_by_name.values())
104
- if self.base is not None and self.tbl_version.get().include_base_columns:
169
+ self.refresh_cached_md()
170
+ result = list(self._cached_tbl_version.cols_by_name.values())
171
+ if self.base is not None and self._cached_tbl_version.include_base_columns:
105
172
  base_cols = self.base.columns()
106
173
  # we only include base columns that don't conflict with one of our column names
107
- result.extend(c for c in base_cols if c.name not in self.tbl_version.get().cols_by_name)
174
+ result.extend(c for c in base_cols if c.name not in self._cached_tbl_version.cols_by_name)
108
175
  return result
109
176
 
110
177
  def cols_by_name(self) -> dict[str, Column]:
@@ -119,19 +186,21 @@ class TableVersionPath:
119
186
 
120
187
  def get_column(self, name: str, include_bases: Optional[bool] = None) -> Optional[Column]:
121
188
  """Return the column with the given name, or None if not found"""
122
- col = self.tbl_version.get().cols_by_name.get(name)
189
+ self.refresh_cached_md()
190
+ col = self._cached_tbl_version.cols_by_name.get(name)
123
191
  if col is not None:
124
192
  return col
125
- elif self.base is not None and (include_bases or self.tbl_version.get().include_base_columns):
193
+ elif self.base is not None and (include_bases or self._cached_tbl_version.include_base_columns):
126
194
  return self.base.get_column(name)
127
195
  else:
128
196
  return None
129
197
 
130
198
  def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
131
199
  """Return the column for the given tbl/col id"""
200
+ self.refresh_cached_md()
132
201
  if self.tbl_version.id == tbl_id:
133
- assert col_id in self.tbl_version.get().cols_by_id
134
- return self.tbl_version.get().cols_by_id[col_id]
202
+ assert col_id in self._cached_tbl_version.cols_by_id
203
+ return self._cached_tbl_version.cols_by_id[col_id]
135
204
  elif self.base is not None:
136
205
  return self.base.get_column_by_id(tbl_id, col_id)
137
206
  else:
@@ -139,11 +208,12 @@ class TableVersionPath:
139
208
 
140
209
  def has_column(self, col: Column, include_bases: bool = True) -> bool:
141
210
  """Return True if this table has the given column."""
211
+ self.refresh_cached_md()
142
212
  assert col.tbl is not None
143
213
  if (
144
214
  col.tbl.id == self.tbl_version.id
145
215
  and col.tbl.effective_version == self.tbl_version.effective_version
146
- and col.id in self.tbl_version.get().cols_by_id
216
+ and col.id in self._cached_tbl_version.cols_by_id
147
217
  ):
148
218
  # the column is visible in this table version
149
219
  return True
@@ -12,6 +12,10 @@ from pixeltable import catalog, exprs, func
12
12
  from pixeltable.env import Env
13
13
  from pixeltable.iterators import ComponentIterator
14
14
 
15
+ if TYPE_CHECKING:
16
+ from pixeltable.plan import SampleClause
17
+
18
+
15
19
  from .column import Column
16
20
  from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
17
21
  from .table import Table
@@ -37,6 +41,8 @@ class View(Table):
37
41
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath, snapshot_only: bool):
38
42
  super().__init__(id, dir_id, name, tbl_version_path)
39
43
  self._snapshot_only = snapshot_only
44
+ if not snapshot_only:
45
+ self._tbl_version = tbl_version_path.tbl_version
40
46
 
41
47
  @classmethod
42
48
  def _display_name(cls) -> str:
@@ -66,6 +72,7 @@ class View(Table):
66
72
  select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
67
73
  additional_columns: dict[str, Any],
68
74
  predicate: Optional['exprs.Expr'],
75
+ sample_clause: Optional['SampleClause'],
69
76
  is_snapshot: bool,
70
77
  num_retained_versions: int,
71
78
  comment: str,
@@ -73,6 +80,8 @@ class View(Table):
73
80
  iterator_cls: Optional[type[ComponentIterator]],
74
81
  iterator_args: Optional[dict],
75
82
  ) -> View:
83
+ from pixeltable.plan import SampleClause
84
+
76
85
  # Convert select_list to more additional_columns if present
77
86
  include_base_columns: bool = select_list is None
78
87
  select_list_columns: List[Column] = []
@@ -84,12 +93,23 @@ class View(Table):
84
93
  columns = select_list_columns + columns_from_additional_columns
85
94
  cls._verify_schema(columns)
86
95
 
87
- # verify that filter can be evaluated in the context of the base
96
+ # verify that filters can be evaluated in the context of the base
88
97
  if predicate is not None:
89
98
  if not predicate.is_bound_by([base]):
90
99
  raise excs.Error(f'Filter cannot be computed in the context of the base {base.tbl_name()}')
91
100
  # create a copy that we can modify and store
92
101
  predicate = predicate.copy()
102
+ if sample_clause is not None:
103
+ # make sure that the sample clause can be computed in the context of the base
104
+ if sample_clause.stratify_exprs is not None and not all(
105
+ stratify_expr.is_bound_by([base]) for stratify_expr in sample_clause.stratify_exprs
106
+ ):
107
+ raise excs.Error(f'Sample clause cannot be computed in the context of the base {base.tbl_name()}')
108
+ # create a copy that we can modify and store
109
+ sc = sample_clause
110
+ sample_clause = SampleClause(
111
+ sc.version, sc.n, sc.n_per_stratum, sc.fraction, sc.seed, sc.stratify_exprs.copy()
112
+ )
93
113
 
94
114
  # same for value exprs
95
115
  for col in columns:
@@ -160,6 +180,8 @@ class View(Table):
160
180
  # if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
161
181
  if is_snapshot:
162
182
  predicate = predicate.retarget(base_version_path) if predicate is not None else None
183
+ if sample_clause is not None:
184
+ exprs.Expr.retarget_list(sample_clause.stratify_exprs, base_version_path)
163
185
  iterator_args_expr = (
164
186
  iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
165
187
  )
@@ -171,6 +193,7 @@ class View(Table):
171
193
  is_snapshot=is_snapshot,
172
194
  include_base_columns=include_base_columns,
173
195
  predicate=predicate.as_dict() if predicate is not None else None,
196
+ sample_clause=sample_clause.as_dict() if sample_clause is not None else None,
174
197
  base_versions=base_version_path.as_md(),
175
198
  iterator_class_fqn=iterator_class_fqn,
176
199
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
@@ -204,8 +227,17 @@ class View(Table):
204
227
 
205
228
  from pixeltable.plan import Planner
206
229
 
207
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
208
- num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
230
+ try:
231
+ plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
232
+ num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
233
+ except:
234
+ # we need to remove the orphaned TableVersion instance
235
+ del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
236
+ base_tbl_version = base.tbl_version.get()
237
+ if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
238
+ # also remove tbl_version from the base
239
+ base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
240
+ raise
209
241
  Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
210
242
 
211
243
  session.commit()
@@ -237,17 +269,8 @@ class View(Table):
237
269
  base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
238
270
  )
239
271
 
240
- def _drop(self) -> None:
241
- if self._snapshot_only:
242
- # there is not TableVersion to drop
243
- self._check_is_dropped()
244
- self.is_dropped = True
245
- catalog.Catalog.get().delete_tbl_md(self._id)
246
- else:
247
- super()._drop()
248
-
249
- def get_metadata(self) -> dict[str, Any]:
250
- md = super().get_metadata()
272
+ def _get_metadata(self) -> dict[str, Any]:
273
+ md = super()._get_metadata()
251
274
  md['is_view'] = True
252
275
  md['is_snapshot'] = self._tbl_version_path.is_snapshot()
253
276
  return md
@@ -268,11 +291,10 @@ class View(Table):
268
291
  def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
269
292
  raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
270
293
 
271
- @property
272
- def _base_table(self) -> Optional['Table']:
294
+ def _get_base_table(self) -> Optional['Table']:
273
295
  # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
274
296
  # for the snapshot itself)
275
- base_id = self._tbl_version.id if self._snapshot_only else self._tbl_version_path.base.tbl_version.id
297
+ base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
276
298
  return catalog.Catalog.get().get_table_by_id(base_id)
277
299
 
278
300
  @property
@@ -285,16 +307,18 @@ class View(Table):
285
307
 
286
308
  def _table_descriptor(self) -> str:
287
309
  display_name = 'Snapshot' if self._snapshot_only else 'View'
288
- result = [f'{display_name} {self._path!r}']
310
+ result = [f'{display_name} {self._path()!r}']
289
311
  bases_descrs: list[str] = []
290
- for base, effective_version in zip(self._base_tables, self._effective_base_versions):
312
+ for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
291
313
  if effective_version is None:
292
- bases_descrs.append(f'{base._path!r}')
314
+ bases_descrs.append(f'{base._path()!r}')
293
315
  else:
294
- base_descr = f'{base._path}:{effective_version}'
316
+ base_descr = f'{base._path()}:{effective_version}'
295
317
  bases_descrs.append(f'{base_descr!r}')
296
318
  result.append(f' (of {", ".join(bases_descrs)})')
297
319
 
298
- if self._tbl_version.get().predicate is not None:
299
- result.append(f'\nWhere: {self._tbl_version.get().predicate!s}')
320
+ if self._tbl_version_path.tbl_version.get().predicate is not None:
321
+ result.append(f'\nWhere: {self._tbl_version_path.tbl_version.get().predicate!s}')
322
+ if self._tbl_version_path.tbl_version.get().sample_clause is not None:
323
+ result.append(f'\nSample: {self._tbl_version.get().sample_clause!s}')
300
324
  return ''.join(result)