pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
@@ -41,8 +41,21 @@ class TableVersionHandle:
41
41
  def get(self) -> TableVersion:
42
42
  from .catalog import Catalog
43
43
 
44
- if self._tbl_version is None:
45
- self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
44
+ cat = Catalog.get()
45
+ if self._tbl_version is None or not self._tbl_version.is_validated:
46
+ if self.effective_version is not None and self._tbl_version is not None:
47
+ # this is a snapshot version; we need to make sure we refer to the instance cached
48
+ # in Catalog, in order to avoid mixing sa_tbl instances in the same transaction
49
+ # (which will lead to duplicates in the From clause generated in SqlNode.create_from_clause())
50
+ assert (self.id, self.effective_version) in cat._tbl_versions
51
+ self._tbl_version = cat._tbl_versions[self.id, self.effective_version]
52
+ self._tbl_version.is_validated = True
53
+ else:
54
+ self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
55
+ if self.effective_version is None:
56
+ # make sure we don't see a discarded instance of a live TableVersion
57
+ tvs = list(Catalog.get()._tbl_versions.values())
58
+ assert self._tbl_version in tvs
46
59
  return self._tbl_version
47
60
 
48
61
  def as_dict(self) -> dict:
@@ -4,9 +4,11 @@ import logging
4
4
  from typing import Optional
5
5
  from uuid import UUID
6
6
 
7
+ from pixeltable.env import Env
7
8
  from pixeltable.metadata import schema
8
9
 
9
10
  from .column import Column
11
+ from .table_version import TableVersion
10
12
  from .table_version_handle import TableVersionHandle
11
13
 
12
14
  _logger = logging.getLogger('pixeltable')
@@ -22,15 +24,28 @@ class TableVersionPath:
22
24
 
23
25
  TableVersionPath contains all metadata needed to execute queries and updates against a particular version of a
24
26
  table/view.
27
+
28
+ TableVersionPath supplies metadata needed for query construction (eg, column names), for which it uses a
29
+ cached TableVersion instance.
30
+ - when running inside a transaction, this instance is guaranteed to be validated
31
+ - when running outside a transaction, we use an unvalidated instance in order to avoid repeated validation
32
+ on every metadata-related method call (the instance won't stay validated, because TableVersionHandle.get()
33
+ runs a local transaction, at the end of which the instance is again invalidated)
34
+ - supplying metadata from an unvalidated instance is okay, because it needs to get revalidated anyway when a
35
+ query actually runs (at which point there is a transaction context) - there is no guarantee that in between
36
+ constructing a DataFrame and executing it, the underlying table schema hasn't changed (eg, a concurrent process
37
+ could have dropped a column referenced in the query).
25
38
  """
26
39
 
27
40
  tbl_version: TableVersionHandle
28
41
  base: Optional[TableVersionPath]
42
+ _cached_tbl_version: Optional[TableVersion]
29
43
 
30
44
  def __init__(self, tbl_version: TableVersionHandle, base: Optional[TableVersionPath] = None):
31
45
  assert tbl_version is not None
32
46
  self.tbl_version = tbl_version
33
47
  self.base = base
48
+ self._cached_tbl_version = None
34
49
 
35
50
  @classmethod
36
51
  def from_md(cls, path: schema.TableVersionPath) -> TableVersionPath:
@@ -47,17 +62,40 @@ class TableVersionPath:
47
62
  result.extend(self.base.as_md())
48
63
  return result
49
64
 
65
+ def refresh_cached_md(self) -> None:
66
+ from pixeltable.catalog import Catalog
67
+
68
+ if Env.get().in_xact:
69
+ # when we're running inside a transaction, we need to make sure to supply current metadata;
70
+ # mixing stale metadata with current metadata leads to query construction failures
71
+ # (multiple sqlalchemy Table instances for the same underlying table create corrupted From clauses)
72
+ if self._cached_tbl_version is not None and self._cached_tbl_version.is_validated:
73
+ # nothing to refresh
74
+ return
75
+ elif self._cached_tbl_version is not None:
76
+ return
77
+
78
+ with Catalog.get().begin_xact(for_write=False):
79
+ self._cached_tbl_version = self.tbl_version.get()
80
+
81
+ def clear_cached_md(self) -> None:
82
+ self._cached_tbl_version = None
83
+ if self.base is not None:
84
+ self.base.clear_cached_md()
85
+
50
86
  def tbl_id(self) -> UUID:
51
87
  """Return the id of the table/view that this path represents"""
52
88
  return self.tbl_version.id
53
89
 
54
90
  def version(self) -> int:
55
91
  """Return the version of the table/view that this path represents"""
56
- return self.tbl_version.get().version
92
+ self.refresh_cached_md()
93
+ return self._cached_tbl_version.version
57
94
 
58
95
  def tbl_name(self) -> str:
59
96
  """Return the name of the table/view that this path represents"""
60
- return self.tbl_version.get().name
97
+ self.refresh_cached_md()
98
+ return self._cached_tbl_version.name
61
99
 
62
100
  def path_len(self) -> int:
63
101
  """Return the length of the path"""
@@ -65,18 +103,22 @@ class TableVersionPath:
65
103
 
66
104
  def is_snapshot(self) -> bool:
67
105
  """Return True if this is a path of snapshot versions"""
68
- if not self.tbl_version.get().is_snapshot:
106
+ self.refresh_cached_md()
107
+ if not self._cached_tbl_version.is_snapshot:
69
108
  return False
70
109
  return self.base.is_snapshot() if self.base is not None else True
71
110
 
72
111
  def is_view(self) -> bool:
73
- return self.tbl_version.get().is_view
112
+ self.refresh_cached_md()
113
+ return self._cached_tbl_version.is_view
74
114
 
75
115
  def is_component_view(self) -> bool:
76
- return self.tbl_version.get().is_component_view
116
+ self.refresh_cached_md()
117
+ return self._cached_tbl_version.is_component_view
77
118
 
78
119
  def is_insertable(self) -> bool:
79
- return self.tbl_version.get().is_insertable()
120
+ self.refresh_cached_md()
121
+ return self._cached_tbl_version.is_insertable
80
122
 
81
123
  def get_tbl_versions(self) -> list[TableVersionHandle]:
82
124
  """Return all tbl versions"""
@@ -100,11 +142,12 @@ class TableVersionPath:
100
142
 
101
143
  def columns(self) -> list[Column]:
102
144
  """Return all user columns visible in this tbl version path, including columns from bases"""
103
- result = list(self.tbl_version.get().cols_by_name.values())
104
- if self.base is not None and self.tbl_version.get().include_base_columns:
145
+ self.refresh_cached_md()
146
+ result = list(self._cached_tbl_version.cols_by_name.values())
147
+ if self.base is not None and self._cached_tbl_version.include_base_columns:
105
148
  base_cols = self.base.columns()
106
149
  # we only include base columns that don't conflict with one of our column names
107
- result.extend(c for c in base_cols if c.name not in self.tbl_version.get().cols_by_name)
150
+ result.extend(c for c in base_cols if c.name not in self._cached_tbl_version.cols_by_name)
108
151
  return result
109
152
 
110
153
  def cols_by_name(self) -> dict[str, Column]:
@@ -119,19 +162,21 @@ class TableVersionPath:
119
162
 
120
163
  def get_column(self, name: str, include_bases: Optional[bool] = None) -> Optional[Column]:
121
164
  """Return the column with the given name, or None if not found"""
122
- col = self.tbl_version.get().cols_by_name.get(name)
165
+ self.refresh_cached_md()
166
+ col = self._cached_tbl_version.cols_by_name.get(name)
123
167
  if col is not None:
124
168
  return col
125
- elif self.base is not None and (include_bases or self.tbl_version.get().include_base_columns):
169
+ elif self.base is not None and (include_bases or self._cached_tbl_version.include_base_columns):
126
170
  return self.base.get_column(name)
127
171
  else:
128
172
  return None
129
173
 
130
174
  def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
131
175
  """Return the column for the given tbl/col id"""
176
+ self.refresh_cached_md()
132
177
  if self.tbl_version.id == tbl_id:
133
- assert col_id in self.tbl_version.get().cols_by_id
134
- return self.tbl_version.get().cols_by_id[col_id]
178
+ assert col_id in self._cached_tbl_version.cols_by_id
179
+ return self._cached_tbl_version.cols_by_id[col_id]
135
180
  elif self.base is not None:
136
181
  return self.base.get_column_by_id(tbl_id, col_id)
137
182
  else:
@@ -139,11 +184,12 @@ class TableVersionPath:
139
184
 
140
185
  def has_column(self, col: Column, include_bases: bool = True) -> bool:
141
186
  """Return True if this table has the given column."""
187
+ self.refresh_cached_md()
142
188
  assert col.tbl is not None
143
189
  if (
144
190
  col.tbl.id == self.tbl_version.id
145
191
  and col.tbl.effective_version == self.tbl_version.effective_version
146
- and col.id in self.tbl_version.get().cols_by_id
192
+ and col.id in self._cached_tbl_version.cols_by_id
147
193
  ):
148
194
  # the column is visible in this table version
149
195
  return True
@@ -12,6 +12,10 @@ from pixeltable import catalog, exprs, func
12
12
  from pixeltable.env import Env
13
13
  from pixeltable.iterators import ComponentIterator
14
14
 
15
+ if TYPE_CHECKING:
16
+ from pixeltable.plan import SampleClause
17
+
18
+
15
19
  from .column import Column
16
20
  from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
17
21
  from .table import Table
@@ -66,6 +70,7 @@ class View(Table):
66
70
  select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
67
71
  additional_columns: dict[str, Any],
68
72
  predicate: Optional['exprs.Expr'],
73
+ sample_clause: Optional['SampleClause'],
69
74
  is_snapshot: bool,
70
75
  num_retained_versions: int,
71
76
  comment: str,
@@ -73,6 +78,8 @@ class View(Table):
73
78
  iterator_cls: Optional[type[ComponentIterator]],
74
79
  iterator_args: Optional[dict],
75
80
  ) -> View:
81
+ from pixeltable.plan import SampleClause
82
+
76
83
  # Convert select_list to more additional_columns if present
77
84
  include_base_columns: bool = select_list is None
78
85
  select_list_columns: List[Column] = []
@@ -84,12 +91,23 @@ class View(Table):
84
91
  columns = select_list_columns + columns_from_additional_columns
85
92
  cls._verify_schema(columns)
86
93
 
87
- # verify that filter can be evaluated in the context of the base
94
+ # verify that filters can be evaluated in the context of the base
88
95
  if predicate is not None:
89
96
  if not predicate.is_bound_by([base]):
90
97
  raise excs.Error(f'Filter cannot be computed in the context of the base {base.tbl_name()}')
91
98
  # create a copy that we can modify and store
92
99
  predicate = predicate.copy()
100
+ if sample_clause is not None:
101
+ # make sure that the sample clause can be computed in the context of the base
102
+ if sample_clause.stratify_exprs is not None and not all(
103
+ stratify_expr.is_bound_by([base]) for stratify_expr in sample_clause.stratify_exprs
104
+ ):
105
+ raise excs.Error(f'Sample clause cannot be computed in the context of the base {base.tbl_name()}')
106
+ # create a copy that we can modify and store
107
+ sc = sample_clause
108
+ sample_clause = SampleClause(
109
+ sc.version, sc.n, sc.n_per_stratum, sc.fraction, sc.seed, sc.stratify_exprs.copy()
110
+ )
93
111
 
94
112
  # same for value exprs
95
113
  for col in columns:
@@ -160,6 +178,8 @@ class View(Table):
160
178
  # if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
161
179
  if is_snapshot:
162
180
  predicate = predicate.retarget(base_version_path) if predicate is not None else None
181
+ if sample_clause is not None:
182
+ exprs.Expr.retarget_list(sample_clause.stratify_exprs, base_version_path)
163
183
  iterator_args_expr = (
164
184
  iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
165
185
  )
@@ -171,6 +191,7 @@ class View(Table):
171
191
  is_snapshot=is_snapshot,
172
192
  include_base_columns=include_base_columns,
173
193
  predicate=predicate.as_dict() if predicate is not None else None,
194
+ sample_clause=sample_clause.as_dict() if sample_clause is not None else None,
174
195
  base_versions=base_version_path.as_md(),
175
196
  iterator_class_fqn=iterator_class_fqn,
176
197
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
@@ -204,8 +225,17 @@ class View(Table):
204
225
 
205
226
  from pixeltable.plan import Planner
206
227
 
207
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
208
- num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
228
+ try:
229
+ plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
230
+ num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
231
+ except:
232
+ # we need to remove the orphaned TableVersion instance
233
+ del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
234
+ base_tbl_version = base.tbl_version.get()
235
+ if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
236
+ # also remove tbl_version from the base
237
+ base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
238
+ raise
209
239
  Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
210
240
 
211
241
  session.commit()
@@ -285,16 +315,18 @@ class View(Table):
285
315
 
286
316
  def _table_descriptor(self) -> str:
287
317
  display_name = 'Snapshot' if self._snapshot_only else 'View'
288
- result = [f'{display_name} {self._path!r}']
318
+ result = [f'{display_name} {self._path()!r}']
289
319
  bases_descrs: list[str] = []
290
320
  for base, effective_version in zip(self._base_tables, self._effective_base_versions):
291
321
  if effective_version is None:
292
- bases_descrs.append(f'{base._path!r}')
322
+ bases_descrs.append(f'{base._path()!r}')
293
323
  else:
294
- base_descr = f'{base._path}:{effective_version}'
324
+ base_descr = f'{base._path()}:{effective_version}'
295
325
  bases_descrs.append(f'{base_descr!r}')
296
326
  result.append(f' (of {", ".join(bases_descrs)})')
297
327
 
298
328
  if self._tbl_version.get().predicate is not None:
299
329
  result.append(f'\nWhere: {self._tbl_version.get().predicate!s}')
330
+ if self._tbl_version.get().sample_clause is not None:
331
+ result.append(f'\nSample: {self._tbl_version.get().sample_clause!s}')
300
332
  return ''.join(result)