pixeltable 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
  # These version placeholders will be replaced during build.
- __version__ = '0.3.14'
- __version_tuple__ = (0, 3, 14)
+ __version__ = '0.3.15'
+ __version_tuple__ = (0, 3, 15)
pixeltable/catalog/table_version.py CHANGED
@@ -324,6 +324,7 @@ class TableVersion:
  @classmethod
  def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
      tbl_id = UUID(md.tbl_md.tbl_id)
+     _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
      view_md = md.tbl_md.view_md
      base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
      base = base_path.tbl_version if base_path is not None else None
@@ -331,6 +332,7 @@ class TableVersion:
          tbl_id, md.tbl_md, md.version_md.version, md.schema_version_md, [], base_path=base_path, base=base
      )
      tbl_version.store_tbl.create()
+     tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
      return tbl_version

  def drop(self) -> None:
pixeltable/catalog/table_version_path.py CHANGED
@@ -98,13 +98,6 @@ class TableVersionPath:
          return None
      return self.base.find_tbl_version(id)

- @property
- def ancestor_paths(self) -> list[TableVersionPath]:
-     if self.base is None:
-         return [self]
-     else:
-         return [self, *self.base.ancestor_paths]
-
  def columns(self) -> list[Column]:
      """Return all user columns visible in this tbl version path, including columns from bases"""
      result = list(self.tbl_version.get().cols_by_name.values())
pixeltable/functions/gemini.py CHANGED
@@ -5,10 +5,16 @@ first `pip install google-genai` and configure your Gemini credentials, as described in
  the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini) tutorial.
  """

+ import asyncio
+ import io
+ import tempfile
+ from pathlib import Path
  from typing import TYPE_CHECKING, Optional

+ import PIL.Image
+
  import pixeltable as pxt
- from pixeltable import env
+ from pixeltable import env, exceptions as excs, exprs

  if TYPE_CHECKING:
      from google import genai
@@ -27,23 +33,11 @@ def _genai_client() -> 'genai.client.Client':

  @pxt.udf(resource_pool='request-rate:gemini')
  async def generate_content(
-     contents: str,
-     *,
-     model: str,
-     candidate_count: Optional[int] = None,
-     stop_sequences: Optional[list[str]] = None,
-     max_output_tokens: Optional[int] = None,
-     temperature: Optional[float] = None,
-     top_p: Optional[float] = None,
-     top_k: Optional[int] = None,
-     response_mime_type: Optional[str] = None,
-     response_schema: Optional[dict] = None,
-     presence_penalty: Optional[float] = None,
-     frequency_penalty: Optional[float] = None,
+     contents: str, *, model: str, config: Optional[dict] = None, tools: Optional[list[dict]] = None
  ) -> dict:
      """
      Generate content from the specified model. For additional details, see:
-     <https://ai.google.dev/gemini-api/docs>
+     <https://ai.google.dev/gemini-api/docs/text-generation>

      Request throttling:
      Applies the rate limit set in the config (section `gemini`, key `rate_limit`). If no rate
@@ -56,38 +50,177 @@ async def generate_content(
      Args:
          contents: The input content to generate from.
          model: The name of the model to use.
-
-     For details on the other parameters, see: <https://ai.google.dev/gemini-api/docs>
+         config: Configuration for generation, corresponding to keyword arguments of
+             `genai.types.GenerateContentConfig`. For details on the parameters, see:
+             <https://googleapis.github.io/python-genai/genai.html#module-genai.types>
+         tools: Optional list of Pixeltable tools to use. It is also possible to specify tools manually via the
+             `config.tools` parameter, but at most one of `config.tools` or `tools` may be used.

      Returns:
          A dictionary containing the response and other metadata.

      Examples:
-         Add a computed column that applies the model `gemini-1.5-flash`
+         Add a computed column that applies the model `gemini-2.0-flash`
          to an existing Pixeltable column `tbl.prompt` of the table `tbl`:

-         >>> tbl.add_computed_column(response=generate_content(tbl.prompt, model='gemini-1.5-flash'))
+         >>> tbl.add_computed_column(response=generate_content(tbl.prompt, model='gemini-2.0-flash'))
      """
      env.Env.get().require_package('google.genai')
      from google.genai import types

-     config = types.GenerateContentConfig(
-         candidate_count=candidate_count,
-         stop_sequences=stop_sequences,
-         max_output_tokens=max_output_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         top_k=top_k,
-         response_mime_type=response_mime_type,
-         response_schema=response_schema,
-         presence_penalty=presence_penalty,
-         frequency_penalty=frequency_penalty,
-     )
-
-     response = await _genai_client().aio.models.generate_content(model=model, contents=contents, config=config)
+     config_: Optional[types.GenerateContentConfig]
+     if config is None and tools is None:
+         config_ = None
+     else:
+         if config is None:
+             config_ = types.GenerateContentConfig()
+         else:
+             config_ = types.GenerateContentConfig(**config)
+         if tools is not None:
+             gemini_tools = [__convert_pxt_tool(tool) for tool in tools]
+             config_.tools = [types.Tool(function_declarations=gemini_tools)]
+
+     response = await _genai_client().aio.models.generate_content(model=model, contents=contents, config=config_)
      return response.model_dump()


+ def __convert_pxt_tool(pxt_tool: dict) -> dict:
+     return {
+         'name': pxt_tool['name'],
+         'description': pxt_tool['description'],
+         'parameters': {
+             'type': 'object',
+             'properties': pxt_tool['parameters']['properties'],
+             'required': pxt_tool['required'],
+         },
+     }
+
+
+ def invoke_tools(tools: pxt.func.Tools, response: exprs.Expr) -> exprs.InlineDict:
+     """Converts a Gemini response dict to Pixeltable tool invocation format and calls `tools._invoke()`."""
+     return tools._invoke(_gemini_response_to_pxt_tool_calls(response))
+
+
+ @pxt.udf
+ def _gemini_response_to_pxt_tool_calls(response: dict) -> Optional[dict]:
+     pxt_tool_calls: dict[str, list[dict]] = {}
+     for part in response['candidates'][0]['content']['parts']:
+         tool_call = part.get('function_call')
+         if tool_call is not None:
+             tool_name = tool_call['name']
+             if tool_name not in pxt_tool_calls:
+                 pxt_tool_calls[tool_name] = []
+             pxt_tool_calls[tool_name].append({'args': tool_call['args']})
+     if len(pxt_tool_calls) == 0:
+         return None
+     return pxt_tool_calls
+
+
  @generate_content.resource_pool
  def _(model: str) -> str:
      return f'request-rate:gemini:{model}'
+
+
+ @pxt.udf(resource_pool='request-rate:imagen')
+ async def generate_images(prompt: str, *, model: str, config: Optional[dict] = None) -> PIL.Image.Image:
+     """
+     Generates images based on a text description and configuration. For additional details, see:
+     <https://ai.google.dev/gemini-api/docs/image-generation>
+
+     __Requirements:__
+
+     - `pip install google-genai`
+
+     Args:
+         prompt: A text description of the images to generate.
+         model: The model to use.
+         config: Configuration for generation, corresponding to keyword arguments of
+             `genai.types.GenerateImagesConfig`. For details on the parameters, see:
+             <https://googleapis.github.io/python-genai/genai.html#module-genai.types>
+
+     Returns:
+         The generated image.
+
+     Examples:
+         Add a computed column that applies the model `imagen-3.0-generate-002`
+         to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
+
+         >>> tbl.add_computed_column(response=generate_images(tbl.prompt, model='imagen-3.0-generate-002'))
+     """
+     env.Env.get().require_package('google.genai')
+     from google.genai.types import GenerateImagesConfig
+
+     config_ = GenerateImagesConfig(**config) if config else None
+     response = await _genai_client().aio.models.generate_images(model=model, prompt=prompt, config=config_)
+     return response.generated_images[0].image._pil_image
+
+
+ @generate_images.resource_pool
+ def _(model: str) -> str:
+     return f'request-rate:imagen:{model}'
+
+
+ @pxt.udf(resource_pool='request-rate:veo')
+ async def generate_videos(
+     prompt: Optional[str] = None, image: Optional[PIL.Image.Image] = None, *, model: str, config: Optional[dict] = None
+ ) -> pxt.Video:
+     """
+     Generates videos based on a text description and configuration. For additional details, see:
+     <https://ai.google.dev/gemini-api/docs/video-generation>
+
+     __Requirements:__
+
+     - `pip install google-genai`
+
+     Args:
+         prompt: A text description of the videos to generate.
+         image: An optional image to use as the first frame of the video. At least one of `prompt` or `image` must be
+             provided. (It is ok to specify both.)
+         model: The model to use.
+         config: Configuration for generation, corresponding to keyword arguments of
+             `genai.types.GenerateVideosConfig`. For details on the parameters, see:
+             <https://googleapis.github.io/python-genai/genai.html#module-genai.types>
+
+     Returns:
+         The generated video.
+
+     Examples:
+         Add a computed column that applies the model `veo-2.0-generate-001`
+         to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
+
+         >>> tbl.add_computed_column(response=generate_videos(tbl.prompt, model='veo-2.0-generate-001'))
+     """
+     env.Env.get().require_package('google.genai')
+     from google.genai import types
+
+     if prompt is None and image is None:
+         raise excs.Error('At least one of `prompt` or `image` must be provided.')
+
+     image_: Optional[types.Image] = None
+     if image is not None:
+         with io.BytesIO() as buffer:
+             image.save(buffer, format='jpeg')
+             image_ = types.Image(image_bytes=buffer.getvalue(), mime_type='image/jpeg')
+
+     config_ = types.GenerateVideosConfig(**config) if config else None
+     operation = await _genai_client().aio.models.generate_videos(
+         model=model, prompt=prompt, image=image_, config=config_
+     )
+     while not operation.done:
+         await asyncio.sleep(3)
+         operation = await _genai_client().aio.operations.get(operation)
+
+     video = operation.response.generated_videos[0]
+
+     video_bytes = await _genai_client().aio.files.download(file=video.video)  # type: ignore[arg-type]
+     assert video_bytes is not None
+
+     _, output_filename = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
+     Path(output_filename).write_bytes(video_bytes)
+     return output_filename
+
+
+ @generate_videos.resource_pool
+ def _(model: str) -> str:
+     return f'request-rate:veo:{model}'
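
Note on usage: `generate_content` now folds the former generation keyword arguments (`temperature`, `top_k`, etc.) into a single `config` dict that is passed through to `genai.types.GenerateContentConfig`. A minimal migration sketch (the table `tbl` and its string column `prompt` are illustrative):

    # 0.3.14: generate_content(tbl.prompt, model='gemini-2.0-flash', temperature=0.8, max_output_tokens=300)
    # 0.3.15: the same parameters nest under config
    tbl.add_computed_column(
        response=generate_content(
            tbl.prompt,
            model='gemini-2.0-flash',
            config={'temperature': 0.8, 'max_output_tokens': 300},
        )
    )

The new `tools` parameter and `invoke_tools` follow the same tool-calling pattern as the other model providers in `pixeltable.functions`. A hedged sketch, assuming `stock_price` is a user-defined UDF to be exposed as a tool:

    tools = pxt.tools(stock_price)
    tbl.add_computed_column(response=generate_content(tbl.prompt, model='gemini-2.0-flash', tools=tools))
    tbl.add_computed_column(tool_calls=invoke_tools(tools, tbl.response))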
pixeltable/functions/math.py CHANGED
@@ -100,6 +100,69 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sql.ColumnElement:
      return sql.func.round(sql.cast(self, sql.Numeric), sql.cast(digits, sql.Integer))


+ @pxt.udf(is_method=True)
+ def pow(self: int, other: int) -> float:
+     """
+     Raise `self` to the power of `other`.
+
+     Equivalent to Python [`self ** other`](https://docs.python.org/3/library/functions.html#pow).
+     """
+     return self**other
+
+
+ @pow.to_sql
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
+     return sql.func.pow(self, other)
+
+
+ @pxt.udf(is_method=True)
+ def bitwise_and(self: int, other: int) -> int:
+     """
+     Bitwise AND of two integers.
+
+     Equivalent to Python
+     [`self & other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
+     """
+     return self & other
+
+
+ @bitwise_and.to_sql
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
+     return self.bitwise_and(other)
+
+
+ @pxt.udf(is_method=True)
+ def bitwise_or(self: int, other: int) -> int:
+     """
+     Bitwise OR of two integers.
+
+     Equivalent to Python
+     [`self | other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
+     """
+     return self | other
+
+
+ @bitwise_or.to_sql
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
+     return self.bitwise_or(other)
+
+
+ @pxt.udf(is_method=True)
+ def bitwise_xor(self: int, other: int) -> int:
+     """
+     Bitwise XOR of two integers.
+
+     Equivalent to Python
+     [`self ^ other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
+     """
+     return self ^ other
+
+
+ @bitwise_xor.to_sql
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
+     return self.bitwise_xor(other)
+
+
  __all__ = local_public_names(__name__)
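Note on usage: because these UDFs are declared with `is_method=True`, they are available as methods on integer expressions, and each carries a `to_sql` translation so the computation can be pushed down into the database. A small sketch (the table `t` and its int column `n` are illustrative):

    t.select(t.n.pow(2), t.n.bitwise_and(0xFF), t.n.bitwise_or(1), t.n.bitwise_xor(7)).collect()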
pixeltable/io/parquet.py CHANGED
@@ -112,11 +112,11 @@ def export_parquet(
          length = len(val)
      elif col_type.is_string_type():
          length = len(val)
-     elif col_type.is_video_type():
+     elif col_type.is_video_type() or col_type.is_audio_type():
          if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
              val = data_row.file_paths[e.slot_idx]
          else:
-             raise excs.Error(f'unknown video type {type(val)}')
+             raise excs.Error(f'unknown audio/video type {type(val)}')
          length = len(val)
      elif col_type.is_json_type():
          val = json.dumps(val)
pixeltable/share/packager.py CHANGED
@@ -17,6 +17,7 @@ import pixeltable as pxt
  from pixeltable import catalog, exceptions as excs, metadata
  from pixeltable.env import Env
  from pixeltable.metadata import schema
+ from pixeltable.utils import sha256sum
  from pixeltable.utils.media_store import MediaStore

  _logger = logging.getLogger('pixeltable')
@@ -88,7 +89,7 @@ class TablePackager:
      assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
      sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
      media_cols: set[str] = set()
-     for col in tv.cols_by_name.values():
+     for col in tv.cols:
          if col.is_stored and col.col_type.is_media_type():
              media_cols.add(col.store_name())

@@ -182,7 +183,12 @@ class TablePackager:
      path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
      if path not in self.media_files:
          # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
-         dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+         # We name the media files in the archive by their SHA256 hash. This ensures that we can properly
+         # deduplicate and validate them later.
+         # If we get a collision, it's not a problem; it just means we have two identical files (which will
+         # be conveniently deduplicated in the bundle).
+         sha = sha256sum(path)
+         dest_name = f'{sha}{path.suffix}'
          self.media_files[path] = dest_name
      return f'pxtmedia://{self.media_files[path]}'
  # For any type of URL other than a local file, just return the URL as-is.
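
Note: the `sha256sum` helper imported from `pixeltable.utils` is not itself shown in this diff. A minimal sketch of what such a helper presumably does (chunked reads, so large media files are never held in memory whole; the actual implementation may differ):

    import hashlib
    from pathlib import Path

    def sha256sum(path: Path) -> str:
        # Hash the file in 64 KiB chunks and return the hex digest.
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b''):
                h.update(chunk)
        return h.hexdigest()

Content-addressed names mean two byte-identical media files map to the same archive entry, which is what enables the deduplication and validation described in the comments above.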
@@ -276,11 +282,182 @@ class TableRestorer:
      tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
      parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
      parquet_table = pq.read_table(str(parquet_dir))
-
-     for batch in parquet_table.to_batches():
+     replica_version = tv.version
+
+     conn = Env.get().conn
+     store_sa_tbl = tv.store_tbl.sa_tbl
+     store_sa_tbl_name = tv.store_tbl._storage_name()
+
+     # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
+     # an existing replica table, and the table version and/or row selection differs from what was imported
+     # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
+     # yields an internally consistent version history for each row.
+
+     # The overall strategy is this:
+     # 1. Import the parquet data into a temporary table;
+     # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
+     # 3. Delete any row instances from the temporary table that are already present in the existing table;
+     # 4. Copy the remaining rows from the temporary table into the existing table.
+
+     # Create a temporary table for the initial data load, containing columns for all columns present in the
+     # parquet table. The parquet columns have identical names to those in the store table, so we can use the
+     # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
+     # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
+     temp_cols: dict[str, sql.Column] = {}
+     for field in parquet_table.schema:
+         assert field.name in store_sa_tbl.columns
+         col_type = store_sa_tbl.columns[field.name].type
+         temp_cols[field.name] = sql.Column(field.name, col_type)
+     temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
+     _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
+     temp_md = sql.MetaData()
+     temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
+     temp_sa_tbl.create(conn)
+
+     # Populate the temporary table with data from the Parquet file.
+     _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
+     for batch in parquet_table.to_batches(max_chunksize=10_000):
          pydict = batch.to_pydict()
          rows = self.__from_pa_pydict(tv, pydict)
-         tv.store_tbl.load_rows(rows)
+         conn.execute(sql.insert(temp_sa_tbl), rows)
+
+     # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
+     # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
+     # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
+     # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
+     # we might see only a subset of the original table's versions, and we might see them out of order.
+
+     # We'll adjust the v_max values according to the principle of "latest provable v_max":
+     # they will always correspond to the latest version for which we can prove the row instance was alive. This
+     # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
+     # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
+     # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
+     # that have never been observed.
+
+     pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
+     pk_clause = sql.and_(*pk_predicates)
+
+     # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
+     # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
+     # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
+     # previously imported replica.
+
+     system_col_names = {col.name for col in tv.store_tbl.system_columns()}
+     media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
+     value_store_cols = [
+         store_sa_tbl.c[col_name]
+         for col_name in temp_cols
+         if col_name not in system_col_names and col_name not in media_col_names
+     ]
+     value_temp_cols = [
+         col
+         for col_name, col in temp_cols.items()
+         if col_name not in system_col_names and col_name not in media_col_names
+     ]
+     mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
+     mismatch_clause = sql.or_(*mismatch_predicates)
+
+     # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+     # one value column. Pseudo-SQL:
+     #
+     # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+     # FROM store_tbl, temp_tbl
+     # WHERE store_tbl.rowid = temp_tbl.rowid
+     #     AND store_tbl.pos_0 = temp_tbl.pos_0
+     #     AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+     #     AND store_tbl.v_min = temp_tbl.v_min
+     #     AND (
+     #         store_tbl.col_0 != temp_tbl.col_0
+     #         OR store_tbl.col_1 != temp_tbl.col_1
+     #         OR ... OR store_tbl.col_n != temp_tbl.col_n
+     #     )
+     #
+     # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+     # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+     # but not the other.
+     q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+     _logger.debug(q.compile())
+     result = conn.execute(q)
+     if result.rowcount > 0:
+         _logger.debug(
+             f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+             f'{result.rowcount} inconsistent row(s).'
+         )
+         row = result.first()
+         _logger.debug('Example mismatch:')
+         _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+         _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+         raise excs.Error(
+             'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
+         )
+     _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
+
+     # Now rectify the v_max values in the temporary table.
+     # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
+     # version when the row was deleted. (This can only happen if later versions of the base table already
+     # existed at the time this replica was published.)
+     # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
+     # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
+     # of "latest provable v_max", we simply set v_max equal to `n + 1`.
+     q = (
+         temp_sa_tbl.update()
+         .values(v_max=(replica_version + 1))
+         .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
+     )
+     _logger.debug(q.compile())
+     result = conn.execute(q)
+     _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')
+
+     # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
+     # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
+     # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
+     q = (
+         store_sa_tbl.update()
+         .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
+         .where(pk_clause)
+     )
+     _logger.debug(q.compile())
+     result = conn.execute(q)
+     _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
+
+     # Now we need to update rows in the existing table that are also present in the temporary table. This is to
+     # account for the scenario where the temporary table has columns that are not present in the existing table.
+     # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
+     # might also occur; there may be columns in the existing table that are not present in the temporary table.)
+     value_update_clauses: dict[str, sql.ColumnElement] = {}
+     for temp_col in temp_cols.values():
+         if temp_col.name not in system_col_names:
+             store_col = store_sa_tbl.c[temp_col.name]
+             # Prefer the value from the existing table, substituting the value from the temporary table if it's
+             # NULL. This works in all cases (including media columns, where we prefer the existing media file).
+             clause = sql.case((store_col == None, temp_col), else_=store_col)
+             value_update_clauses[temp_col.name] = clause
+     if len(value_update_clauses) > 0:
+         q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
+         _logger.debug(q.compile())
+         result = conn.execute(q)
+         _logger.debug(
+             f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
+         )
+
+     # Now drop any rows from the temporary table that are also present in the existing table.
+     # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
+     # been verified identical.
+     # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
+     # files that are already present in the existing table).
+     q = temp_sa_tbl.delete().where(pk_clause)
+     _logger.debug(q.compile())
+     result = conn.execute(q)
+     _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
+
+     # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+     # the actual table.
+     q = store_sa_tbl.insert().from_select(
+         [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
+     )
+     _logger.debug(q.compile())
+     result = conn.execute(q)
+     _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')

  def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
      # Data conversions from pyarrow to Pixeltable
@@ -289,7 +466,7 @@ class TableRestorer:
          assert col_name in tv.store_tbl.sa_tbl.columns
          sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
      media_col_ids: dict[str, int] = {}
-     for col in tv.cols_by_name.values():
+     for col in tv.cols:
          if col.is_stored and col.col_type.is_media_type():
              media_col_ids[col.store_name()] = col.id
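A worked example of the "latest provable v_max" rule, using hypothetical numbers: importing a replica at version 7 that contains a row instance with `v_min = 5, v_max = MAX_VERSION` proves only that the row was alive at version 7, so its `v_max` is rectified to 8. If a replica at version 12 later arrives with the same pk (again with `v_max = MAX_VERSION`, rectified to 13), the `greatest()` update advances the stored value from 8 to 13; if instead the row is absent from the later replica, the stored 8 stands, leaving a harmless gap in that row's observed history.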
pixeltable/store.py CHANGED
@@ -54,6 +54,9 @@ class StoreBase:
      self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
      self.create_sa_tbl()

+ def system_columns(self) -> list[sql.Column]:
+     return [*self._pk_cols, self.v_max_col]
+
  def pk_columns(self) -> list[sql.Column]:
      return self._pk_cols

@@ -215,6 +218,15 @@ class StoreBase:
      log_stmt(_logger, stmt)
      Env.get().conn.execute(stmt)

+ def ensure_columns_exist(self, cols: Iterable[catalog.Column]) -> None:
+     conn = Env.get().conn
+     sql_text = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+     result = conn.execute(sql.text(sql_text))
+     existing_cols = {row[0] for row in result}
+     for col in cols:
+         if col.store_name() not in existing_cols:
+             self.add_column(col)
+
  def load_column(
      self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
  ) -> int:
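
Note: `ensure_columns_exist`, which `create_replica` now invokes right after `store_tbl.create()`, compares the catalog's stored columns against `information_schema.columns` and calls `add_column()` for any that are missing from the Postgres table (presumably issuing an `ALTER TABLE ... ADD COLUMN`). This covers, for example, a replica whose metadata lists a stored column that a previous import never materialized: the column is added before any rows are loaded.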
pixeltable-0.3.14.dist-info/METADATA → pixeltable-0.3.15.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pixeltable
- Version: 0.3.14
+ Version: 0.3.15
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
  License: Apache-2.0
  Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
pixeltable-0.3.14.dist-info/RECORD → pixeltable-0.3.15.dist-info/RECORD
@@ -1,5 +1,5 @@
  pixeltable/__init__.py,sha256=-uXHuiXH98kAlCupUTPbkBh4ToZgxcYUOk7-c9hqCC8,1439
- pixeltable/__version__.py,sha256=XhiJRUjdjrwJPbZIKiObIa0uwoOqWB_MxrLdbtgVGS8,114
+ pixeltable/__version__.py,sha256=HD2KV9QVAFaAv42gJZzYHDLMgRFeCQBTYL5eYWvU_Kk,114
  pixeltable/catalog/__init__.py,sha256=rQmjveID4bk6NI4Ql09lGsZ0K0HVE2l1yqKAveipHzc,558
  pixeltable/catalog/catalog.py,sha256=E-Fqf3xDsFWLuau41trwqFRQbmNjy-LlT_votgYRM8k,48780
  pixeltable/catalog/column.py,sha256=f_jSvdV7DIe3MYXc4n4tSSJmAKIiNfPyT6i97rt2ewA,11133
@@ -10,9 +10,9 @@ pixeltable/catalog/named_function.py,sha256=vZ-j7P4HugWh9OmUzBMwyRYvO3tQn9jWyJz_
  pixeltable/catalog/path.py,sha256=gk8TIlO_7Jpji5mAN0dUNvHmvU0uneTHeB_qCTWnszQ,2529
  pixeltable/catalog/schema_object.py,sha256=J96iXsKMvqTNN0jbcMOPZDSZDNq8688Vkybs5bFcqNk,1818
  pixeltable/catalog/table.py,sha256=mlMbV6cWafm6OC9AHmMfpHuTYTyhGf5rhMHV2537tF8,65129
- pixeltable/catalog/table_version.py,sha256=1Yff0GylzRpsZZS0I8sEGkSxJUMdGqpGoB1QsYjtjGs,62249
+ pixeltable/catalog/table_version.py,sha256=BlxPBec17v_zcZ2pJGqyT9rJfj2Jyxau472DfZYc3jg,62440
  pixeltable/catalog/table_version_handle.py,sha256=LYJFTdRssPu4uOBPbP93wKqXygJXr3Gwdc9wHzzZRag,1654
- pixeltable/catalog/table_version_path.py,sha256=r4WHtP2dkF05UafiQe47RWkszPiy8ZsdcTMA5mEzRp4,6847
+ pixeltable/catalog/table_version_path.py,sha256=_g9knGenpMOlhaK8DZa8iLz5CorsMcbnFqTLnvaUkYM,6653
  pixeltable/catalog/view.py,sha256=u9c9YL9dHw21qGYf-QYST7nULhut-ECK-DWR1_MH3ik,13027
  pixeltable/config.py,sha256=gnRI4G9GE7mQJDcMcn8JsEzYk8oKVfHB-BwoLRWnRDo,3971
  pixeltable/dataframe.py,sha256=n_ZF_JdCq-DkpOzLhlnCspjDS2I8_7M0nlkTcDQVqyA,51593
@@ -84,13 +84,13 @@ pixeltable/functions/bedrock.py,sha256=lTCFHjYunF3minBGWcjXR90yJ8resFjXr4niyKhfx
  pixeltable/functions/date.py,sha256=WUwqyrOWB8A00cTNEd6Vd7anQZo40_-7EWhpfpI-P6c,5323
  pixeltable/functions/deepseek.py,sha256=KYIa-UJJUTOt9cCfmP6k6nM4MpKm1MBU8F-jWk3CycY,3827
  pixeltable/functions/fireworks.py,sha256=k0vUXxeeNYWfL_tdLgF9c-vOilr0g2tTeLkAL9SJ6ws,4972
- pixeltable/functions/gemini.py,sha256=GTtYBCNQG0DXBrBCdd92A2KVfP6JLuBESdCJ2XRrlBU,2969
+ pixeltable/functions/gemini.py,sha256=zWPsvtq0mPFBC_-4N7NDuhTYZfAzRMmZa9S4GFjIpLg,8328
  pixeltable/functions/globals.py,sha256=ZXBV2LPXT2-yQYHHE7q8N1WdAr0WxiIO1ax0qwxhmK8,5118
  pixeltable/functions/huggingface.py,sha256=KM1OH0Jt6XWF2jfpHb6rGhi1mV-AQNYAsHAyQfzW4qw,20560
  pixeltable/functions/image.py,sha256=IKXljMma-uU88efptC3F4aywau7DYcD-Nqd3YpmRNRw,13971
  pixeltable/functions/json.py,sha256=d7-AvwytUQtQYF_JnWJkptT_Yq0NgMpWfVk-m3U6qTY,807
  pixeltable/functions/llama_cpp.py,sha256=uf7WSZIhKDa492snnQv5ojGVLNdBWvuw0Ou3Mch1c_I,3874
- pixeltable/functions/math.py,sha256=gmkeAWm_FbWqiekVOK8fyRs7A87kKE9rCdYE0ETfGj4,3357
+ pixeltable/functions/math.py,sha256=eZEFjXxNHDHjcCsOMhzfNbJthTsmtNxtSFV8AEeRIfM,4979
  pixeltable/functions/mistralai.py,sha256=yZge5T385RoiFGXEZ6OhwWHj0JnsZ8tN8Jb3VkfDmXc,6274
  pixeltable/functions/ollama.py,sha256=AmkP532HwWeTyWkTnHm_hIk0CFjzV5MwCCPnM9Kb7KM,4231
  pixeltable/functions/openai.py,sha256=aDh1L2mBbSlrM8c1Rbh2QsCnmBESItLqzZ-frdgb05k,29259
@@ -115,7 +115,7 @@ pixeltable/io/globals.py,sha256=Z8ww-Pcm59ql1tvame8z0Mu1thIy5BPbW-TswGRXt4s,1136
  pixeltable/io/hf_datasets.py,sha256=gWyBH_0iFvxcrrxMY9_W399ZRcNDCmWFOAMmb1apnY0,5246
  pixeltable/io/label_studio.py,sha256=uB-LReXf1l2OMuzJEENxJP-0C14r14VEmsIulK8Yr3s,31261
  pixeltable/io/pandas.py,sha256=AbOeRDlA4MvUvianSKixsU-x-64nasPWw4HCHD6emz4,8981
- pixeltable/io/parquet.py,sha256=MC2n1ybf0l9O2h873SuNEJHv1bTGA2cV0ei_wQCgbwo,7757
+ pixeltable/io/parquet.py,sha256=Aav5tkqY22gHSWTxsAnlk6MkRLi0OarXgp_N0JYOfHA,7791
  pixeltable/io/table_data_conduit.py,sha256=gdjr82HxJpDfH55xmbIUCX5V-Hkaj6Kmo25NESKChtk,23205
  pixeltable/io/utils.py,sha256=YMfhpqMitWz1PhXJGkCNOgNtEM1AZ55S0zXVhljC5kY,4260
  pixeltable/iterators/__init__.py,sha256=bU4EmbX85J1URmRw6G71f2I77b1ctqngEOwDmRB3T0w,455
@@ -156,9 +156,9 @@ pixeltable/metadata/schema.py,sha256=EKmx29vfQo3eGD2uCJW_lPalPialSb2oUSBGTyewduE
  pixeltable/plan.py,sha256=VfXTvEYYiiLPBdw0hoTmdXHE5IeQKZc1ej8l9a3XAns,43632
  pixeltable/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pixeltable/share/__init__.py,sha256=AtR4nS6YkfkFRkXA-zZXFTK5pSQjHry8MnxdVLUk5SA,68
- pixeltable/share/packager.py,sha256=hF1_jFgEFq-94zF-PykwcgXbl4h40dgemBwMeSj8e9M,15590
+ pixeltable/share/packager.py,sha256=hYpss8DKggZcY9-6Tl8oX6IRUq97p1bqduyfMSfjaZ4,26675
  pixeltable/share/publish.py,sha256=MZ_tsCSM9nUnrz8P1gbwatlpbS6EJYjYAd7S06lHw9M,6533
- pixeltable/store.py,sha256=j9rieIVD2fD7aIEcJ2xPf0vGCi0OirYk-A_JKEB9HVo,23693
+ pixeltable/store.py,sha256=HVe99eQ_fk9CYZHFjYFfFy-sc95R9ADc4clPnHZeNZ0,24233
  pixeltable/type_system.py,sha256=DSrof2NgKhBzvt7pbDNrGlZ3rkkDJ7MQsQ9rqk9N9pA,53988
  pixeltable/utils/__init__.py,sha256=Pwgu-Sg1XkxzdCZ4ZhWP77UgLP3tnQsyCKaUJLF4ajo,1741
  pixeltable/utils/arrow.py,sha256=74wIy58rDYZJBVQ1g85NqzFyiQBvEQhnJ0Gi82iZ0dw,6421
@@ -179,8 +179,8 @@ pixeltable/utils/pytorch.py,sha256=564VHRdDHwD9h0v5lBHEDTJ8c6zx8wuzWYx8ZYjBxlI,3
  pixeltable/utils/s3.py,sha256=pxip2MlCqd2Qon2dzJXzfxvwtZyc-BAsjAnLL4J_OXY,587
  pixeltable/utils/sql.py,sha256=Sa4Lh-VGe8GToU5W7DRiWf2lMl9B6saPqemiT0ZdHEc,806
  pixeltable/utils/transactional_directory.py,sha256=OFKmu90oP7KwBAljwjnzP_w8euGdAXob3y4Nx9SCNHA,1357
- pixeltable-0.3.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- pixeltable-0.3.14.dist-info/METADATA,sha256=Z7hUCAAUayqhytbLmw1_uIW_IS2lUBq-rRd9sSOoHx8,20540
- pixeltable-0.3.14.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- pixeltable-0.3.14.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
- pixeltable-0.3.14.dist-info/RECORD,,
+ pixeltable-0.3.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ pixeltable-0.3.15.dist-info/METADATA,sha256=YAC_YaXgK70Eo19mvT9rhBj0WKkwYVrMvz4b9YMB8Ag,20540
+ pixeltable-0.3.15.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ pixeltable-0.3.15.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
+ pixeltable-0.3.15.dist-info/RECORD,,