pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/store.py
CHANGED
@@ -2,18 +2,16 @@ from __future__ import annotations
 
 import abc
 import logging
-import os
 import sys
-import urllib.parse
-import urllib.request
 import warnings
-from typing import Any, Iterable, Iterator,
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import more_itertools
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-from pixeltable import catalog, exceptions as excs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog.update_status import RowCountStats
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
@@ -41,7 +39,10 @@ class StoreBase:
     v_max_col: sql.Column
     base: Optional[StoreBase]
 
-
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = catalog.TableVersionHandle(
@@ -89,9 +90,8 @@ class StoreBase:
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
            all_cols.append(col.sa_col)
-            if col.
-                all_cols.append(col.
-                all_cols.append(col.sa_errortype_col)
+            if col.stores_cellmd:
+                all_cols.append(col.sa_cellmd_col)
 
         if self.sa_tbl is not None:
             # if we're called in response to a schema change, we need to remove the old table first
@@ -123,49 +123,14 @@ class StoreBase:
         """Return the name of the data store table"""
 
     def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
-
-        pxt_tmp_dir = str(Env.get().tmp_dir)
-        if file_url is None:
-            return None
-        parsed = urllib.parse.urlparse(file_url)
-        # We should never be passed a local file path here. The "len > 1" ensures that Windows
-        # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
-        if parsed.scheme != 'file':
-            # remote url
-            return file_url
-        file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        if not file_path.startswith(pxt_tmp_dir):
-            # not a tmp file
-            return file_url
-        _, ext = os.path.splitext(file_path)
-        new_path = str(MediaStore.prepare_media_path(self.tbl_version.id, col.id, v_min, ext=ext))
-        os.rename(file_path, new_path)
-        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(new_path))
-        return new_file_url
+        return MediaStore.move_tmp_media_file(file_url, self.tbl_version.id, col.id, v_min)
 
     def _move_tmp_media_files(
-        self,
+        self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
     ) -> None:
         """Move tmp media files that we generated to a permanent location"""
-        for
-
-            file_url = table_row[c.store_name()]
-            table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
+        for n, col in media_cols_by_sql_idx.items():
+            table_row[n] = self._move_tmp_media_file(table_row[n], col, v_min)
 
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
@@ -200,11 +165,10 @@ class StoreBase:
         col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
         s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
         added_storage_cols = [col.store_name()]
-        if col.
-
-            s_txt += f' , ADD COLUMN {col.
-
-            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
+        if col.stores_cellmd:
+            cellmd_type_str = col.sa_cellmd_type().compile(dialect=conn.dialect)
+            s_txt += f' , ADD COLUMN {col.cellmd_store_name()} {cellmd_type_str} DEFAULT NULL'
+            added_storage_cols.append(col.cellmd_store_name())
 
         stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
@@ -215,9 +179,8 @@ class StoreBase:
     def drop_column(self, col: catalog.Column) -> None:
         """Execute Alter Table Drop Column statement"""
         s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
-        if col.
-            s_txt += f' , DROP COLUMN {col.
-            s_txt += f' , DROP COLUMN {col.errortype_store_name()}'
+        if col.stores_cellmd:
+            s_txt += f' , DROP COLUMN {col.cellmd_store_name()}'
         stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
         Env.get().conn.execute(stmt)
@@ -231,9 +194,7 @@ class StoreBase:
         if col.store_name() not in existing_cols:
            self.add_column(col)
 
-    def load_column(
-        self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
-    ) -> int:
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -247,72 +208,61 @@ class StoreBase:
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols =
-
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
+        tmp_val_col_sql_idx = len(tmp_pk_cols)
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
-        if col.
-
-            tmp_cols.append(
-
-
+        if col.stores_cellmd:
+            tmp_cellmd_col = sql.Column(col.sa_cellmd_col.name, col.sa_cellmd_col.type)
+            tmp_cols.append(tmp_cellmd_col)
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
         conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
         try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
-            # TODO: unify the table row construction logic with RowBuilder.create_table_row()
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                [... 22 removed lines not rendered in the source diff ...]
-                        # we have yet to store this image
-                        filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                        result_row.flush_img(value_expr_slot_idx, filepath)
-                    val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                    if col.col_type.is_media_type():
-                        val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                    tbl_row[col.sa_col.name] = val
-                    if col.records_errors:
-                        tbl_row[col.sa_errortype_col.name] = None
-                        tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
+                    if col.col_type.is_media_type():
+                        table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
+                            table_row[tmp_val_col_sql_idx], col, row.pk[-1]
+                        )
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
             for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
                 update_stmt = update_stmt.where(pk_col == tmp_pk_col)
             update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
-            if col.
-                update_stmt = update_stmt.values(
-                    {col.sa_errortype_col: tmp_errortype_col, col.sa_errormsg_col: tmp_errormsg_col}
-                )
+            if col.stores_cellmd:
+                update_stmt = update_stmt.values({col.sa_cellmd_col: tmp_cellmd_col})
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
+
         finally:
 
             def remove_tmp_tbl() -> None:
@@ -320,6 +270,7 @@ class StoreBase:
                 tmp_tbl.drop(bind=conn)
 
             run_cleanup(remove_tmp_tbl, raise_error=True)
+
         return num_excs
 
     def insert_rows(
@@ -329,7 +280,7 @@ class StoreBase:
         show_progress: bool = True,
         rowids: Optional[Iterator[int]] = None,
        abort_on_exc: bool = False,
-    ) -> tuple[
+    ) -> tuple[set[int], RowCountStats]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
             number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -341,50 +292,78 @@ class StoreBase:
         cols_with_excs: set[int] = set()
         progress_bar: Optional[tqdm] = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-
-
+
+        store_col_names, media_cols_by_idx = row_builder.store_column_names()
 
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                [... 31 removed lines not rendered in the source diff ...]
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    self._move_tmp_media_files(table_row, media_cols_by_idx, v_min)
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values)
+
+            return cols_with_excs, row_counts
         finally:
             exec_plan.close()
 
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        # insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
     def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
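
The main change in store.py is that load_column() and insert_rows() now buffer converted rows and flush them through the new sql_insert() classmethod every __INSERT_BATCH_SIZE (10,000) rows, rather than issuing one insert per execution batch. The sketch below is not Pixeltable code; the table, column names, and BATCH_SIZE are illustrative stand-ins, but it shows the same buffering pattern with SQLAlchemy.

```python
# Sketch of the buffered-insert pattern introduced above (illustrative names; not Pixeltable code).
import sqlalchemy as sql

BATCH_SIZE = 10_000  # store.py settles on 10_000 after informal benchmarking

engine = sql.create_engine('sqlite://')  # stand-in for the Postgres store database
md = sql.MetaData()
tbl = sql.Table(
    't', md,
    sql.Column('rowid', sql.Integer, primary_key=True),
    sql.Column('val', sql.String),
)
md.create_all(engine)

def sql_insert(conn, col_names, rows):
    # executemany-style insert: one dict per row, keyed by store column name
    conn.execute(sql.insert(tbl), [dict(zip(col_names, r)) for r in rows])

with engine.begin() as conn:
    buffer = []
    for i in range(25_000):  # stand-in for rows produced by an ExecNode plan
        buffer.append((i, f'row {i}'))
        if len(buffer) >= BATCH_SIZE:  # flush a full batch
            sql_insert(conn, ['rowid', 'val'], buffer)
            buffer.clear()
    if buffer:  # flush the remainder
        sql_insert(conn, ['rowid', 'val'], buffer)
    print(conn.execute(sql.select(sql.func.count()).select_from(tbl)).scalar())  # 25000
```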
pixeltable/type_system.py
CHANGED
@@ -1153,8 +1153,8 @@ class ImageType(ColumnType):
                 img.load()
                 return img
             except Exception as exc:
-
-                raise excs.Error(f'data URL could not be decoded into a valid image: {
+                error_msg_val = val if len(val) < 50 else val[:50] + '...'
+                raise excs.Error(f'data URL could not be decoded into a valid image: {error_msg_val}') from exc
         return val
 
     def _validate_literal(self, val: Any) -> None:
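
The type_system.py change only shortens the value echoed in the error message. A quick illustration of the truncation expression, with a made-up data URL standing in for `val`:

```python
# Illustration of the truncation used in the new error message; `val` here is a made-up data URL.
val = 'data:image/png;base64,' + 'A' * 200
error_msg_val = val if len(val) < 50 else val[:50] + '...'
print(len(error_msg_val))  # 53: the first 50 characters of val plus '...'
```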
pixeltable/utils/coroutine.py
CHANGED
@@ -1,10 +1,10 @@
 import asyncio
 import threading
-from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Coroutine, TypeVar
 
-
+from pixeltable.env import Env
 
+T = TypeVar('T')
 
 # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
 # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
@@ -15,27 +15,10 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float
     """
     Runs the given coroutine synchronously, even if called in the context of a running event loop.
     """
-
-    def run_in_new_loop() -> T:
-        new_loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(new_loop)
-        try:
-            return new_loop.run_until_complete(coroutine)
-        finally:
-            new_loop.close()
-
-    try:
-        loop = asyncio.get_running_loop()
-    except RuntimeError:
-        # No event loop; just call `asyncio.run()`
-        return asyncio.run(coroutine)
+    loop = Env.get().event_loop
 
     if threading.current_thread() is threading.main_thread():
-
-        return loop.run_until_complete(coroutine)
-    else:
-        with ThreadPoolExecutor() as pool:
-            future = pool.submit(run_in_new_loop)
-            return future.result(timeout=timeout)
+        return loop.run_until_complete(coroutine)
     else:
-
+        # Not in main thread, use run_coroutine_threadsafe
+        return asyncio.run_coroutine_threadsafe(coroutine, loop).result(timeout)
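
run_coroutine_synchronously() now drives everything through the single event loop owned by Env (Env.get().event_loop): directly via run_until_complete() on the main thread, and via asyncio.run_coroutine_threadsafe() from other threads. Below is a minimal standalone sketch of the same dispatch pattern; the locally created loop stands in for the shared Env loop and only the main-thread path is exercised.

```python
# Sketch of the new dispatch logic; the loop here is local, whereas Pixeltable uses Env.get().event_loop.
import asyncio
import threading

loop = asyncio.new_event_loop()  # stand-in for the shared, long-lived Env event loop

async def work(x: int) -> int:
    await asyncio.sleep(0.01)
    return x * 2

def run_sync(coro, timeout: float = 5.0):
    if threading.current_thread() is threading.main_thread():
        # main thread: drive the shared loop directly
        return loop.run_until_complete(coro)
    # worker thread: hand the coroutine to the shared loop (which must be running elsewhere)
    # and block on the resulting concurrent.futures.Future
    return asyncio.run_coroutine_threadsafe(coro, loop).result(timeout)

print(run_sync(work(21)))  # 42, executed on the main thread
```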
pixeltable/utils/media_store.py
CHANGED
@@ -2,6 +2,7 @@ import glob
 import os
 import re
 import shutil
+import urllib
 import uuid
 from collections import defaultdict
 from pathlib import Path
@@ -34,6 +35,44 @@ class MediaStore:
         parent.mkdir(parents=True, exist_ok=True)
         return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'
 
+    @classmethod
+    def move_tmp_media_file(cls, file_url: Optional[str], tbl_id: UUID, col_id: int, v_min: int) -> Optional[str]:
+        """Move a tmp media file with given url into the MediaStore, and return new url
+        If it is not a tmp file in the tmp_dir, return the original url.
+
+        Args:
+            file_url: URL of the tmp media file to move
+            tbl_id: Table ID to associate with the media file
+            col_id: Column ID to associate with the media file
+            v_min: Version number to associate with the media file
+
+        Returns:
+            URL of the media final location of the file
+        """
+        if file_url is None:
+            return None
+        assert isinstance(file_url, str), type(file_url)
+        pxt_tmp_dir = str(Env.get().tmp_dir)
+        parsed = urllib.parse.urlparse(file_url)
+        # We should never be passed a local file path here. The "len > 1" ensures that Windows
+        # file paths aren't mistaken for URLs with a single-character scheme.
+        assert len(parsed.scheme) > 1, file_url
+        if parsed.scheme != 'file':
+            # remote url
+            return file_url
+        file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
+        if not file_path.startswith(pxt_tmp_dir):
+            # not a tmp file
+            return file_url
+        new_file_url = cls.relocate_local_media_file(Path(file_path), tbl_id, col_id, v_min)
+        return new_file_url
+
+    @classmethod
+    def relocate_local_media_file(cls, src_path: Path, tbl_id: UUID, col_id: int, tbl_version: int) -> str:
+        dest_path = MediaStore.prepare_media_path(tbl_id, col_id, tbl_version, ext=src_path.suffix)
+        src_path.rename(dest_path)
+        return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+
     @classmethod
     def delete(cls, tbl_id: UUID, version: Optional[int] = None) -> None:
         """Delete all files belonging to tbl_id. If version is not None, delete
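
The tmp-file relocation logic previously in store.py now lives in MediaStore.move_tmp_media_file(): remote URLs and files outside the tmp dir pass through unchanged, while tmp files are renamed into the media store and re-emitted as file: URLs. The round trip it relies on looks like this (illustrative POSIX path, not Pixeltable code):

```python
# Round trip between a local path and a 'file:' URL, as used by move_tmp_media_file (POSIX path assumed).
import urllib.parse
import urllib.request
from pathlib import Path

src = Path('/tmp/pixeltable_tmp/abc123.jpg')  # hypothetical tmp media file
file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(src)))
print(file_url)  # file:///tmp/pixeltable_tmp/abc123.jpg

parsed = urllib.parse.urlparse(file_url)
assert len(parsed.scheme) > 1          # a bare Windows drive letter would fail this check
assert parsed.scheme == 'file'         # anything else is treated as a remote URL and left alone
roundtrip = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
assert roundtrip == str(src)           # back to the original filesystem path
```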
{pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.4.1
+Version: 0.4.3
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
|