pixeltable 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +75 -21
- pixeltable/catalog/column.py +10 -0
- pixeltable/catalog/globals.py +121 -18
- pixeltable/catalog/insertable_table.py +2 -1
- pixeltable/catalog/table.py +135 -4
- pixeltable/catalog/table_version.py +106 -66
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/view.py +4 -2
- pixeltable/exprs/column_property_ref.py +2 -11
- pixeltable/exprs/column_ref.py +19 -17
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/row_builder.py +44 -13
- pixeltable/io/external_store.py +79 -52
- pixeltable/io/globals.py +1 -1
- pixeltable/io/label_studio.py +45 -41
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +22 -18
- pixeltable/store.py +114 -103
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/RECORD +28 -26
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_38.py
ADDED
@@ -0,0 +1,39 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=38)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if k == 'col_mapping':
+        assert isinstance(v, list)
+        return k, [__col_mapping_entry(e) for e in v]
+    if k == 'stored_proxies':
+        assert isinstance(v, list)
+        return k, [__stored_proxies_entry(e) for e in v]
+    return None
+
+
+def __col_mapping_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], str)
+    return [__col_handle(e[0]), e[1]]
+
+
+def __stored_proxies_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], dict)
+    return [__col_handle(e[0]), __col_handle(e[1])]
+
+
+def __col_handle(e: dict) -> dict:
+    return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}
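The converter above rewrites the column references kept in external-store metadata ('col_mapping' and 'stored_proxies' entries) from bare tbl_id/col_id pairs into the new ColumnHandle shape. A minimal, self-contained sketch of that substitution applied to one hypothetical col_mapping entry (the table id and field name are made up; the real converter runs this over serialized table metadata via convert_table_md):

def col_handle(e: dict) -> dict:
    # same rewrite as __col_handle() in the converter above
    return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}

# hypothetical pre-v39 entry: [column reference, external field name]
old_entry = [{'tbl_id': '00000000-0000-0000-0000-000000000000', 'col_id': 3}, 'annotation']
new_entry = [col_handle(old_entry[0]), old_entry[1]]
print(new_entry)
# [{'tbl_version': {'id': '00000000-...-000000000000', 'effective_version': None}, 'col_id': 3}, 'annotation']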
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
     36: 'Added Table.lock_dummy',
pixeltable/metadata/utils.py
ADDED
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from pixeltable.metadata import schema
+
+
+class MetadataUtils:
+    @classmethod
+    def _diff_md(
+        cls, old_md: Optional[dict[int, schema.SchemaColumn]], new_md: Optional[dict[int, schema.SchemaColumn]]
+    ) -> str:
+        """Return a string reporting the differences in a specific entry in two dictionaries
+
+        Results are formatted as follows:
+        - If `old_md` is `None`, returns 'Initial Version'.
+        - If `old_md` and `new_md` are the same, returns an empty string.
+        - If there are additions, changes, or deletions, returns a string summarizing the changes.
+        """
+        assert new_md is not None
+        if old_md is None:
+            return 'Initial Version'
+        if old_md == new_md:
+            return ''
+        added = {k: v.name for k, v in new_md.items() if k not in old_md}
+        changed = {
+            k: f'{old_md[k].name!r} to {v.name!r}'
+            for k, v in new_md.items()
+            if k in old_md and old_md[k].name != v.name
+        }
+        deleted = {k: v.name for k, v in old_md.items() if k not in new_md}
+        if len(added) == 0 and len(changed) == 0 and len(deleted) == 0:
+            return ''
+        # Format the result
+        t = []
+        if len(added) > 0:
+            t.append('Added: ' + ', '.join(added.values()))
+        if len(changed) > 0:
+            t.append('Renamed: ' + ', '.join(changed.values()))
+        if len(deleted) > 0:
+            t.append('Deleted: ' + ', '.join(deleted.values()))
+        r = ', '.join(t)
+        return r
+
+    @classmethod
+    def _create_md_change_dict(
+        cls, md_list: Optional[list[tuple[int, dict[int, schema.SchemaColumn]]]]
+    ) -> dict[int, str]:
+        """Return a dictionary of schema changes by version
+        Args:
+            md_list: a list of tuples, each containing a version number and a metadata dictionary.
+        """
+        r: dict[int, str] = {}
+        if md_list is None or len(md_list) == 0:
+            return r
+
+        # Sort the list in place by version number
+        md_list.sort()
+
+        first_retrieved_version = md_list[0][0]
+        if first_retrieved_version == 0:
+            prev_md = None
+            prev_ver = -1
+            start = 0
+        else:
+            prev_md = md_list[0][1]
+            prev_ver = first_retrieved_version
+            start = 1
+
+        for ver, curr_md in md_list[start:]:
+            if ver == prev_ver:
+                continue
+            assert ver > prev_ver
+            tf = cls._diff_md(prev_md, curr_md)
+            if tf != '':
+                r[ver] = tf
+            prev_md = curr_md
+        return r
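The reporting in _diff_md reduces to three comparisons keyed by column id: additions, renames, and deletions. The following self-contained sketch reproduces that logic on plain name strings in place of schema.SchemaColumn objects (whose definition is not part of this diff), just to illustrate the output format:

from typing import Optional

def diff_names(old: Optional[dict[int, str]], new: dict[int, str]) -> str:
    # mirrors MetadataUtils._diff_md, with bare column names standing in for SchemaColumn objects
    if old is None:
        return 'Initial Version'
    if old == new:
        return ''
    added = [v for k, v in new.items() if k not in old]
    renamed = [f'{old[k]!r} to {v!r}' for k, v in new.items() if k in old and old[k] != v]
    deleted = [v for k, v in old.items() if k not in new]
    parts = []
    if added:
        parts.append('Added: ' + ', '.join(added))
    if renamed:
        parts.append('Renamed: ' + ', '.join(renamed))
    if deleted:
        parts.append('Deleted: ' + ', '.join(deleted))
    return ', '.join(parts)

print(diff_names({0: 'a', 1: 'b'}, {0: 'a', 1: 'b2', 2: 'c'}))
# -> Added: c, Renamed: 'b' to 'b2'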
pixeltable/plan.py
CHANGED
@@ -378,7 +378,7 @@ class Planner:
 
         cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
 
-        row_builder = exprs.RowBuilder([], stored_cols, [])
+        row_builder = exprs.RowBuilder([], stored_cols, [], tbl)
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
@@ -473,15 +473,19 @@ class Planner:
         assert isinstance(tbl, catalog.TableVersionPath)
         target = tbl.tbl_version.get()  # the one we need to update
         updated_cols = list(update_targets.keys())
+        recomputed_cols: set[Column]
         if len(recompute_targets) > 0:
-
+            assert len(update_targets) == 0
+            recomputed_cols = {*recompute_targets}
+            if cascade:
+                recomputed_cols |= target.get_dependent_columns(recomputed_cols)
         else:
             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
-
-
-
-
-
+        # regardless of cascade, we need to update all indices on any updated/recomputed column
+        idx_val_cols = target.get_idx_val_columns(set(updated_cols) | recomputed_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
@@ -588,7 +592,7 @@ class Planner:
         sql_exprs = list(
             exprs.Expr.list_subexprs(analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False)
         )
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs, target)
         analyzer.finalize(row_builder)
         sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
@@ -602,8 +606,7 @@ class Planner:
         row_builder.set_slot_idxs(select_list, remove_duplicates=False)
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-
-        ctx = exec.ExecContext(row_builder)
+        ctx = exec.ExecContext(row_builder, num_computed_exprs=len(recomputed_exprs))
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
@@ -695,7 +698,7 @@ class Planner:
         base_analyzer = Analyzer(
             from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
         )
-        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [], target)
 
         # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
@@ -832,7 +835,11 @@ class Planner:
             order_by_clause=order_by_clause,
             sample_clause=sample_clause,
         )
-
+        # If the from_clause has a single table, we can use it as the context table for the RowBuilder.
+        # Otherwise there is no context table, but that's ok, because the context table is only needed for
+        # table mutations, which can't happen during a join.
+        context_tbl = from_clause.tbls[0].tbl_version.get() if len(from_clause.tbls) == 1 else None
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [], context_tbl)
 
         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -1035,16 +1042,14 @@ class Planner:
         return Analyzer(FromClause(tbls=[tbl]), [], where_clause=where_clause)
 
     @classmethod
-    def create_add_column_plan(
-        cls, tbl: catalog.TableVersionPath, col: catalog.Column
-    ) -> tuple[exec.ExecNode, Optional[int]]:
+    def create_add_column_plan(cls, tbl: catalog.TableVersionPath, col: catalog.Column) -> exec.ExecNode:
         """Creates a plan for InsertableTable.add_column()
         Returns:
             plan: the plan to execute
             value_expr slot idx for the plan output (for computed cols)
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
+        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[], tbl=tbl.tbl_version.get())
         analyzer = Analyzer(FromClause(tbls=[tbl]), row_builder.default_eval_ctx.target_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
@@ -1056,5 +1061,4 @@ class Planner:
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
            plan.set_stored_img_cols(row_builder.output_slot_idxs())
-
-        return plan, value_expr_slot_idx
+        return plan
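The update-plan hunk above decides which columns must be recomputed: explicit recompute targets (plus their dependents when cascading), or the dependents of the updated columns, in both cases extended with the index value columns of anything touched and then restricted to stored columns. A rough, self-contained sketch of that set computation on string column names (the helpers and names here are hypothetical; the real code works on catalog.Column objects, and get_dependent_columns may return transitive dependents):

def recompute_set(
    updated: set[str],
    recompute_targets: set[str],
    dependents: dict[str, set[str]],    # computed columns that read a given column
    idx_val_cols: dict[str, set[str]],  # index value columns maintained for a given column
    stored: set[str],
    cascade: bool,
) -> set[str]:
    # mirrors the branch structure above: explicit recompute targets vs. update-driven recompute
    if recompute_targets:
        recomputed = set(recompute_targets)
        if cascade:
            recomputed |= {d for c in recomputed for d in dependents.get(c, set())}
    else:
        recomputed = {d for c in updated for d in dependents.get(c, set())} if cascade else set()
    # regardless of cascade, indices on any updated/recomputed column must be refreshed
    recomputed |= {i for c in updated | recomputed for i in idx_val_cols.get(c, set())}
    # only stored columns are materialized; unstored ones are substituted away
    return {c for c in recomputed if c in stored}

print(recompute_set(
    updated={'price'},
    recompute_targets=set(),
    dependents={'price': {'total'}},
    idx_val_cols={'total': {'total_idx_val'}},
    stored={'price', 'total', 'total_idx_val'},
    cascade=True,
))
# -> {'total', 'total_idx_val'} (set ordering may vary)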
pixeltable/store.py
CHANGED
@@ -7,13 +7,14 @@ import sys
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Any, Iterable, Iterator,
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import more_itertools
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-from pixeltable import catalog, exceptions as excs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog import RowCountStats, UpdateStatus
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
@@ -41,7 +42,10 @@ class StoreBase:
     v_max_col: sql.Column
     base: Optional[StoreBase]
 
-
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = catalog.TableVersionHandle(
@@ -124,13 +128,14 @@ class StoreBase:
 
     def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
         """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(Env.get().tmp_dir)
         if file_url is None:
             return None
+        assert isinstance(file_url, str), type(file_url)
+        pxt_tmp_dir = str(Env.get().tmp_dir)
         parsed = urllib.parse.urlparse(file_url)
         # We should never be passed a local file path here. The "len > 1" ensures that Windows
         # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
+        assert len(parsed.scheme) > 1, file_url
         if parsed.scheme != 'file':
             # remote url
             return file_url
@@ -145,27 +150,11 @@ class StoreBase:
         return new_file_url
 
     def _move_tmp_media_files(
-        self,
+        self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
     ) -> None:
         """Move tmp media files that we generated to a permanent location"""
-        for
-
-            file_url = table_row[c.store_name()]
-            table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
+        for n, col in media_cols_by_sql_idx.items():
+            table_row[n] = self._move_tmp_media_file(table_row[n], col, v_min)
 
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
@@ -231,9 +220,7 @@ class StoreBase:
         if col.store_name() not in existing_cols:
             self.add_column(col)
 
-    def load_column(
-        self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
-    ) -> int:
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -247,60 +234,51 @@ class StoreBase:
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols =
-
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
+        tmp_val_col_sql_idx = len(tmp_pk_cols)
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
         if col.records_errors:
             tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
-            tmp_cols.append(tmp_errortype_col)
             tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
-            tmp_cols.
+            tmp_cols.extend((tmp_errortype_col, tmp_errormsg_col))
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
         conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
        try:
+            table_rows: list[tuple[Any]] = []
+
            # insert rows from exec_plan into temp table
-            # TODO: unify the table row construction logic with RowBuilder.create_table_row()
            for row_batch in exec_plan:
                num_rows += len(row_batch)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # we have yet to store this image
-                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                result_row.flush_img(value_expr_slot_idx, filepath)
-                val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                if col.col_type.is_media_type():
-                    val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                tbl_row[col.sa_col.name] = val
-                if col.records_errors:
-                    tbl_row[col.sa_errortype_col.name] = None
-                    tbl_row[col.sa_errormsg_col.name] = None
-
-                tbl_rows.append(tbl_row)
-            conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
+                    if col.col_type.is_media_type():
+                        table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
+                            table_row[tmp_val_col_sql_idx], col, row.pk[-1]
+                        )
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
@@ -313,6 +291,7 @@ class StoreBase:
             )
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
+
         finally:
 
             def remove_tmp_tbl() -> None:
@@ -320,6 +299,7 @@ class StoreBase:
                 tmp_tbl.drop(bind=conn)
 
             run_cleanup(remove_tmp_tbl, raise_error=True)
+
        return num_excs
 
    def insert_rows(
@@ -329,7 +309,7 @@ class StoreBase:
        show_progress: bool = True,
        rowids: Optional[Iterator[int]] = None,
        abort_on_exc: bool = False,
-    ) -> tuple[
+    ) -> tuple[set[int], UpdateStatus]:
        """Insert rows into the store table and update the catalog table's md
        Returns:
            number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -341,50 +321,81 @@ class StoreBase:
        cols_with_excs: set[int] = set()
        progress_bar: Optional[tqdm] = None  # create this only after we started executing
        row_builder = exec_plan.row_builder
-
-
+
+        store_col_names, media_cols_by_idx = row_builder.store_column_names()
 
        try:
+            table_rows: list[tuple[Any]] = []
            exec_plan.open()
+
            for row_batch in exec_plan:
                num_rows += len(row_batch)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    self._move_tmp_media_files(table_row, media_cols_by_idx, v_min)
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
            if progress_bar is not None:
                progress_bar.close()
-
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(
+                ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values
+            )  # insert (StoreBase)
+
+            return cols_with_excs, UpdateStatus(row_count_stats=row_counts)
+
        finally:
            exec_plan.close()
 
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        #   differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        #   insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
    def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
        """Return filter for base versions"""
        v = versions[0]
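Both load_column and insert_rows now accumulate converted rows and flush them to the database in batches of __INSERT_BATCH_SIZE via the new sql_insert helper, which passes one parameter dict per buffered row to a single SQLAlchemy insert. A self-contained sketch of that accumulate-and-flush pattern against a throwaway SQLite table (the table, column names, and the tiny batch size are made up for the demo; the real code uses 10_000 and the store table's connection):

import sqlalchemy as sql

engine = sql.create_engine('sqlite:///:memory:')
md = sql.MetaData()
tbl = sql.Table('t', md, sql.Column('rowid', sql.Integer, primary_key=True), sql.Column('val', sql.String))
md.create_all(engine)

BATCH_SIZE = 3  # stand-in for __INSERT_BATCH_SIZE
col_names = ['rowid', 'val']
pending: list[tuple] = []

def flush(conn: sql.engine.Connection) -> None:
    # executemany-style insert: one statement, one parameter dict per buffered row
    conn.execute(sql.insert(tbl), [dict(zip(col_names, row)) for row in pending])
    pending.clear()

with engine.begin() as conn:
    for i in range(10):
        pending.append((i, f'row {i}'))
        if len(pending) >= BATCH_SIZE:
            flush(conn)
    if pending:  # insert any remaining rows
        flush(conn)
    print(conn.execute(sql.select(sql.func.count()).select_from(tbl)).scalar())  # -> 10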
{pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.4.1
+Version: 0.4.2
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
|