pixeltable 0.0.0__py3-none-any.whl
- pixeltable/__init__.py +53 -0
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +181 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +192 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +695 -0
- pixeltable/catalog/table_version.py +1026 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/dataframe.py +749 -0
- pixeltable/env.py +466 -0
- pixeltable/exceptions.py +17 -0
- pixeltable/exec/__init__.py +10 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +94 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +73 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +226 -0
- pixeltable/exprs/__init__.py +25 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +114 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +199 -0
- pixeltable/exprs/expr.py +594 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +382 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +96 -0
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +109 -0
- pixeltable/exprs/inline_dict.py +103 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +66 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +329 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +7 -0
- pixeltable/func/aggregate_function.py +197 -0
- pixeltable/func/callable_function.py +113 -0
- pixeltable/func/expr_template_function.py +99 -0
- pixeltable/func/function.py +141 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +46 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +162 -0
- pixeltable/func/udf.py +164 -0
- pixeltable/functions/__init__.py +95 -0
- pixeltable/functions/eval.py +215 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +167 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +289 -0
- pixeltable/functions/pil/image.py +147 -0
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +143 -0
- pixeltable/functions/util.py +52 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/io/hf_datasets.py +188 -0
- pixeltable/io/pandas.py +148 -0
- pixeltable/io/parquet.py +192 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +52 -0
- pixeltable/iterators/document.py +432 -0
- pixeltable/iterators/video.py +88 -0
- pixeltable/metadata/__init__.py +58 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +234 -0
- pixeltable/plan.py +620 -0
- pixeltable/store.py +424 -0
- pixeltable/tool/create_test_db_dump.py +184 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +846 -0
- pixeltable/utils/__init__.py +17 -0
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +18 -0
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +69 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/http_server.py +70 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.0.0.dist-info/LICENSE +18 -0
- pixeltable-0.0.0.dist-info/METADATA +131 -0
- pixeltable-0.0.0.dist-info/RECORD +119 -0
- pixeltable-0.0.0.dist-info/WHEEL +4 -0
pixeltable/exec/component_iteration_node.py
@@ -0,0 +1,79 @@
+from typing import Generator, Optional
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+import pixeltable.catalog as catalog
+import pixeltable.exprs as exprs
+import pixeltable.exceptions as excs
+
+
+class ComponentIterationNode(ExecNode):
+    """Expands each row from a base table into one row per component returned by an iterator
+
+    Returns row batches of OUTPUT_BATCH_SIZE size.
+    """
+    OUTPUT_BATCH_SIZE = 1024
+
+    def __init__(self, view: catalog.TableVersion, input: ExecNode):
+        assert view.is_component_view()
+        super().__init__(input.row_builder, [], [], input)
+        self.view = view
+        iterator_args = [view.iterator_args.copy()]
+        self.row_builder.substitute_exprs(iterator_args)
+        self.iterator_args = iterator_args[0]
+        assert isinstance(self.iterator_args, exprs.InlineDict)
+        self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
+        self.iterator_output_schema, self.unstored_column_names = \
+            self.view.iterator_cls.output_schema(**self.iterator_args.to_dict())
+        self.iterator_output_fields = list(self.iterator_output_schema.keys())
+        self.iterator_output_cols = \
+            {field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields}
+        # referenced iterator output fields
+        self.refd_output_slot_idxs = {
+            e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
+            if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
+        }
+        self._output: Optional[Generator[DataRowBatch, None, None]] = None
+
+    def _output_batches(self) -> Generator[DataRowBatch, None, None]:
+        output_batch = DataRowBatch(self.view, self.row_builder)
+        for input_batch in self.input:
+            for input_row in input_batch:
+                self.row_builder.eval(input_row, self.iterator_args_ctx)
+                iterator_args = input_row[self.iterator_args.slot_idx]
+                iterator = self.view.iterator_cls(**iterator_args)
+                for pos, component_dict in enumerate(iterator):
+                    output_row = output_batch.add_row()
+                    input_row.copy(output_row)
+                    # we're expanding the input and need to add the iterator position to the pk
+                    pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
+                    output_row.set_pk(pk)
+
+                    # verify and copy component_dict fields to their respective slots in output_row
+                    for field_name, field_val in component_dict.items():
+                        if field_name not in self.iterator_output_fields:
+                            raise excs.Error(
+                                f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
+                        if field_name not in self.refd_output_slot_idxs:
+                            # we can ignore this
+                            continue
+                        output_col = self.iterator_output_cols[field_name]
+                        output_col.col_type.validate_literal(field_val)
+                        output_row[self.refd_output_slot_idxs[field_name]] = field_val
+                    if len(component_dict) != len(self.iterator_output_fields):
+                        missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
+                        raise excs.Error(
+                            f'Invalid output of {self.view.iterator_cls.__name__}: '
+                            f'missing fields {", ".join(missing_fields)}')
+
+                    if len(output_batch) == self.OUTPUT_BATCH_SIZE:
+                        yield output_batch
+                        output_batch = DataRowBatch(self.view, self.row_builder)
+
+        if len(output_batch) > 0:
+            yield output_batch
+
+    def __next__(self) -> DataRowBatch:
+        if self._output is None:
+            self._output = self._output_batches()
+        return next(self._output)
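The node above implements an expand-and-rebatch pattern: each input row fans out into a variable number of component rows, which a generator regroups into fixed-size output batches. A minimal standalone sketch of that pattern (toy names, not pixeltable's API):

from typing import Any, Callable, Generator, Iterable, Iterator, List, Tuple

OUTPUT_BATCH_SIZE = 3  # tiny value for demonstration

def expand_and_rebatch(
    input_batches: Iterable[List[Any]],
    make_iterator: Callable[[Any], Iterator[dict]],
) -> Generator[List[Tuple[Any, int, dict]], None, None]:
    """Fan each row out into (row, pos, component) tuples, regrouped into fixed-size batches."""
    output_batch: List[Tuple[Any, int, dict]] = []
    for batch in input_batches:
        for row in batch:
            # `pos` becomes part of the output row's identity, mirroring how
            # ComponentIterationNode appends the iterator position to the pk
            for pos, component in enumerate(make_iterator(row)):
                output_batch.append((row, pos, component))
                if len(output_batch) == OUTPUT_BATCH_SIZE:
                    yield output_batch
                    output_batch = []
    if len(output_batch) > 0:
        yield output_batch  # final, possibly short batch

# usage: split sentences into one row per word
batches = [[('s1', 'a b c'), ('s2', 'd e')]]
for out in expand_and_rebatch(batches, lambda row: ({'word': w} for w in row[1].split())):
    print(out)  # two batches: 3 tuples, then 2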
pixeltable/exec/data_row_batch.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+from typing import List, Iterator, Optional
+import logging
+
+import pixeltable.exprs as exprs
+import pixeltable.catalog as catalog
+from pixeltable.utils.media_store import MediaStore
+
+
+_logger = logging.getLogger('pixeltable')
+
+class DataRowBatch:
+    """Set of DataRows, indexed by rowid.
+
+    Contains the metadata needed to initialize DataRows.
+    """
+    def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
+        self.tbl = tbl
+        self.row_builder = row_builder
+        self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
+        # non-image media slots
+        self.media_slot_idxs = [
+            e.slot_idx for e in row_builder.unique_exprs
+            if e.col_type.is_media_type() and not e.col_type.is_image_type()
+        ]
+        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
+        self.rows = [
+            exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+            for _ in range(len)
+        ]
+
+    def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
+        if row is None:
+            row = exprs.DataRow(
+                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+        self.rows.append(row)
+        return row
+
+    def pop_row(self) -> exprs.DataRow:
+        return self.rows.pop()
+
+    def set_row_ids(self, row_ids: List[int]) -> None:
+        """Sets pks for rows in batch"""
+        assert self.tbl is not None
+        assert len(row_ids) == len(self.rows)
+        for row, row_id in zip(self.rows, row_ids):
+            row.set_pk((row_id, self.tbl))
+
+    def __len__(self) -> int:
+        return len(self.rows)
+
+    def __getitem__(self, index: object) -> exprs.DataRow:
+        return self.rows[index]
+
+    def flush_imgs(
+            self, idx_range: Optional[slice] = None, stored_img_info: Optional[List[exprs.ColumnSlotIdx]] = None,
+            flushed_slot_idxs: Optional[List[int]] = None
+    ) -> None:
+        """Flushes images in the given range of rows."""
+        assert self.tbl is not None
+        if stored_img_info is None:
+            stored_img_info = []
+        if flushed_slot_idxs is None:
+            flushed_slot_idxs = []
+        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
+            return
+        if idx_range is None:
+            idx_range = slice(0, len(self.rows))
+        for row in self.rows[idx_range]:
+            for info in stored_img_info:
+                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
+                row.flush_img(info.slot_idx, filepath)
+            for slot_idx in flushed_slot_idxs:
+                row.flush_img(slot_idx)
+
+    def __iter__(self) -> Iterator[exprs.DataRow]:
+        return DataRowBatchIterator(self)
+
+
+class DataRowBatchIterator:
+    """
+    Iterator over a DataRowBatch.
+    """
+    def __init__(self, batch: DataRowBatch):
+        self.row_batch = batch
+        self.index = 0
+
+    def __next__(self) -> exprs.DataRow:
+        if self.index >= len(self.row_batch.rows):
+            raise StopIteration
+        row = self.row_batch.rows[self.index]
+        self.index += 1
+        return row
+
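DataRowBatch exposes the standard Python sequence protocol (`__len__`, `__getitem__` with slice support, `__iter__`) over its row list, using a separate iterator class rather than a generator. A toy container showing the same protocol, including the slice access that flush_imgs relies on (hypothetical names):

from typing import Iterator, List

class ToyBatch:
    """Mirrors DataRowBatch's container protocol over a plain row list."""
    def __init__(self, rows: List[int]):
        self.rows = rows

    def add_row(self, row: int) -> int:
        self.rows.append(row)
        return row

    def __len__(self) -> int:
        return len(self.rows)

    def __getitem__(self, index: object) -> object:
        # accepts both ints and slices, like DataRowBatch's rows[idx_range]
        return self.rows[index]

    def __iter__(self) -> Iterator[int]:
        return iter(self.rows)  # stands in for DataRowBatchIterator

b = ToyBatch([10, 20])
b.add_row(30)
assert len(b) == 3 and b[0] == 10 and b[slice(1, 3)] == [20, 30]
assert list(b) == [10, 20, 30]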
pixeltable/exec/exec_context.py
@@ -0,0 +1,22 @@
+from typing import Optional, List
+
+import sqlalchemy as sql
+
+import pixeltable.exprs as exprs
+
+class ExecContext:
+    """Class for execution runtime constants"""
+    def __init__(
+            self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
+            pk_clause: Optional[List[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
+            ignore_errors: bool = False
+    ):
+        self.show_pbar = show_pbar
+        self.batch_size = batch_size
+        self.profile = exprs.ExecProfile(row_builder)
+        # num_rows is used to compute the total number of computed cells used for the progress bar
+        self.num_rows: Optional[int] = None
+        self.conn: Optional[sql.engine.Connection] = None  # if present, use this to execute SQL queries
+        self.pk_clause = pk_clause
+        self.num_computed_exprs = num_computed_exprs
+        self.ignore_errors = ignore_errors
pixeltable/exec/exec_node.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Iterable, Optional, List
+import abc
+
+from .data_row_batch import DataRowBatch
+from .exec_context import ExecContext
+import pixeltable.exprs as exprs
+
+class ExecNode(abc.ABC):
+    """Base class of all execution nodes"""
+    def __init__(
+            self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
+            input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+        self.row_builder = row_builder
+        self.input = input
+        # we flush all image slots that aren't part of our output but are needed to create our output
+        output_slot_idxs = {e.slot_idx for e in output_exprs}
+        output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
+        self.flushed_img_slots = [
+            e.slot_idx for e in output_dependencies
+            if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
+        ]
+        self.stored_img_cols: List[exprs.ColumnSlotIdx] = []
+        self.ctx: Optional[ExecContext] = None  # all nodes of a tree share the same context
+
+    def set_ctx(self, ctx: ExecContext) -> None:
+        self.ctx = ctx
+        if self.input is not None:
+            self.input.set_ctx(ctx)
+
+    def set_stored_img_cols(self, stored_img_cols: List[exprs.ColumnSlotIdx]) -> None:
+        self.stored_img_cols = stored_img_cols
+        # propagate batch size to the source
+        if self.input is not None:
+            self.input.set_stored_img_cols(stored_img_cols)
+
+    def __iter__(self):
+        return self
+
+    @abc.abstractmethod
+    def __next__(self) -> DataRowBatch:
+        pass
+
+    def open(self) -> None:
+        """Bottom-up initialization of nodes for execution. Must be called before __next__."""
+        if self.input is not None:
+            self.input.open()
+        self._open()
+
+    def close(self) -> None:
+        """Frees node resources top-down after execution. Must be called after final __next__."""
+        self._close()
+        if self.input is not None:
+            self.input.close()
+
+    def _open(self) -> None:
+        pass
+
+    def _close(self) -> None:
+        pass
+
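An ExecNode tree is a pull-based pipeline: the driver shares one ExecContext across the tree via set_ctx(), brackets iteration with open()/close() (bottom-up and top-down, respectively), and pulls batches through the iterator protocol. A minimal sketch of such a driver loop, assuming a hypothetical node rather than pixeltable's planner:

from typing import Iterator, List, Optional

class CountNode:
    """Toy ExecNode-alike: emits a fixed number of single-row batches."""
    def __init__(self, n: int, input: Optional['CountNode'] = None):
        self.n, self.input, self.ctx = n, input, None

    def set_ctx(self, ctx: dict) -> None:
        self.ctx = ctx
        if self.input is not None:
            self.input.set_ctx(ctx)  # every node in the tree shares the context

    def open(self) -> None:
        if self.input is not None:
            self.input.open()  # bottom-up: inputs first

    def close(self) -> None:
        if self.input is not None:
            self.input.close()  # top-down: this node's cleanup runs first

    def __iter__(self) -> Iterator[List[int]]:
        return self

    def __next__(self) -> List[int]:
        if self.n == 0:
            raise StopIteration
        self.n -= 1
        return [self.n]

root = CountNode(3)
root.set_ctx({'ignore_errors': False})
root.open()
try:
    for batch in root:  # repeated __next__ until StopIteration
        print(batch)
finally:
    root.close()  # must run after the final __next__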
pixeltable/exec/expr_eval_node.py
@@ -0,0 +1,217 @@
+import logging
+import sys
+import time
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional
+
+from tqdm import tqdm, TqdmWarning
+
+import pixeltable.exprs as exprs
+from pixeltable.func import CallableFunction
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+
+_logger = logging.getLogger('pixeltable')
+
+
+class ExprEvalNode(ExecNode):
+    """Materializes expressions
+    """
+    @dataclass
+    class Cohort:
+        """List of exprs that form an evaluation context and contain calls to at most one external function"""
+        exprs: List[exprs.Expr]
+        batched_fn: Optional[CallableFunction]
+        segment_ctxs: List[exprs.RowBuilder.EvalCtx]
+        target_slot_idxs: List[int]
+        batch_size: int = 8
+
+    def __init__(
+            self, row_builder: exprs.RowBuilder, output_exprs: List[exprs.Expr], input_exprs: List[exprs.Expr],
+            input: ExecNode
+    ):
+        super().__init__(row_builder, output_exprs, input_exprs, input)
+        self.input_exprs = input_exprs
+        input_slot_idxs = {e.slot_idx for e in input_exprs}
+        # we're only materializing exprs that are not already in the input
+        self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
+        self.pbar: Optional[tqdm] = None
+        self.cohorts: List[List[ExprEvalNode.Cohort]] = []
+        self._create_cohorts()
+
+    def __next__(self) -> DataRowBatch:
+        input_batch = next(self.input)
+        # compute target exprs
+        for cohort in self.cohorts:
+            self._exec_cohort(cohort, input_batch)
+        _logger.debug(f'ExprEvalNode: returning {len(input_batch)} rows')
+        return input_batch
+
+    def _open(self) -> None:
+        warnings.simplefilter("ignore", category=TqdmWarning)
+        if self.ctx.show_pbar:
+            self.pbar = tqdm(
+                total=len(self.target_exprs) * self.ctx.num_rows,
+                desc='Computing cells',
+                unit=' cells',
+                ncols=100,
+                file=sys.stdout
+            )
+
+    def _close(self) -> None:
+        if self.pbar is not None:
+            self.pbar.close()
+
+    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
+        if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
+            return expr.fn
+        return None
+
+    def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
+        return self._get_batched_fn(expr) is not None
+
+    def _create_cohorts(self) -> None:
+        all_exprs = self.row_builder.get_dependencies(self.target_exprs)
+        # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
+        # seed the cohorts with only the ext fn calls
+        cohorts: List[List[exprs.Expr]] = []
+        current_batched_fn: Optional[CallableFunction] = None
+        for e in all_exprs:
+            if not self._is_batched_fn_call(e):
+                continue
+            if current_batched_fn is None or current_batched_fn != e.fn:
+                # create a new cohort
+                cohorts.append([])
+                current_batched_fn = e.fn
+            cohorts[-1].append(e)
+
+        # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
+        # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
+        exclude = set([e.slot_idx for e in self.input_exprs])
+        all_target_slot_idxs = set([e.slot_idx for e in self.target_exprs])
+        target_slot_idxs: List[List[int]] = []  # the ones materialized by each cohort
+        for i in range(len(cohorts)):
+            cohorts[i] = self.row_builder.get_dependencies(
+                cohorts[i], exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude])
+            target_slot_idxs.append(
+                [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
+            exclude.update(target_slot_idxs[-1])
+
+        all_cohort_slot_idxs = set([e.slot_idx for cohort in cohorts for e in cohort])
+        remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
+        if len(remaining_slot_idxs) > 0:
+            cohorts.append(self.row_builder.get_dependencies(
+                [self.row_builder.unique_exprs[slot_idx] for slot_idx in remaining_slot_idxs],
+                exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude]))
+            target_slot_idxs.append(list(remaining_slot_idxs))
+        # we need to have captured all target slots at this point
+        assert all_target_slot_idxs == set().union(*target_slot_idxs)
+
+        for i in range(len(cohorts)):
+            cohort = cohorts[i]
+            # segment the cohort into sublists that contain either a single ext. function call or no ext. function calls
+            # (i.e., only computed cols)
+            assert len(cohort) > 0
+            # create the first segment here, so we can avoid checking for an empty list in the loop
+            segments = [[cohort[0]]]
+            is_batched_segment = self._is_batched_fn_call(cohort[0])
+            batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
+            for e in cohort[1:]:
+                if self._is_batched_fn_call(e):
+                    segments.append([e])
+                    is_batched_segment = True
+                    batched_fn = self._get_batched_fn(e)
+                else:
+                    if is_batched_segment:
+                        # start a new segment
+                        segments.append([])
+                        is_batched_segment = False
+                    segments[-1].append(e)
+
+            # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
+            segment_ctxs = [
+                exprs.RowBuilder.EvalCtx(
+                    slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
+                for s in segments
+            ]
+            cohort_info = self.Cohort(cohort, batched_fn, segment_ctxs, target_slot_idxs[i])
+            self.cohorts.append(cohort_info)
+
+    def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
+        """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
+        batch_start_idx = 0  # start row of the current sub-batch
+        # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
+        ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
+        if ext_batch_size is not None:
+            cohort.batch_size = ext_batch_size
+
+        while batch_start_idx < len(rows):
+            num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
+            for segment_ctx in cohort.segment_ctxs:
+                if not self._is_batched_fn_call(segment_ctx.exprs[0]):
+                    # compute batch row-wise
+                    for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
+                        self.row_builder.eval(
+                            rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
+                else:
+                    fn_call = segment_ctx.exprs[0]
+                    # make a batched external function call
+                    arg_batches = [[] for _ in range(len(fn_call.args))]
+                    kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
+
+                    valid_batch_idxs: List[int] = []  # rows with exceptions are not valid
+                    for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
+                        row = rows[row_idx]
+                        if row.has_exc(fn_call.slot_idx):
+                            # one of our inputs had an exception, skip this row
+                            continue
+                        valid_batch_idxs.append(row_idx)
+                        args, kwargs = fn_call._make_args(row)
+                        [arg_batches[i].append(args[i]) for i in range(len(args))]
+                        [kwarg_batches[k].append(kwargs[k]) for k in kwargs.keys()]
+                    num_valid_batch_rows = len(valid_batch_idxs)
+
+                    if ext_batch_size is None:
+                        # we need to choose a batch size based on the args
+                        sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
+                        ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
+
+                    num_remaining_batch_rows = num_valid_batch_rows
+                    while num_remaining_batch_rows > 0:
+                        # we make ext. fn calls in batches of ext_batch_size
+                        if ext_batch_size is None:
+                            pass
+                        num_ext_batch_rows = min(ext_batch_size, num_remaining_batch_rows)
+                        ext_batch_offset = num_valid_batch_rows - num_remaining_batch_rows  # offset into args, not rows
+                        call_args = [
+                            arg_batches[i][ext_batch_offset:ext_batch_offset + num_ext_batch_rows]
+                            for i in range(len(arg_batches))
+                        ]
+                        call_kwargs = {
+                            k: kwarg_batches[k][ext_batch_offset:ext_batch_offset + num_ext_batch_rows]
+                            for k in kwarg_batches.keys()
+                        }
+                        start_ts = time.perf_counter()
+                        result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
+                        self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
+                        self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
+
+                        # move the result into the row batch
+                        for result_idx in range(len(result_batch)):
+                            row_idx = valid_batch_idxs[ext_batch_offset + result_idx]
+                            row = rows[row_idx]
+                            row[fn_call.slot_idx] = result_batch[result_idx]
+
+                        num_remaining_batch_rows -= num_ext_batch_rows
+
+                    # switch to the ext fn batch size
+                    cohort.batch_size = ext_batch_size
+
+            # make sure images for stored cols have been saved to files before moving on to the next batch
+            rows.flush_imgs(
+                slice(batch_start_idx, batch_start_idx + num_batch_rows), self.stored_img_cols, self.flushed_img_slots)
+            if self.pbar is not None:
+                self.pbar.update(num_batch_rows * len(cohort.target_slot_idxs))
+            batch_start_idx += num_batch_rows
+
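The batched branch of _exec_cohort accumulates one argument list per parameter for the rows without upstream exceptions, slices those lists into chunks of the external function's batch size, and scatters each result back through the saved row indexes. A self-contained sketch of that gather-call-scatter loop; `batched_fn` below is a stand-in, not pixeltable's CallableFunction:

from typing import Any, Dict, List

def batched_fn(xs: List[int]) -> List[int]:
    return [x * 2 for x in xs]  # stand-in for fn.exec_batch

def exec_batched(rows: List[Dict[str, Any]], valid_idxs: List[int], ext_batch_size: int) -> None:
    """Gather args from valid rows, call in ext_batch_size chunks, scatter results back."""
    arg_batch = [rows[i]['in'] for i in valid_idxs]  # gather (rows with exceptions excluded)
    num_valid = num_remaining = len(valid_idxs)
    while num_remaining > 0:
        n = min(ext_batch_size, num_remaining)
        offset = num_valid - num_remaining  # offset into args, not rows
        for i, result in enumerate(batched_fn(arg_batch[offset:offset + n])):
            rows[valid_idxs[offset + i]]['out'] = result  # scatter via saved row index
        num_remaining -= n

rows = [{'in': v} for v in (1, 2, 3, 4, 5)]
exec_batched(rows, valid_idxs=[0, 1, 3, 4], ext_batch_size=3)  # row 2: upstream exception
assert [r.get('out') for r in rows] == [2, 4, None, 8, 10]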
pixeltable/exec/in_memory_data_node.py
@@ -0,0 +1,73 @@
+from typing import List, Dict, Any, Optional
+import urllib
+import logging
+import os
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+import pixeltable.catalog as catalog
+import pixeltable.exprs as exprs
+import pixeltable.env as env
+from pixeltable.utils.media_store import MediaStore
+
+
+_logger = logging.getLogger('pixeltable')
+
+class InMemoryDataNode(ExecNode):
+    """Outputs in-memory data as a row batch of a particular table"""
+    def __init__(
+            self, tbl: catalog.TableVersionPath, rows: List[Dict[str, Any]],
+            row_builder: exprs.RowBuilder, start_row_id: int,
+    ):
+        super().__init__(row_builder, [], [], None)
+        assert tbl.is_insertable()
+        self.tbl = tbl
+        self.input_rows = rows
+        self.start_row_id = start_row_id
+        self.has_returned_data = False
+        self.output_rows: Optional[DataRowBatch] = None
+
+    def _open(self) -> None:
+        """Create row batch and populate with self.input_rows"""
+        column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
+        # exclude system columns
+        user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
+        # stored columns that are not computed
+        inserted_col_ids = set([
+            info.col.id for info in self.row_builder.output_slot_idxs()
+            if info.col.is_stored and not info.col.is_computed
+        ])
+
+        self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
+        for row_idx, input_row in enumerate(self.input_rows):
+            # populate the output row with the values provided in the input row
+            input_col_ids: List[int] = []
+            for col_name, val in input_row.items():
+                col_info = user_column_info.get(col_name)
+                assert col_info is not None
+
+                if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
+                    # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
+                    path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.version))
+                    open(path, 'wb').write(val)
+                    val = path
+                self.output_rows[row_idx][col_info.slot_idx] = val
+                input_col_ids.append(col_info.col.id)
+
+            # set the remaining stored non-computed columns to null
+            null_col_ids = inserted_col_ids - set(input_col_ids)
+            for col_id in null_col_ids:
+                col_info = column_info.get(col_id)
+                assert col_info is not None
+                self.output_rows[row_idx][col_info.slot_idx] = None
+
+        self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
+        self.ctx.num_rows = len(self.output_rows)
+
+    def __next__(self) -> DataRowBatch:
+        if self.has_returned_data:
+            raise StopIteration
+        self.has_returned_data = True
+        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
+        return self.output_rows
+
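InMemoryDataNode copies each user-supplied value into its column's slot and then nulls out every remaining stored, non-computed column, so each output row is fully materialized. The same fill-the-gaps idea in miniature (hypothetical schema, not pixeltable's catalog):

from typing import Any, Dict, List, Optional

def materialize(input_rows: List[Dict[str, Any]], insertable_cols: List[str]) -> List[Dict[str, Optional[Any]]]:
    """Copy provided values; unspecified insertable columns become None."""
    out = []
    for input_row in input_rows:
        row: Dict[str, Optional[Any]] = {col: None for col in insertable_cols}  # nulls for the gaps
        for col, val in input_row.items():
            assert col in insertable_cols, f'unknown column {col}'
            row[col] = val
        out.append(row)
    return out

print(materialize([{'a': 1}, {'a': 2, 'b': 'x'}], insertable_cols=['a', 'b']))
# [{'a': 1, 'b': None}, {'a': 2, 'b': 'x'}]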
pixeltable/exec/media_validation_node.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+from typing import Iterable, Optional
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+import pixeltable.exprs as exprs
+import pixeltable.exceptions as excs
+
+
+class MediaValidationNode(ExecNode):
+    """Validation of selected media slots
+    Records exceptions in the rows of the input batch
+    """
+    def __init__(
+            self, row_builder: exprs.RowBuilder, media_slots: Iterable[exprs.ColumnSlotIdx],
+            input: Optional[ExecNode]):
+        super().__init__(row_builder, [], [], input)
+        self.row_builder = row_builder
+        self.input = input
+        for col in [c.col for c in media_slots]:
+            assert col.col_type.is_media_type()
+        self.media_slots = media_slots
+
+    def __next__(self) -> DataRowBatch:
+        assert self.input is not None
+        row_batch = next(self.input)
+        for row in row_batch:
+            for slot_idx, col in [(c.slot_idx, c.col) for c in self.media_slots]:
+                if row.has_exc(slot_idx):
+                    continue
+                assert row.has_val[slot_idx]
+                path = row.file_paths[slot_idx]
+                if path is None:
+                    continue
+
+                try:
+                    col.col_type.validate_media(path)
+                except excs.Error as exc:
+                    self.row_builder.set_exc(row, slot_idx, exc)
+                    if not self.ctx.ignore_errors:
+                        raise exc
+
+        return row_batch
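The validation loop records each failure on the offending slot and re-raises only when ignore_errors is off, so a single bad file can either abort the operation or merely mark its row. The record-or-raise idiom in isolation (toy validator, not pixeltable's type system):

from typing import Callable, Dict, List

def validate_all(paths: List[str], validate: Callable[[str], None], ignore_errors: bool) -> Dict[int, Exception]:
    """Validate each path; record errors per row instead of failing fast when ignore_errors is set."""
    errors: Dict[int, Exception] = {}
    for i, path in enumerate(paths):
        try:
            validate(path)
        except ValueError as exc:
            errors[i] = exc  # record on the offending row/slot
            if not ignore_errors:
                raise  # fail fast, mirroring `raise exc` above
    return errors

def check(path: str) -> None:
    if path.endswith('.bad'):
        raise ValueError(f'invalid media file: {path}')

errs = validate_all(['a.mp4', 'b.bad', 'c.mp4'], check, ignore_errors=True)
assert list(errs) == [1]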