pixeltable 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (119) hide show
  1. pixeltable/__init__.py +53 -0
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/__init__.py +13 -0
  4. pixeltable/catalog/catalog.py +159 -0
  5. pixeltable/catalog/column.py +181 -0
  6. pixeltable/catalog/dir.py +32 -0
  7. pixeltable/catalog/globals.py +33 -0
  8. pixeltable/catalog/insertable_table.py +192 -0
  9. pixeltable/catalog/named_function.py +36 -0
  10. pixeltable/catalog/path.py +58 -0
  11. pixeltable/catalog/path_dict.py +139 -0
  12. pixeltable/catalog/schema_object.py +39 -0
  13. pixeltable/catalog/table.py +695 -0
  14. pixeltable/catalog/table_version.py +1026 -0
  15. pixeltable/catalog/table_version_path.py +133 -0
  16. pixeltable/catalog/view.py +203 -0
  17. pixeltable/dataframe.py +749 -0
  18. pixeltable/env.py +466 -0
  19. pixeltable/exceptions.py +17 -0
  20. pixeltable/exec/__init__.py +10 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +94 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +73 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +226 -0
  31. pixeltable/exprs/__init__.py +25 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +114 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +199 -0
  39. pixeltable/exprs/expr.py +594 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +382 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +96 -0
  44. pixeltable/exprs/in_predicate.py +96 -0
  45. pixeltable/exprs/inline_array.py +109 -0
  46. pixeltable/exprs/inline_dict.py +103 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +66 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +329 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/similarity_expr.py +65 -0
  56. pixeltable/exprs/type_cast.py +53 -0
  57. pixeltable/exprs/variable.py +45 -0
  58. pixeltable/ext/__init__.py +5 -0
  59. pixeltable/ext/functions/yolox.py +92 -0
  60. pixeltable/func/__init__.py +7 -0
  61. pixeltable/func/aggregate_function.py +197 -0
  62. pixeltable/func/callable_function.py +113 -0
  63. pixeltable/func/expr_template_function.py +99 -0
  64. pixeltable/func/function.py +141 -0
  65. pixeltable/func/function_registry.py +227 -0
  66. pixeltable/func/globals.py +46 -0
  67. pixeltable/func/nos_function.py +202 -0
  68. pixeltable/func/signature.py +162 -0
  69. pixeltable/func/udf.py +164 -0
  70. pixeltable/functions/__init__.py +95 -0
  71. pixeltable/functions/eval.py +215 -0
  72. pixeltable/functions/fireworks.py +34 -0
  73. pixeltable/functions/huggingface.py +167 -0
  74. pixeltable/functions/image.py +16 -0
  75. pixeltable/functions/openai.py +289 -0
  76. pixeltable/functions/pil/image.py +147 -0
  77. pixeltable/functions/string.py +13 -0
  78. pixeltable/functions/together.py +143 -0
  79. pixeltable/functions/util.py +52 -0
  80. pixeltable/functions/video.py +62 -0
  81. pixeltable/globals.py +425 -0
  82. pixeltable/index/__init__.py +2 -0
  83. pixeltable/index/base.py +51 -0
  84. pixeltable/index/embedding_index.py +168 -0
  85. pixeltable/io/__init__.py +3 -0
  86. pixeltable/io/hf_datasets.py +188 -0
  87. pixeltable/io/pandas.py +148 -0
  88. pixeltable/io/parquet.py +192 -0
  89. pixeltable/iterators/__init__.py +3 -0
  90. pixeltable/iterators/base.py +52 -0
  91. pixeltable/iterators/document.py +432 -0
  92. pixeltable/iterators/video.py +88 -0
  93. pixeltable/metadata/__init__.py +58 -0
  94. pixeltable/metadata/converters/convert_10.py +18 -0
  95. pixeltable/metadata/converters/convert_12.py +3 -0
  96. pixeltable/metadata/converters/convert_13.py +41 -0
  97. pixeltable/metadata/schema.py +234 -0
  98. pixeltable/plan.py +620 -0
  99. pixeltable/store.py +424 -0
  100. pixeltable/tool/create_test_db_dump.py +184 -0
  101. pixeltable/tool/create_test_video.py +81 -0
  102. pixeltable/type_system.py +846 -0
  103. pixeltable/utils/__init__.py +17 -0
  104. pixeltable/utils/arrow.py +98 -0
  105. pixeltable/utils/clip.py +18 -0
  106. pixeltable/utils/coco.py +136 -0
  107. pixeltable/utils/documents.py +69 -0
  108. pixeltable/utils/filecache.py +195 -0
  109. pixeltable/utils/help.py +11 -0
  110. pixeltable/utils/http_server.py +70 -0
  111. pixeltable/utils/media_store.py +76 -0
  112. pixeltable/utils/pytorch.py +91 -0
  113. pixeltable/utils/s3.py +13 -0
  114. pixeltable/utils/sql.py +17 -0
  115. pixeltable/utils/transactional_directory.py +35 -0
  116. pixeltable-0.0.0.dist-info/LICENSE +18 -0
  117. pixeltable-0.0.0.dist-info/METADATA +131 -0
  118. pixeltable-0.0.0.dist-info/RECORD +119 -0
  119. pixeltable-0.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,79 @@
1
+ from typing import Generator, Optional
2
+
3
+ from .data_row_batch import DataRowBatch
4
+ from .exec_node import ExecNode
5
+ import pixeltable.catalog as catalog
6
+ import pixeltable.exprs as exprs
7
+ import pixeltable.exceptions as excs
8
+
9
+
10
class ComponentIterationNode(ExecNode):
    """Expands each row from a base table into one row per component returned by an iterator

    For every input row, the view's iterator class is instantiated with the evaluated iterator
    arguments; each dict it yields becomes one output row (a copy of the input row plus the
    referenced iterator output fields).

    Returns row batches of OUTPUT_BATCH_SIZE size.
    """
    OUTPUT_BATCH_SIZE = 1024

    def __init__(self, view: catalog.TableVersion, input: ExecNode):
        """
        Args:
            view: the component view whose iterator is used to expand the input rows
            input: the node supplying the base table rows; its row builder is reused
        """
        assert view.is_component_view()
        super().__init__(input.row_builder, [], [], input)
        self.view = view
        # the result is read back from element 0 after the call, so substitute_exprs() presumably
        # rewrites the list in place
        iterator_args = [view.iterator_args.copy()]
        self.row_builder.substitute_exprs(iterator_args)
        self.iterator_args = iterator_args[0]
        assert isinstance(self.iterator_args, exprs.InlineDict)
        self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
        self.iterator_output_schema, self.unstored_column_names = \
            self.view.iterator_cls.output_schema(**self.iterator_args.to_dict())
        self.iterator_output_fields = list(self.iterator_output_schema.keys())
        self.iterator_output_cols = \
            {field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields}
        # referenced iterator output fields
        self.refd_output_slot_idxs = {
            e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
            if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
        }
        # created lazily on the first call to __next__()
        self._output: Optional[Generator[DataRowBatch, None, None]] = None

    def _output_batches(self) -> Generator[DataRowBatch, None, None]:
        """Yields the expanded rows in batches of OUTPUT_BATCH_SIZE; the last batch may be smaller.

        Raises:
            excs.Error: if the iterator returns a field that is not part of its output schema or
                omits a field that is.
        """
        output_batch = DataRowBatch(self.view, self.row_builder)
        for input_batch in self.input:
            for input_row in input_batch:
                self.row_builder.eval(input_row, self.iterator_args_ctx)
                iterator_args = input_row[self.iterator_args.slot_idx]
                iterator = self.view.iterator_cls(**iterator_args)
                for pos, component_dict in enumerate(iterator):
                    output_row = output_batch.add_row()
                    input_row.copy(output_row)
                    # we're expanding the input and need to add the iterator position to the pk
                    pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
                    output_row.set_pk(pk)

                    # verify and copy component_dict fields to their respective slots in output_row
                    for field_name, field_val in component_dict.items():
                        if field_name not in self.iterator_output_fields:
                            raise excs.Error(
                                f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
                        if field_name not in self.refd_output_slot_idxs:
                            # we can ignore this
                            continue
                        output_col = self.iterator_output_cols[field_name]
                        output_col.col_type.validate_literal(field_val)
                        output_row[self.refd_output_slot_idxs[field_name]] = field_val
                    if len(component_dict) != len(self.iterator_output_fields):
                        # unknown names were rejected above, so a length mismatch means that declared
                        # output fields are absent; report against the full schema (not only the
                        # referenced fields), otherwise the message can come out empty
                        missing_fields = set(self.iterator_output_fields) - set(component_dict.keys())
                        raise excs.Error(
                            f'Invalid output of {self.view.iterator_cls.__name__}: '
                            f'missing fields {", ".join(sorted(missing_fields))}')

                    if len(output_batch) == self.OUTPUT_BATCH_SIZE:
                        yield output_batch
                        output_batch = DataRowBatch(self.view, self.row_builder)

        if len(output_batch) > 0:
            yield output_batch

    def __next__(self) -> DataRowBatch:
        """Returns the next output batch; StopIteration propagates from the underlying generator."""
        if self._output is None:
            self._output = self._output_batches()
        return next(self._output)
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+ from typing import List, Iterator, Optional
3
+ import logging
4
+
5
+ import pixeltable.exprs as exprs
6
+ import pixeltable.catalog as catalog
7
+ from pixeltable.utils.media_store import MediaStore
8
+
9
+
10
+ _logger = logging.getLogger('pixeltable')
11
+
12
class DataRowBatch:
    """Set of DataRows, indexed by rowid.

    Contains the metadata needed to initialize DataRows.
    """
    def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
        """
        Args:
            tbl: table the rows belong to; required by set_row_ids() and flush_imgs()
            row_builder: supplies the slot layout (unique_exprs, num_materialized) of each row
            len: number of empty rows to pre-allocate (the name shadows the builtin inside this
                method only; it is kept because it is part of the caller-visible signature)
        """
        self.tbl = tbl
        self.row_builder = row_builder
        self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
        # non-image media slots
        self.media_slot_idxs = [
            e.slot_idx for e in row_builder.unique_exprs
            if e.col_type.is_media_type() and not e.col_type.is_image_type()
        ]
        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
        self.rows = [
            exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
            for _ in range(len)
        ]

    def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
        """Appends `row` (or a newly created empty row if None) and returns it."""
        if row is None:
            row = exprs.DataRow(
                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
        self.rows.append(row)
        return row

    def pop_row(self) -> exprs.DataRow:
        """Removes and returns the last row of the batch."""
        return self.rows.pop()

    def set_row_ids(self, row_ids: List[int]) -> None:
        """Sets pks for rows in batch

        Each pk is the tuple (row id, table version).
        """
        assert self.tbl is not None
        assert len(row_ids) == len(self.rows)
        # the pk carries the table's version number, not the table object itself
        version = self.tbl.version
        for row, row_id in zip(self.rows, row_ids):
            row.set_pk((row_id, version))

    def __len__(self) -> int:
        return len(self.rows)

    def __getitem__(self, index: object) -> exprs.DataRow:
        return self.rows[index]

    def flush_imgs(
            self, idx_range: Optional[slice] = None, stored_img_info: Optional[List[exprs.ColumnSlotIdx]] = None,
            flushed_slot_idxs: Optional[List[int]] = None
    ) -> None:
        """Flushes images in the given range of rows.

        Args:
            idx_range: rows to flush; defaults to all rows
            stored_img_info: image slots that are flushed to a media file path prepared for their column
            flushed_slot_idxs: image slots that are flushed without a target file path
        """
        assert self.tbl is not None
        if stored_img_info is None:
            stored_img_info = []
        if flushed_slot_idxs is None:
            flushed_slot_idxs = []
        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
            return
        if idx_range is None:
            idx_range = slice(0, len(self.rows))
        for row in self.rows[idx_range]:
            for info in stored_img_info:
                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
                row.flush_img(info.slot_idx, filepath)
            for slot_idx in flushed_slot_idxs:
                row.flush_img(slot_idx)

    def __iter__(self) -> Iterator[exprs.DataRow]:
        return DataRowBatchIterator(self)
78
+
79
+
80
class DataRowBatchIterator:
    """
    Iterator over a DataRowBatch.

    Rows are read from the batch's `rows` list by position on every call, so rows appended to
    the batch while iterating are also returned.
    """
    def __init__(self, batch: DataRowBatch):
        self.row_batch = batch
        self.index = 0  # position of the next row to return

    def __iter__(self) -> DataRowBatchIterator:
        # required by the iterator protocol: iter(iterator) must return the iterator itself
        return self

    def __next__(self) -> exprs.DataRow:
        if self.index >= len(self.row_batch.rows):
            raise StopIteration
        row = self.row_batch.rows[self.index]
        self.index += 1
        return row
94
+
@@ -0,0 +1,22 @@
1
+ from typing import Optional, List
2
+
3
+ import sqlalchemy as sql
4
+
5
+ import pixeltable.exprs as exprs
6
+
7
class ExecContext:
    """Class for execution runtime constants"""
    def __init__(
            self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
            pk_clause: Optional[List[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
            ignore_errors: bool = False
    ):
        # options supplied by the caller
        self.show_pbar, self.batch_size = show_pbar, batch_size
        self.pk_clause, self.num_computed_exprs = pk_clause, num_computed_exprs
        self.ignore_errors = ignore_errors

        # per-execution profile, created from the row builder
        self.profile = exprs.ExecProfile(row_builder)

        # state that starts out unset:
        # - num_rows is used to compute the total number of computed cells used for the progress bar
        # - conn, if present, is used to execute SQL queries
        self.num_rows: Optional[int] = None
        self.conn: Optional[sql.engine.Connection] = None
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+ from typing import Iterable, Optional, List
3
+ import abc
4
+
5
+ from .data_row_batch import DataRowBatch
6
+ from .exec_context import ExecContext
7
+ import pixeltable.exprs as exprs
8
+
9
class ExecNode(abc.ABC):
    """Base class of all execution nodes

    Nodes form a chain via `input`; iterating a node yields DataRowBatches. Call open() before
    the first __next__() and close() after the last one.
    """
    def __init__(
            self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
            input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
        """
        Args:
            row_builder: row builder shared by the nodes of the plan
            output_exprs: exprs this node produces
            input_exprs: exprs already produced by the input; excluded from the dependency lookup
            input: upstream node, or None for a source node
        """
        self.row_builder = row_builder
        self.input = input
        # output_exprs is traversed twice below; materialize both arguments so that one-shot
        # iterables (e.g. generators) are not silently exhausted before get_dependencies() sees them
        output_exprs = list(output_exprs)
        input_exprs = list(input_exprs)
        # we flush all image slots that aren't part of our output but are needed to create our output
        output_slot_idxs = {e.slot_idx for e in output_exprs}
        output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
        self.flushed_img_slots = [
            e.slot_idx for e in output_dependencies
            if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
        ]
        self.stored_img_cols: List[exprs.ColumnSlotIdx] = []
        self.ctx: Optional[ExecContext] = None  # all nodes of a tree share the same context

    def set_ctx(self, ctx: ExecContext) -> None:
        """Sets the execution context on this node and, recursively, on its input."""
        self.ctx = ctx
        if self.input is not None:
            self.input.set_ctx(ctx)

    def set_stored_img_cols(self, stored_img_cols: List[exprs.ColumnSlotIdx]) -> None:
        """Sets the stored image columns on this node and, recursively, on its input."""
        self.stored_img_cols = stored_img_cols
        # propagate the stored image columns to the source
        if self.input is not None:
            self.input.set_stored_img_cols(stored_img_cols)

    def __iter__(self):
        return self

    @abc.abstractmethod
    def __next__(self) -> DataRowBatch:
        pass

    def open(self) -> None:
        """Bottom-up initialization of nodes for execution. Must be called before __next__."""
        if self.input is not None:
            self.input.open()
        self._open()

    def close(self) -> None:
        """Frees node resources top-down after execution. Must be called after final __next__."""
        self._close()
        if self.input is not None:
            self.input.close()

    def _open(self) -> None:
        """Hook for subclasses; called by open() after the input has been opened."""
        pass

    def _close(self) -> None:
        """Hook for subclasses; called by close() before the input is closed."""
        pass
61
+
@@ -0,0 +1,217 @@
1
+ import logging
2
+ import sys
3
+ import time
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import List, Optional
7
+
8
+ from tqdm import tqdm, TqdmWarning
9
+
10
+ import pixeltable.exprs as exprs
11
+ from pixeltable.func import CallableFunction
12
+ from .data_row_batch import DataRowBatch
13
+ from .exec_node import ExecNode
14
+
15
+ _logger = logging.getLogger('pixeltable')
16
+
17
+
18
class ExprEvalNode(ExecNode):
    """Materializes expressions

    The target exprs (output exprs not already present in the input) are partitioned into
    cohorts, each containing calls to at most one batched function. Cohorts are evaluated in
    order on every input batch; batched function calls are made on sub-batches of rows.
    """
    @dataclass
    class Cohort:
        """List of exprs that form an evaluation context and contain calls to at most one external function"""
        exprs: List[exprs.Expr]
        batched_fn: Optional[CallableFunction]
        segment_ctxs: List[exprs.RowBuilder.EvalCtx]
        target_slot_idxs: List[int]
        batch_size: int = 8

    def __init__(
            self, row_builder: exprs.RowBuilder, output_exprs: List[exprs.Expr], input_exprs: List[exprs.Expr],
            input: ExecNode
    ):
        super().__init__(row_builder, output_exprs, input_exprs, input)
        self.input_exprs = input_exprs
        input_slot_idxs = {e.slot_idx for e in input_exprs}
        # we're only materializing exprs that are not already in the input
        self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
        self.pbar: Optional[tqdm] = None
        self.cohorts: List[ExprEvalNode.Cohort] = []
        self._create_cohorts()

    def __next__(self) -> DataRowBatch:
        """Fetches the next input batch, evaluates all cohorts on it in place and returns it."""
        input_batch = next(self.input)
        # compute target exprs
        for cohort in self.cohorts:
            self._exec_cohort(cohort, input_batch)
        _logger.debug(f'ExprEvalNode: returning {len(input_batch)} rows')
        return input_batch

    def _open(self) -> None:
        """Creates the progress bar if the context requests one."""
        warnings.simplefilter("ignore", category=TqdmWarning)
        if self.ctx.show_pbar:
            # ctx.num_rows starts out as None; without a known row count the bar has no total
            num_rows = self.ctx.num_rows
            total = len(self.target_exprs) * num_rows if num_rows is not None else None
            self.pbar = tqdm(
                total=total,
                desc='Computing cells',
                unit=' cells',
                ncols=100,
                file=sys.stdout
            )

    def _close(self) -> None:
        if self.pbar is not None:
            self.pbar.close()

    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
        """Returns expr's function if expr is a call to a batched CallableFunction, otherwise None."""
        if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
            return expr.fn
        return None

    def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
        return self._get_batched_fn(expr) is not None

    def _create_cohorts(self) -> None:
        """Partitions the target exprs and their dependencies into self.cohorts."""
        all_exprs = self.row_builder.get_dependencies(self.target_exprs)
        # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
        # seed the cohorts with only the ext fn calls
        cohorts: List[List[exprs.Expr]] = []
        current_batched_fn: Optional[CallableFunction] = None
        for e in all_exprs:
            if not self._is_batched_fn_call(e):
                continue
            if current_batched_fn is None or current_batched_fn != e.fn:
                # create a new cohort
                cohorts.append([])
                current_batched_fn = e.fn
            cohorts[-1].append(e)

        # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
        # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
        exclude = {e.slot_idx for e in self.input_exprs}
        all_target_slot_idxs = {e.slot_idx for e in self.target_exprs}
        target_slot_idxs: List[List[int]] = []  # the ones materialized by each cohort
        for i in range(len(cohorts)):
            cohorts[i] = self.row_builder.get_dependencies(
                cohorts[i], exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude])
            target_slot_idxs.append(
                [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
            exclude.update(target_slot_idxs[-1])

        all_cohort_slot_idxs = {e.slot_idx for cohort in cohorts for e in cohort}
        remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
        if len(remaining_slot_idxs) > 0:
            cohorts.append(self.row_builder.get_dependencies(
                [self.row_builder.unique_exprs[slot_idx] for slot_idx in remaining_slot_idxs],
                exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude]))
            target_slot_idxs.append(list(remaining_slot_idxs))
        # we need to have captured all target slots at this point
        assert all_target_slot_idxs == set().union(*target_slot_idxs)

        # cohorts and target_slot_idxs were appended to in lockstep above
        for cohort, cohort_target_slot_idxs in zip(cohorts, target_slot_idxs):
            # segment the cohort into sublists that contain either a single ext. function call or no ext. function calls
            # (i.e., only computed cols)
            assert len(cohort) > 0
            # create the first segment here, so we can avoid checking for an empty list in the loop
            segments = [[cohort[0]]]
            is_batched_segment = self._is_batched_fn_call(cohort[0])
            batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
            for e in cohort[1:]:
                if self._is_batched_fn_call(e):
                    segments.append([e])
                    is_batched_segment = True
                    batched_fn = self._get_batched_fn(e)
                else:
                    if is_batched_segment:
                        # start a new segment
                        segments.append([])
                        is_batched_segment = False
                    segments[-1].append(e)

            # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
            segment_ctxs = [
                exprs.RowBuilder.EvalCtx(
                    slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
                for s in segments
            ]
            cohort_info = self.Cohort(cohort, batched_fn, segment_ctxs, cohort_target_slot_idxs)
            self.cohorts.append(cohort_info)

    def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
        """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
        batch_start_idx = 0  # start row of the current sub-batch
        # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
        ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
        if ext_batch_size is not None:
            cohort.batch_size = ext_batch_size

        while batch_start_idx < len(rows):
            num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
            batch_row_idxs = range(batch_start_idx, batch_start_idx + num_batch_rows)
            for segment_ctx in cohort.segment_ctxs:
                if not self._is_batched_fn_call(segment_ctx.exprs[0]):
                    # compute batch row-wise
                    for row_idx in batch_row_idxs:
                        self.row_builder.eval(
                            rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
                    continue

                fn_call = segment_ctx.exprs[0]
                # make a batched external function call
                arg_batches = [[] for _ in range(len(fn_call.args))]
                kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}

                valid_batch_idxs: List[int] = []  # rows with exceptions are not valid
                for row_idx in batch_row_idxs:
                    row = rows[row_idx]
                    if row.has_exc(fn_call.slot_idx):
                        # one of our inputs had an exception, skip this row
                        continue
                    valid_batch_idxs.append(row_idx)
                    args, kwargs = fn_call._make_args(row)
                    for i, arg in enumerate(args):
                        arg_batches[i].append(arg)
                    for k, kwarg in kwargs.items():
                        kwarg_batches[k].append(kwarg)
                num_valid_batch_rows = len(valid_batch_idxs)

                if num_valid_batch_rows == 0:
                    # every row of this sub-batch already carries an exception: there is nothing to
                    # call the function with and no sample args to derive a batch size from
                    continue

                if ext_batch_size is None:
                    # we need to choose a batch size based on the args
                    sample_args = [arg_batch[0] for arg_batch in arg_batches]
                    ext_batch_size = fn_call.fn.get_batch_size(*sample_args)

                num_remaining_batch_rows = num_valid_batch_rows
                while num_remaining_batch_rows > 0:
                    # we make ext. fn calls in batches of ext_batch_size
                    num_ext_batch_rows = min(ext_batch_size, num_remaining_batch_rows)
                    ext_batch_offset = num_valid_batch_rows - num_remaining_batch_rows  # offset into args, not rows
                    ext_batch_end = ext_batch_offset + num_ext_batch_rows
                    call_args = [arg_batch[ext_batch_offset:ext_batch_end] for arg_batch in arg_batches]
                    call_kwargs = {
                        k: kwarg_batch[ext_batch_offset:ext_batch_end] for k, kwarg_batch in kwarg_batches.items()
                    }
                    start_ts = time.perf_counter()
                    result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                    self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                    self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows

                    # move the result into the row batch
                    for result_idx, result in enumerate(result_batch):
                        row_idx = valid_batch_idxs[ext_batch_offset + result_idx]
                        rows[row_idx][fn_call.slot_idx] = result

                    num_remaining_batch_rows -= num_ext_batch_rows

                # switch to the ext fn batch size
                cohort.batch_size = ext_batch_size

            # make sure images for stored cols have been saved to files before moving on to the next batch
            rows.flush_imgs(
                slice(batch_start_idx, batch_start_idx + num_batch_rows), self.stored_img_cols, self.flushed_img_slots)
            if self.pbar is not None:
                self.pbar.update(num_batch_rows * len(cohort.target_slot_idxs))
            batch_start_idx += num_batch_rows
216
+ batch_start_idx += num_batch_rows
217
+
@@ -0,0 +1,73 @@
1
+ from typing import List, Dict, Any, Optional
2
+ import urllib
3
+ import logging
4
+ import os
5
+
6
+ from .data_row_batch import DataRowBatch
7
+ from .exec_node import ExecNode
8
+ import pixeltable.catalog as catalog
9
+ import pixeltable.exprs as exprs
10
+ import pixeltable.env as env
11
+ from pixeltable.utils.media_store import MediaStore
12
+
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
class InMemoryDataNode(ExecNode):
    """Outputs in-memory data as a row batch of a particular table

    All input rows are returned as a single batch by the first call to __next__().
    """
    def __init__(
            self, tbl: catalog.TableVersionPath, rows: List[Dict[str, Any]],
            row_builder: exprs.RowBuilder, start_row_id: int,
    ):
        """
        Args:
            tbl: the (insertable) table the rows belong to
            rows: one dict per row, mapping column names to values
            row_builder: supplies the output slot of each column
            start_row_id: row id assigned to the first row; subsequent rows are numbered consecutively
        """
        super().__init__(row_builder, [], [], None)
        assert tbl.is_insertable()
        self.tbl = tbl
        self.input_rows = rows
        self.start_row_id = start_row_id
        self.has_returned_data = False
        self.output_rows: Optional[DataRowBatch] = None

    def _open(self) -> None:
        """Create row batch and populate with self.input_rows"""
        column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
        # exclude system columns
        user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
        # stored columns that are not computed
        inserted_col_ids = {
            info.col.id for info in self.row_builder.output_slot_idxs()
            if info.col.is_stored and not info.col.is_computed
        }

        self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
        for row_idx, input_row in enumerate(self.input_rows):
            # populate the output row with the values provided in the input row
            input_col_ids: List[int] = []
            for col_name, val in input_row.items():
                col_info = user_column_info.get(col_name)
                assert col_info is not None

                if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
                    # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
                    path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.version))
                    # close the file deterministically instead of leaving the handle to the garbage collector
                    with open(path, 'wb') as fp:
                        fp.write(val)
                    val = path
                self.output_rows[row_idx][col_info.slot_idx] = val
                input_col_ids.append(col_info.col.id)

            # set the remaining stored non-computed columns to null
            null_col_ids = inserted_col_ids - set(input_col_ids)
            for col_id in null_col_ids:
                col_info = column_info.get(col_id)
                assert col_info is not None
                self.output_rows[row_idx][col_info.slot_idx] = None

        self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
        self.ctx.num_rows = len(self.output_rows)

    def __next__(self) -> DataRowBatch:
        """Returns the populated batch on the first call and raises StopIteration afterwards."""
        if self.has_returned_data:
            raise StopIteration
        self.has_returned_data = True
        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
        return self.output_rows
73
+
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+ from typing import Iterable, Optional
3
+
4
+ from .data_row_batch import DataRowBatch
5
+ from .exec_node import ExecNode
6
+ import pixeltable.exprs as exprs
7
+ import pixeltable.exceptions as excs
8
+
9
+
10
class MediaValidationNode(ExecNode):
    """Validation of selected media slots
    Records exceptions in the rows of the input batch
    """
    def __init__(
            self, row_builder: exprs.RowBuilder, media_slots: Iterable[exprs.ColumnSlotIdx],
            input: Optional[ExecNode]):
        """
        Args:
            row_builder: used to record validation exceptions in rows
            media_slots: the (column, slot) pairs to validate; every column must be of a media type
            input: the node supplying the rows to validate
        """
        # the base class stores row_builder and input
        super().__init__(row_builder, [], [], input)
        # media_slots is traversed here and again for every batch: materialize it so that a
        # one-shot iterable isn't exhausted by the assertions below
        self.media_slots = list(media_slots)
        for slot in self.media_slots:
            assert slot.col.col_type.is_media_type()

    def __next__(self) -> DataRowBatch:
        """Validates the media files of the next input batch and returns the batch.

        Raises:
            excs.Error: on the first validation failure, unless ctx.ignore_errors is set (the
                exception is recorded in the row in either case)
        """
        assert self.input is not None
        row_batch = next(self.input)
        for row in row_batch:
            for slot in self.media_slots:
                slot_idx, col = slot.slot_idx, slot.col
                if row.has_exc(slot_idx):
                    continue
                assert row.has_val[slot_idx]
                path = row.file_paths[slot_idx]
                if path is None:
                    continue

                try:
                    col.col_type.validate_media(path)
                except excs.Error as exc:
                    self.row_builder.set_exc(row, slot_idx, exc)
                    if not self.ctx.ignore_errors:
                        raise

        return row_batch