pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (120)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import enum
4
- from typing import Optional, List, Any, Dict, Tuple
4
+ from typing import Any, Optional
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
8
  import pixeltable.type_system as ts
9
+ from pixeltable import catalog
9
10
  from .column_ref import ColumnRef
10
11
  from .data_row import DataRow
11
12
  from .expr import Expr
@@ -33,22 +34,36 @@ class ColumnPropertyRef(Expr):
33
34
  def default_column_name(self) -> Optional[str]:
34
35
  return str(self).replace('.', '_')
35
36
 
36
- def _equals(self, other: ColumnRef) -> bool:
37
+ def _equals(self, other: ColumnPropertyRef) -> bool:
37
38
  return self.prop == other.prop
38
39
 
39
- def _id_attrs(self) -> List[Tuple[str, Any]]:
40
+ def _id_attrs(self) -> list[tuple[str, Any]]:
40
41
  return super()._id_attrs() + [('prop', self.prop.value)]
41
42
 
42
43
  @property
43
44
  def _col_ref(self) -> ColumnRef:
44
- return self.components[0]
45
+ col_ref = self.components[0]
46
+ assert isinstance(col_ref, ColumnRef)
47
+ return col_ref
45
48
 
46
49
  def __str__(self) -> str:
47
50
  return f'{self._col_ref}.{self.prop.name.lower()}'
48
51
 
52
+ def is_error_prop(self) -> bool:
53
+ return self.prop == self.Property.ERRORTYPE or self.prop == self.Property.ERRORMSG
54
+
49
55
  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
50
56
  if not self._col_ref.col.is_stored:
51
57
  return None
58
+
59
+ # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
60
+ if (
61
+ self._col_ref.col.col_type.is_media_type()
62
+ and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
63
+ and self.is_error_prop()
64
+ ):
65
+ return None
66
+
52
67
  if self.prop == self.Property.ERRORTYPE:
53
68
  assert self._col_ref.col.sa_errortype_col is not None
54
69
  return self._col_ref.col.sa_errortype_col
@@ -61,18 +76,30 @@ class ColumnPropertyRef(Expr):
61
76
  return None
62
77
 
63
78
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
64
- assert self.prop == self.Property.FILEURL or self.prop == self.Property.LOCALPATH
65
- assert data_row.has_val[self._col_ref.slot_idx]
66
79
  if self.prop == self.Property.FILEURL:
80
+ assert data_row.has_val[self._col_ref.slot_idx]
67
81
  data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
68
- if self.prop == self.Property.LOCALPATH:
82
+ return
83
+ elif self.prop == self.Property.LOCALPATH:
84
+ assert data_row.has_val[self._col_ref.slot_idx]
69
85
  data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
70
-
71
- def _as_dict(self) -> Dict:
86
+ return
87
+ elif self.is_error_prop():
88
+ exc = data_row.get_exc(self._col_ref.slot_idx)
89
+ if exc is None:
90
+ data_row[self.slot_idx] = None
91
+ elif self.prop == self.Property.ERRORTYPE:
92
+ data_row[self.slot_idx] = type(exc).__name__
93
+ else:
94
+ data_row[self.slot_idx] = str(exc)
95
+ else:
96
+ assert False
97
+
98
+ def _as_dict(self) -> dict:
72
99
  return {'prop': self.prop.value, **super()._as_dict()}
73
100
 
74
101
  @classmethod
75
- def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
102
+ def _from_dict(cls, d: dict, components: list[Expr]) -> ColumnPropertyRef:
76
103
  assert 'prop' in d
77
104
  assert isinstance(components[0], ColumnRef)
78
105
  return cls(components[0], cls.Property(d['prop']))
@@ -1,16 +1,18 @@
1
1
  from __future__ import annotations
2
- from typing import Optional, Any, Tuple
2
+
3
+ from typing import Any, Optional, Sequence
3
4
  from uuid import UUID
4
5
 
5
6
  import sqlalchemy as sql
6
7
 
7
- from .expr import Expr
8
+ import pixeltable.catalog as catalog
9
+ import pixeltable.exceptions as excs
10
+ import pixeltable.iterators as iters
11
+
8
12
  from .data_row import DataRow
13
+ from .expr import Expr
9
14
  from .row_builder import RowBuilder
10
15
  from .sql_element_cache import SqlElementCache
11
- import pixeltable.iterators as iters
12
- import pixeltable.exceptions as excs
13
- import pixeltable.catalog as catalog
14
16
 
15
17
 
16
18
  class ColumnRef(Expr):
@@ -19,18 +21,31 @@ class ColumnRef(Expr):
19
21
  When this reference is created in the context of a view, it can also refer to a column of the view base.
20
22
  For that reason, a ColumnRef needs to be serialized with the qualifying table id (column ids are only
21
23
  unique in the context of a particular table).
24
+
25
+ Media validation:
26
+ - media validation is potentially cpu-intensive, and it's desirable to schedule and parallelize it during
27
+ general expr evaluation
28
+ - media validation on read is done in ColumnRef.eval()
29
+ - a validating ColumnRef cannot be translated to SQL (because the validation is done in Python)
30
+ - in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
31
+ - the non-validating ColumnRef is used for SQL translation
32
+
33
+ TODO:
34
+ separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
35
+ insert them into the EvalCtxs as needed
22
36
  """
23
37
 
24
38
  col: catalog.Column
25
39
  is_unstored_iter_col: bool
26
40
  iter_arg_ctx: Optional[RowBuilder.EvalCtx]
27
41
  base_rowid_len: int
28
- base_rowid: list[Optional[Any]]
42
+ base_rowid: Sequence[Optional[Any]]
29
43
  iterator: Optional[iters.ComponentIterator]
30
44
  pos_idx: Optional[int]
31
45
  id: int
46
+ perform_validation: bool # if True, performs media validation
32
47
 
33
- def __init__(self, col: catalog.Column):
48
+ def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
34
49
  super().__init__(col.col_type)
35
50
  assert col.tbl is not None
36
51
  self.col = col
@@ -43,17 +58,44 @@ class ColumnRef(Expr):
43
58
  self.iterator = None
44
59
  # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
45
60
  self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
61
+
62
+ self.perform_validation = False
63
+ if col.col_type.is_media_type():
64
+ # we perform media validation if the column is a media type and the validation is set to ON_READ,
65
+ # unless we're told not to
66
+ if perform_validation is not None:
67
+ self.perform_validation = perform_validation
68
+ else:
69
+ self.perform_validation = (
70
+ col.col_type.is_media_type() and col.media_validation == catalog.MediaValidation.ON_READ
71
+ )
72
+ else:
73
+ assert perform_validation is None or not perform_validation
74
+ if self.perform_validation:
75
+ non_validating_col_ref = ColumnRef(col, perform_validation=False)
76
+ self.components = [non_validating_col_ref]
46
77
  self.id = self._create_id()
47
78
 
48
79
  def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
49
80
  self.iter_arg_ctx = iter_arg_ctx
50
81
  assert len(self.iter_arg_ctx.target_slot_idxs) == 1 # a single inline dict
51
82
 
52
- def _id_attrs(self) -> list[Tuple[str, Any]]:
53
- return super()._id_attrs() + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id)]
83
+ def _id_attrs(self) -> list[tuple[str, Any]]:
84
+ return (
85
+ super()._id_attrs()
86
+ + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id), ('perform_validation', self.perform_validation)]
87
+ )
88
+
89
+ # override
90
+ def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> ColumnRef:
91
+ target = tbl_versions[self.col.tbl.id]
92
+ assert self.col.id in target.cols_by_id
93
+ col = target.cols_by_id[self.col.id]
94
+ return ColumnRef(col)
54
95
 
55
96
  def __getattr__(self, name: str) -> Expr:
56
97
  from .column_property_ref import ColumnPropertyRef
98
+
57
99
  # resolve column properties
58
100
  if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
59
101
  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
@@ -82,7 +124,7 @@ class ColumnRef(Expr):
82
124
  return str(self)
83
125
 
84
126
  def _equals(self, other: ColumnRef) -> bool:
85
- return self.col == other.col
127
+ return self.col == other.col and self.perform_validation == other.perform_validation
86
128
 
87
129
  def __str__(self) -> str:
88
130
  if self.col.name is None:
@@ -94,9 +136,38 @@ class ColumnRef(Expr):
94
136
  return f'ColumnRef({self.col!r})'
95
137
 
96
138
  def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
97
- return self.col.sa_col
139
+ return None if self.perform_validation else self.col.sa_col
98
140
 
99
141
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
142
+ if self.perform_validation:
143
+ # validate media file of our input ColumnRef and if successful, replicate the state of that slot
144
+ # to our slot
145
+ unvalidated_slot_idx = self.components[0].slot_idx
146
+ if data_row.file_paths[unvalidated_slot_idx] is None:
147
+ # no media file to validate, we still need to replicate the value
148
+ assert data_row.file_urls[unvalidated_slot_idx] is None
149
+ val = data_row.vals[unvalidated_slot_idx]
150
+ data_row.vals[self.slot_idx] = val
151
+ data_row.has_val[self.slot_idx] = True
152
+ return
153
+
154
+ try:
155
+ self.col.col_type.validate_media(data_row.file_paths[unvalidated_slot_idx])
156
+ # access the value only after successful validation
157
+ val = data_row[unvalidated_slot_idx]
158
+ data_row.vals[self.slot_idx] = val
159
+ data_row.has_val[self.slot_idx] = True
160
+ # make sure that the validated slot points to the same file as the unvalidated slot
161
+ data_row.file_paths[self.slot_idx] = data_row.file_paths[unvalidated_slot_idx]
162
+ data_row.file_urls[self.slot_idx] = data_row.file_urls[unvalidated_slot_idx]
163
+ return
164
+ except excs.Error as exc:
165
+ # propagate the exception, but ignore it otherwise;
166
+ # media validation errors don't cause exceptions during query execution
167
+ # TODO: allow for different error-handling behavior
168
+ row_builder.set_exc(data_row, self.slot_idx, exc)
169
+ return
170
+
100
171
  if not self.is_unstored_iter_col:
101
172
  # supply default
102
173
  data_row[self.slot_idx] = None
@@ -115,7 +186,14 @@ class ColumnRef(Expr):
115
186
  def _as_dict(self) -> dict:
116
187
  tbl = self.col.tbl
117
188
  version = tbl.version if tbl.is_snapshot else None
118
- return {'tbl_id': str(tbl.id), 'tbl_version': version, 'col_id': self.col.id}
189
+ # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
190
+ # non-validating component ColumnRef
191
+ return {
192
+ 'tbl_id': str(tbl.id),
193
+ 'tbl_version': version,
194
+ 'col_id': self.col.id,
195
+ 'perform_validation': self.perform_validation
196
+ }
119
197
 
120
198
  @classmethod
121
199
  def get_column(cls, d: dict) -> catalog.Column:
@@ -126,6 +204,7 @@ class ColumnRef(Expr):
126
204
  return col
127
205
 
128
206
  @classmethod
129
- def _from_dict(cls, d: dict, _: list[Expr]) -> Expr:
207
+ def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
130
208
  col = cls.get_column(d)
131
- return cls(col)
209
+ perform_validation = d['perform_validation']
210
+ return cls(col, perform_validation=perform_validation)
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Optional, List, Any, Dict
3
+ from typing import Any, Optional
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
7
7
  import pixeltable.exceptions as excs
8
8
  import pixeltable.index as index
9
9
  import pixeltable.type_system as ts
10
+
10
11
  from .column_ref import ColumnRef
11
12
  from .data_row import DataRow
12
13
  from .expr import Expr
@@ -65,7 +66,7 @@ class Comparison(Expr):
65
66
  def _op2(self) -> Expr:
66
67
  return self.components[1]
67
68
 
68
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ClauseElement]:
69
+ def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
69
70
  left = sql_elements.get(self._op1)
70
71
  if self.is_search_arg_comparison:
71
72
  # reference the index value column if there is an index and this is not a snapshot
@@ -113,11 +114,10 @@ class Comparison(Expr):
113
114
  elif self.operator == ComparisonOperator.GE:
114
115
  data_row[self.slot_idx] = left >= right
115
116
 
116
- def _as_dict(self) -> Dict:
117
+ def _as_dict(self) -> dict:
117
118
  return {'operator': self.operator.value, **super()._as_dict()}
118
119
 
119
120
  @classmethod
120
- def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
121
+ def _from_dict(cls, d: dict, components: list[Expr]) -> Comparison:
121
122
  assert 'operator' in d
122
123
  return cls(ComparisonOperator(d['operator']), components[0], components[1])
123
-
@@ -1,20 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import operator
4
- from typing import Optional, List, Any, Dict, Callable
4
+ from typing import Any, Callable, Optional
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
+ import pixeltable.type_system as ts
9
+
8
10
  from .data_row import DataRow
9
11
  from .expr import Expr
10
12
  from .globals import LogicalOperator
11
13
  from .row_builder import RowBuilder
12
14
  from .sql_element_cache import SqlElementCache
13
- import pixeltable.type_system as ts
14
15
 
15
16
 
16
17
  class CompoundPredicate(Expr):
17
- def __init__(self, operator: LogicalOperator, operands: List[Expr]):
18
+ def __init__(self, operator: LogicalOperator, operands: list[Expr]):
18
19
  super().__init__(ts.BoolType())
19
20
  self.operator = operator
20
21
  # operands are stored in self.components
@@ -23,7 +24,7 @@ class CompoundPredicate(Expr):
23
24
  self.components = operands
24
25
  else:
25
26
  assert len(operands) > 1
26
- self.operands: List[Expr] = []
27
+ self.operands: list[Expr] = []
27
28
  for operand in operands:
28
29
  self._merge_operand(operand)
29
30
 
@@ -35,7 +36,7 @@ class CompoundPredicate(Expr):
35
36
  return f' {self.operator} '.join([f'({e})' for e in self.components])
36
37
 
37
38
  @classmethod
38
- def make_conjunction(cls, operands: List[Expr]) -> Optional[Expr]:
39
+ def make_conjunction(cls, operands: list[Expr]) -> Optional[Expr]:
39
40
  if len(operands) == 0:
40
41
  return None
41
42
  if len(operands) == 1:
@@ -89,11 +90,11 @@ class CompoundPredicate(Expr):
89
90
  val = op_function(val, data_row[op.slot_idx])
90
91
  data_row[self.slot_idx] = val
91
92
 
92
- def _as_dict(self) -> Dict:
93
+ def _as_dict(self) -> dict:
93
94
  return {'operator': self.operator.value, **super()._as_dict()}
94
95
 
95
96
  @classmethod
96
- def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
97
+ def _from_dict(cls, d: dict, components: list[Expr]) -> CompoundPredicate:
97
98
  assert 'operator' in d
98
99
  return cls(LogicalOperator(d['operator']), components)
99
100
 
@@ -4,13 +4,13 @@ import datetime
4
4
  import io
5
5
  import urllib.parse
6
6
  import urllib.request
7
- from typing import Optional, List, Any, Tuple
7
+ from typing import Any, Optional
8
8
 
9
- import sqlalchemy as sql
10
- import pgvector.sqlalchemy
9
+ import numpy as np
10
+ import pgvector.sqlalchemy # type: ignore[import-untyped]
11
11
  import PIL
12
12
  import PIL.Image
13
- import numpy as np
13
+ import sqlalchemy as sql
14
14
 
15
15
  from pixeltable import env
16
16
 
@@ -33,29 +33,40 @@ class DataRow:
33
33
  - ImageType: PIL.Image.Image
34
34
  - VideoType: local path if available, otherwise url
35
35
  """
36
- def __init__(self, size: int, img_slot_idxs: List[int], media_slot_idxs: List[int], array_slot_idxs: List[int]):
37
- self.vals: List[Any] = [None] * size # either cell values or exceptions
38
- self.has_val = [False] * size
39
- self.excs: List[Optional[Exception]] = [None] * size
40
36
 
41
- # control structures that are shared across all DataRows in a batch
42
- self.img_slot_idxs = img_slot_idxs
43
- self.media_slot_idxs = media_slot_idxs # all media types aside from image
44
- self.array_slot_idxs = array_slot_idxs
37
+ vals: list[Any]
38
+ has_val: list[bool]
39
+ excs: list[Optional[Exception]]
40
+
41
+ # control structures that are shared across all DataRows in a batch
42
+ img_slot_idxs: list[int]
43
+ media_slot_idxs: list[int]
44
+ array_slot_idxs: list[int]
45
45
 
46
- # the primary key of a store row is a sequence of ints (the number is different for table vs view)
47
- self.pk: Optional[Tuple[int, ...]] = None
46
+ # the primary key of a store row is a sequence of ints (the number is different for table vs view)
47
+ pk: Optional[tuple[int, ...]]
48
48
 
49
- # file_urls:
50
- # - stored url of file for media in vals[i]
51
- # - None if vals[i] is not media type
52
- # - not None if file_paths[i] is not None
53
- self.file_urls: List[Optional[str]] = [None] * size
49
+ # file_urls:
50
+ # - stored url of file for media in vals[i]
51
+ # - None if vals[i] is not media type
52
+ # - not None if file_paths[i] is not None
53
+ file_urls: list[Optional[str]]
54
54
 
55
- # file_paths:
56
- # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
57
- # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
58
- self.file_paths: List[Optional[str]] = [None] * size
55
+ # file_paths:
56
+ # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
57
+ # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
58
+ file_paths: list[Optional[str]]
59
+
60
+ def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
61
+ self.vals = [None] * size
62
+ self.has_val = [False] * size
63
+ self.excs = [None] * size
64
+ self.img_slot_idxs = img_slot_idxs
65
+ self.media_slot_idxs = media_slot_idxs
66
+ self.array_slot_idxs = array_slot_idxs
67
+ self.pk = None
68
+ self.file_urls = [None] * size
69
+ self.file_paths = [None] * size
59
70
 
60
71
  def clear(self) -> None:
61
72
  size = len(self.vals)
@@ -78,27 +89,35 @@ class DataRow:
78
89
  target.file_urls = self.file_urls.copy()
79
90
  target.file_paths = self.file_paths.copy()
80
91
 
81
- def set_pk(self, pk: Tuple[int, ...]) -> None:
92
+ def set_pk(self, pk: tuple[int, ...]) -> None:
82
93
  self.pk = pk
83
94
 
84
- def has_exc(self, slot_idx: int) -> bool:
85
- return self.excs[slot_idx] is not None
95
+ def has_exc(self, slot_idx: Optional[int] = None) -> bool:
96
+ """
97
+ Returns True if an exception has been set for the given slot index, or for any slot index if slot_idx is None
98
+ """
99
+ if slot_idx is not None:
100
+ return self.excs[slot_idx] is not None
101
+ return any(exc is not None for exc in self.excs)
86
102
 
87
- def get_exc(self, slot_idx: int) -> Exception:
88
- assert self.has_val[slot_idx] is False
89
- assert self.excs[slot_idx] is not None
103
+ def get_exc(self, slot_idx: int) -> Optional[Exception]:
90
104
  return self.excs[slot_idx]
91
105
 
106
+ def get_first_exc(self) -> Optional[Exception]:
107
+ for exc in self.excs:
108
+ if exc is not None:
109
+ return exc
110
+ return None
111
+
92
112
  def set_exc(self, slot_idx: int, exc: Exception) -> None:
93
113
  assert self.excs[slot_idx] is None
94
114
  self.excs[slot_idx] = exc
95
115
 
96
- if self.has_val[slot_idx]:
97
- # eg. during validation, where contents of file is found invalid
98
- self.has_val[slot_idx] = False
99
- self.vals[slot_idx] = None
100
- self.file_paths[slot_idx] = None
101
- self.file_urls[slot_idx] = None
116
+ # an exception means the value is None
117
+ self.has_val[slot_idx] = True
118
+ self.vals[slot_idx] = None
119
+ self.file_paths[slot_idx] = None
120
+ self.file_urls[slot_idx] = None
102
121
 
103
122
  def __len__(self) -> int:
104
123
  return len(self.vals)
@@ -113,6 +132,7 @@ class DataRow:
113
132
 
114
133
  if self.file_urls[index] is not None and index in self.img_slot_idxs:
115
134
  # if we need to load this from a file, it should have been materialized locally
135
+ # TODO this fails if the url was instantiated dynamically using astype()
116
136
  assert self.file_paths[index] is not None
117
137
  if self.vals[index] is None:
118
138
  self.vals[index] = PIL.Image.open(self.file_paths[index])
@@ -220,7 +240,7 @@ class DataRow:
220
240
  self.vals[index] = None
221
241
 
222
242
  @property
223
- def rowid(self) -> Tuple[int]:
243
+ def rowid(self) -> tuple[int, ...]:
224
244
  return self.pk[:-1]
225
245
 
226
246
  @property