pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,10 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import datetime
4
5
  import io
5
6
  import urllib.parse
6
7
  import urllib.request
7
- from typing import Any, Optional
8
+ from pathlib import Path
9
+ from typing import Any
8
10
 
9
11
  import numpy as np
10
12
  import pgvector.sqlalchemy # type: ignore[import-untyped]
@@ -12,14 +14,81 @@ import PIL
12
14
  import PIL.Image
13
15
  import sqlalchemy as sql
14
16
 
15
- from pixeltable import env
17
+ import pixeltable.utils.image as image_utils
18
+ from pixeltable import catalog, env
19
+ from pixeltable.utils.local_store import TempStore
20
+ from pixeltable.utils.misc import non_none_dict_factory
21
+
22
+
23
+ @dataclasses.dataclass
24
+ class ArrayMd:
25
+ """
26
+ Metadata for array cells that are stored externally.
27
+ """
28
+
29
+ start: int
30
+ end: int
31
+
32
+ # we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
33
+ is_bool: bool = False
34
+ shape: tuple[int, ...] | None = None
35
+
36
+ def as_dict(self) -> dict:
37
+ # dict_factory: suppress Nones
38
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
39
+ return x
40
+
41
+
42
+ @dataclasses.dataclass
43
+ class BinaryMd:
44
+ """
45
+ Metadata for binary cells that are stored externally.
46
+ """
47
+
48
+ start: int
49
+ end: int
50
+
51
+
52
+ @dataclasses.dataclass
53
+ class CellMd:
54
+ """
55
+ Content of the cellmd column.
56
+
57
+ All fields are optional, to minimize storage.
58
+ """
59
+
60
+ errortype: str | None = None
61
+ errormsg: str | None = None
62
+
63
+ # a list of file urls that are used to store images and arrays; only set for json and array columns
64
+ # for json columns: a list of all urls referenced in the column value
65
+ # for array columns: a single url
66
+ file_urls: list[str] | None = None
67
+
68
+ array_md: ArrayMd | None = None
69
+ binary_md: BinaryMd | None = None
70
+
71
+ @classmethod
72
+ def from_dict(cls, d: dict) -> CellMd:
73
+ d = d.copy()
74
+ if 'array_md' in d:
75
+ d['array_md'] = ArrayMd(**d['array_md'])
76
+ if 'binary_md' in d:
77
+ d['binary_md'] = BinaryMd(**d['binary_md'])
78
+ return cls(**d)
79
+
80
+ def as_dict(self) -> dict:
81
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
82
+ return x
16
83
 
17
84
 
18
85
  class DataRow:
19
86
  """
20
87
  Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
21
88
  - state for in-memory computation
22
- - state for storing the data
89
+ - state needed for expression evaluation
90
+ - containers for output column values
91
+
23
92
  This is not meant to be a black-box abstraction.
24
93
 
25
94
  In-memory representations by column type:
@@ -28,54 +97,113 @@ class DataRow:
28
97
  - FloatType: float
29
98
  - BoolType: bool
30
99
  - TimestampType: datetime.datetime
100
+ - DateType: datetime.date
101
+ - UUIDType: uuid.UUID
102
+ - BinaryType: bytes
31
103
  - JsonType: json-serializable object
32
104
  - ArrayType: numpy.ndarray
33
105
  - ImageType: PIL.Image.Image
34
106
  - VideoType: local path if available, otherwise url
107
+ - AudioType: local path if available, otherwise url
108
+ - DocumentType: local path if available, otherwise url
35
109
  """
36
110
 
37
- vals: list[Any]
38
- has_val: list[bool]
39
- excs: list[Optional[Exception]]
40
-
41
- # control structures that are shared across all DataRows in a batch
42
- img_slot_idxs: list[int]
43
- media_slot_idxs: list[int]
44
- array_slot_idxs: list[int]
111
+ # expr evaluation state; indexed by slot idx
112
+ vals: np.ndarray # of object
113
+ has_val: np.ndarray # of bool
114
+ excs: np.ndarray # of object
115
+ missing_slots: np.ndarray # of bool; number of missing dependencies
116
+ missing_dependents: np.ndarray # of int16; number of missing dependents
117
+ is_scheduled: np.ndarray # of bool; True if this slot is scheduled for evaluation
45
118
 
46
- # the primary key of a store row is a sequence of ints (the number is different for table vs view)
47
- pk: Optional[tuple[int, ...]]
119
+ # CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
120
+ slot_md: dict[int, CellMd]
48
121
 
49
122
  # file_urls:
50
123
  # - stored url of file for media in vals[i]
51
124
  # - None if vals[i] is not media type
52
125
  # - not None if file_paths[i] is not None
53
- file_urls: list[Optional[str]]
126
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
127
+ file_urls: np.ndarray # of str
54
128
 
55
129
  # file_paths:
56
130
  # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
57
131
  # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
58
- file_paths: list[Optional[str]]
132
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
133
+ file_paths: np.ndarray # of str
134
+
135
+ # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
136
+ # exception handling under normal operation.
137
+ _may_have_exc: bool
138
+
139
+ # the primary key of a store row is a sequence of ints (the number is different for table vs view)
140
+ pk: tuple[int, ...] | None
141
+ # for nested rows (ie, those produced by JsonMapperDispatcher)
142
+ parent_row: DataRow | None
143
+ parent_slot_idx: int | None
59
144
 
60
- def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
61
- self.vals = [None] * size
62
- self.has_val = [False] * size
63
- self.excs = [None] * size
145
+ # state for table output (insert()/update()); key: column id
146
+ cell_vals: dict[int, Any] # materialized values of output columns, in the format required for the column
147
+ cell_md: dict[int, CellMd]
148
+
149
+ # control structures that are shared across all DataRows in a batch
150
+ img_slot_idxs: list[int]
151
+ media_slot_idxs: list[int]
152
+ array_slot_idxs: list[int]
153
+ json_slot_idxs: list[int]
154
+
155
+ def __init__(
156
+ self,
157
+ size: int,
158
+ img_slot_idxs: list[int],
159
+ media_slot_idxs: list[int],
160
+ array_slot_idxs: list[int],
161
+ json_slot_idxs: list[int],
162
+ parent_row: DataRow | None = None,
163
+ parent_slot_idx: int | None = None,
164
+ ):
165
+ self.init(size)
166
+ self.parent_row = parent_row
167
+ self.parent_slot_idx = parent_slot_idx
64
168
  self.img_slot_idxs = img_slot_idxs
65
169
  self.media_slot_idxs = media_slot_idxs
66
170
  self.array_slot_idxs = array_slot_idxs
171
+ self.json_slot_idxs = json_slot_idxs
172
+
173
+ def init(self, size: int) -> None:
174
+ self.vals = np.full(size, None, dtype=object)
175
+ self.has_val = np.zeros(size, dtype=bool)
176
+ self.excs = np.full(size, None, dtype=object)
177
+ self.missing_slots = np.zeros(size, dtype=bool)
178
+ self.missing_dependents = np.zeros(size, dtype=np.int16)
179
+ self.is_scheduled = np.zeros(size, dtype=bool)
180
+ self.slot_md = {}
181
+ self.file_urls = np.full(size, None, dtype=object)
182
+ self.file_paths = np.full(size, None, dtype=object)
183
+ self._may_have_exc = False
184
+ self.cell_vals = {}
185
+ self.cell_md = {}
67
186
  self.pk = None
68
- self.file_urls = [None] * size
69
- self.file_paths = [None] * size
70
-
71
- def clear(self) -> None:
72
- size = len(self.vals)
73
- self.vals = [None] * size
74
- self.has_val = [False] * size
75
- self.excs = [None] * size
76
- self.pk = None
77
- self.file_urls = [None] * size
78
- self.file_paths = [None] * size
187
+ self.parent_row = None
188
+ self.parent_slot_idx = None
189
+
190
+ def clear(self, slot_idxs: np.ndarray | None = None) -> None:
191
+ if slot_idxs is not None:
192
+ self.has_val[slot_idxs] = False
193
+ self.vals[slot_idxs] = None
194
+ self.excs[slot_idxs] = None
195
+ self.file_urls[slot_idxs] = None
196
+ self.file_paths[slot_idxs] = None
197
+ else:
198
+ self.init(len(self.vals))
199
+
200
+ def set_file_path(self, idx: int, path: str) -> None:
201
+ """Augment an existing url with a local file path"""
202
+ assert self.has_val[idx]
203
+ assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
204
+ self.file_paths[idx] = path
205
+ if idx in self.media_slot_idxs:
206
+ self.vals[idx] = path
79
207
 
80
208
  def copy(self, target: DataRow) -> None:
81
209
  """Create a copy of the contents of this DataRow in target
@@ -92,26 +220,32 @@ class DataRow:
92
220
  def set_pk(self, pk: tuple[int, ...]) -> None:
93
221
  self.pk = pk
94
222
 
95
- def has_exc(self, slot_idx: Optional[int] = None) -> bool:
223
+ def has_exc(self, slot_idx: int | None = None) -> bool:
96
224
  """
97
225
  Returns True if an exception has been set for the given slot index, or for any slot index if slot_idx is None
98
226
  """
227
+ if not self._may_have_exc:
228
+ return False
229
+
99
230
  if slot_idx is not None:
100
231
  return self.excs[slot_idx] is not None
101
- return any(exc is not None for exc in self.excs)
232
+ return (self.excs != None).any()
102
233
 
103
- def get_exc(self, slot_idx: int) -> Optional[Exception]:
104
- return self.excs[slot_idx]
234
+ def get_exc(self, slot_idx: int) -> Exception | None:
235
+ exc = self.excs[slot_idx]
236
+ assert exc is None or isinstance(exc, Exception)
237
+ return exc
105
238
 
106
- def get_first_exc(self) -> Optional[Exception]:
107
- for exc in self.excs:
108
- if exc is not None:
109
- return exc
110
- return None
239
+ def get_first_exc(self) -> Exception | None:
240
+ mask = self.excs != None
241
+ if not mask.any():
242
+ return None
243
+ return self.excs[mask][0]
111
244
 
112
245
  def set_exc(self, slot_idx: int, exc: Exception) -> None:
113
246
  assert self.excs[slot_idx] is None
114
247
  self.excs[slot_idx] = exc
248
+ self._may_have_exc = True
115
249
 
116
250
  # an exception means the value is None
117
251
  self.has_val[slot_idx] = True
@@ -119,16 +253,13 @@ class DataRow:
119
253
  self.file_paths[slot_idx] = None
120
254
  self.file_urls[slot_idx] = None
121
255
 
122
- def __len__(self) -> int:
123
- return len(self.vals)
124
-
125
- def __getitem__(self, index: object) -> Any:
256
+ def __getitem__(self, index: int) -> Any:
126
257
  """Returns in-memory value, ie, what is needed for expr evaluation"""
127
258
  assert isinstance(index, int)
128
259
  if not self.has_val[index]:
129
- # for debugging purposes
130
- pass
131
- assert self.has_val[index], index
260
+ # This is a sufficiently cheap and sensitive validation that it makes sense to keep the assertion around
261
+ # even if python is running with -O.
262
+ raise AssertionError(index)
132
263
 
133
264
  if self.file_urls[index] is not None and index in self.img_slot_idxs:
134
265
  # if we need to load this from a file, it should have been materialized locally
@@ -140,7 +271,7 @@ class DataRow:
140
271
 
141
272
  return self.vals[index]
142
273
 
143
- def get_stored_val(self, index: int, sa_col_type: Optional[sql.types.TypeEngine] = None) -> Any:
274
+ def get_stored_val(self, index: int, sa_col_type: sql.types.TypeEngine | None = None) -> Any:
144
275
  """Return the value that gets stored in the db"""
145
276
  assert self.excs[index] is None
146
277
  if not self.has_val[index]:
@@ -171,7 +302,7 @@ class DataRow:
171
302
 
172
303
  return self.vals[index]
173
304
 
174
- def __setitem__(self, idx: object, val: Any) -> None:
305
+ def __setitem__(self, idx: int, val: Any) -> None:
175
306
  """Assign in-memory cell value
176
307
  This allows overwriting
177
308
  """
@@ -188,9 +319,10 @@ class DataRow:
188
319
  # local file path
189
320
  assert self.file_urls[idx] is None and self.file_paths[idx] is None
190
321
  if len(parsed.scheme) <= 1:
191
- self.file_urls[idx] = urllib.parse.urljoin('file:', urllib.request.pathname2url(val))
192
- self.file_paths[idx] = val
193
- else:
322
+ path = str(Path(val).absolute()) # Ensure we're using an absolute pathname.
323
+ self.file_urls[idx] = urllib.parse.urljoin('file:', urllib.request.pathname2url(path))
324
+ self.file_paths[idx] = path
325
+ else: # file:// URL
194
326
  self.file_urls[idx] = val
195
327
  # Wrap the path in a url2pathname() call to ensure proper handling on Windows.
196
328
  self.file_paths[idx] = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
@@ -207,37 +339,46 @@ class DataRow:
207
339
  self.vals[idx] = val
208
340
  self.has_val[idx] = True
209
341
 
210
- def set_file_path(self, idx: int, path: str) -> None:
211
- """Augment an existing url with a local file path"""
212
- assert self.has_val[idx]
213
- assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
214
- self.file_paths[idx] = path
215
- if idx in self.media_slot_idxs:
216
- self.vals[idx] = path
342
+ def prepare_col_val_for_save(self, index: int, col: catalog.Column | None = None) -> bool:
343
+ """
344
+ Prepare to save a column's value into the appropriate store. Discard unneeded values.
217
345
 
218
- def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
219
- """Discard the in-memory value and save it to a local file, if filepath is not None"""
346
+ Return:
347
+ True if the media object in the column needs to be saved.
348
+ """
220
349
  if self.vals[index] is None:
221
- return
350
+ return False
351
+
352
+ if self.file_urls[index] is not None:
353
+ return False
354
+
222
355
  assert self.excs[index] is None
223
356
  if self.file_paths[index] is None:
224
- if filepath is not None:
225
- # we want to save this to a file
226
- self.file_paths[index] = filepath
227
- self.file_urls[index] = urllib.parse.urljoin('file:', urllib.request.pathname2url(filepath))
228
- image = self.vals[index]
229
- assert isinstance(image, PIL.Image.Image)
230
- # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
231
- # In that case, use WebP instead.
232
- format = 'webp' if image.has_transparency_data else 'jpeg'
233
- image.save(filepath, format=format)
357
+ if col is not None:
358
+ # This is a media object that needs to be saved
359
+ return True
234
360
  else:
235
- # we discard the content of this cell
361
+ # This is a media object that we don't care about, so we discard it
236
362
  self.has_val[index] = False
237
363
  else:
238
364
  # we already have a file for this image, nothing left to do
239
365
  pass
366
+
367
+ self.vals[index] = None
368
+ return False
369
+
370
+ def save_media_to_temp(self, index: int, col: catalog.Column) -> str:
371
+ """Save the media object in the column to the TempStore.
372
+ Objects cannot be saved directly to general destinations."""
373
+ assert col.col_type.is_media_type()
374
+ val = self.vals[index]
375
+ format = None
376
+ if isinstance(val, PIL.Image.Image):
377
+ format = image_utils.default_format(val)
378
+ filepath, url = TempStore.save_media_object(val, col, format=format)
379
+ self.file_paths[index] = str(filepath) if filepath is not None else None
240
380
  self.vals[index] = None
381
+ return url
241
382
 
242
383
  @property
243
384
  def rowid(self) -> tuple[int, ...]: