ome-arrow 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ome_arrow/__init__.py +19 -0
- ome_arrow/_version.py +34 -0
- ome_arrow/core.py +492 -0
- ome_arrow/export.py +422 -0
- ome_arrow/ingest.py +932 -0
- ome_arrow/meta.py +90 -0
- ome_arrow/transform.py +182 -0
- ome_arrow/utils.py +83 -0
- ome_arrow/view.py +286 -0
- ome_arrow-0.0.2.dist-info/METADATA +34 -0
- ome_arrow-0.0.2.dist-info/RECORD +14 -0
- ome_arrow-0.0.2.dist-info/WHEEL +5 -0
- ome_arrow-0.0.2.dist-info/licenses/LICENSE +28 -0
- ome_arrow-0.0.2.dist-info/top_level.txt +1 -0
ome_arrow/ingest.py
ADDED
@@ -0,0 +1,932 @@
"""
Converting to and from OME-Arrow formats.
"""

import itertools
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import bioio_ome_tiff
import bioio_tifffile
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from bioio import BioImage
from bioio_ome_zarr import Reader as OMEZarrReader

from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION

def to_ome_arrow(
    type_: str = OME_ARROW_TAG_TYPE,
    version: str = OME_ARROW_TAG_VERSION,
    image_id: str = "unnamed",
    name: str = "unknown",
    acquisition_datetime: Optional[datetime] = None,
    dimension_order: str = "XYZCT",
    dtype: str = "uint16",
    size_x: int = 1,
    size_y: int = 1,
    size_z: int = 1,
    size_c: int = 1,
    size_t: int = 1,
    physical_size_x: float = 1.0,
    physical_size_y: float = 1.0,
    physical_size_z: float = 1.0,
    physical_size_unit: str = "µm",
    channels: Optional[List[Dict[str, Any]]] = None,
    planes: Optional[List[Dict[str, Any]]] = None,
    masks: Any = None,
) -> pa.StructScalar:
    """
    Create a typed OME-Arrow StructScalar with sensible defaults.

    This builds and validates a nested dict that conforms to the
    OME_ARROW_STRUCT schema. You can override any field explicitly;
    others use safe defaults.

    Args:
        type_: Top-level type string ("ome.arrow" by default).
        version: Specification version string.
        image_id: Unique image identifier.
        name: Human-friendly name.
        acquisition_datetime: Datetime of acquisition (defaults to now).
        dimension_order: Dimension order ("XYZCT" or "XYCT").
        dtype: Pixel data type string (e.g., "uint16").
        size_x, size_y, size_z, size_c, size_t: Axis sizes.
        physical_size_x/y/z: Physical scaling in µm.
        physical_size_unit: Unit string, default "µm".
        channels: List of channel dicts. Autogenerates one if None.
        planes: List of plane dicts. A single zero-filled plane if None.
        masks: Optional placeholder for future annotations.

    Returns:
        pa.StructScalar: A validated StructScalar for the schema.

    Example:
        >>> s = to_ome_arrow(image_id="img001")
        >>> s.type == OME_ARROW_STRUCT
        True
    """

    type_ = str(type_)
    version = str(version)
    image_id = str(image_id)
    name = str(name)
    dimension_order = str(dimension_order)
    dtype = str(dtype)
    physical_size_unit = str(physical_size_unit)

    # Sensible defaults for channels and planes
    if channels is None:
        channels = [
            {
                "id": "ch-0",
                "name": "default",
                "emission_um": 0.0,
                "excitation_um": 0.0,
                "illumination": "Unknown",
                "color_rgba": 0xFFFFFFFF,
            }
        ]
    else:
        # Coerce channel text fields to str
        for ch in channels:
            if "id" in ch:
                ch["id"] = str(ch["id"])
            if "name" in ch:
                ch["name"] = str(ch["name"])
            if "illumination" in ch:
                ch["illumination"] = str(ch["illumination"])

    if planes is None:
        planes = [{"z": 0, "t": 0, "c": 0, "pixels": [0] * (size_x * size_y)}]

    record = {
        "type": type_,
        "version": version,
        "id": image_id,
        "name": name,
        "acquisition_datetime": acquisition_datetime or datetime.now(timezone.utc),
        "pixels_meta": {
            "dimension_order": dimension_order,
            "type": dtype,
            "size_x": size_x,
            "size_y": size_y,
            "size_z": size_z,
            "size_c": size_c,
            "size_t": size_t,
            "physical_size_x": physical_size_x,
            "physical_size_y": physical_size_y,
            "physical_size_z": physical_size_z,
            "physical_size_x_unit": physical_size_unit,
            "physical_size_y_unit": physical_size_unit,
            "physical_size_z_unit": physical_size_unit,
            "channels": channels,
        },
        "planes": planes,
        "masks": masks,
    }

    return pa.scalar(record, type=OME_ARROW_STRUCT)

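A minimal usage sketch for to_ome_arrow (not part of ingest.py; the image id, name, sizes, and pixel values are illustrative). It builds a 2x2 single-plane record and checks that the returned scalar carries the OME_ARROW_STRUCT type:

    from ome_arrow.ingest import to_ome_arrow
    from ome_arrow.meta import OME_ARROW_STRUCT

    record = to_ome_arrow(
        image_id="img001",
        name="toy image",
        size_x=2,
        size_y=2,
        planes=[{"z": 0, "t": 0, "c": 0, "pixels": [0, 1, 2, 3]}],
    )
    assert record.type == OME_ARROW_STRUCT
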
def from_numpy(
    arr: np.ndarray,
    *,
    dim_order: str = "TCZYX",
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
    # meta
    physical_size_x: float = 1.0,
    physical_size_y: float = 1.0,
    physical_size_z: float = 1.0,
    physical_size_unit: str = "µm",
    dtype_meta: Optional[str] = None,  # if None, inferred from output dtype
) -> pa.StructScalar:
    """
    Build an OME-Arrow StructScalar from a NumPy array.

    Parameters
    ----------
    arr : np.ndarray
        Image data with axes described by `dim_order`.
    dim_order : str, default "TCZYX"
        Axis labels for `arr`. Must include "Y" and "X".
        Supported examples: "YX", "ZYX", "CYX", "CZYX", "TYX", "TCYX", "TCZYX".
    image_id, name : Optional[str]
        Identifiers to embed in the record.
    channel_names : Optional[Sequence[str]]
        Names for channels; defaults to C0..C{n-1}.
    acquisition_datetime : Optional[datetime]
        Defaults to now (UTC) if None.
    clamp_to_uint16 : bool, default True
        If True, clamp/cast planes to uint16 before serialization.
    physical_size_x/y/z : float
        Spatial pixel sizes (µm), Z used if present.
    physical_size_unit : str
        Unit string for spatial axes (default "µm").
    dtype_meta : Optional[str]
        Pixel dtype string to place in metadata; if None, inferred from the
        (possibly cast) array's dtype.

    Returns
    -------
    pa.StructScalar
        Typed OME-Arrow record (schema = OME_ARROW_STRUCT).

    Notes
    -----
    - If Z is absent from `dim_order`, or the Z axis has size 1, `size_z` is 1
      and the meta dimension_order becomes "XYCT"; otherwise "XYZCT".
    - If T/C are absent in `dim_order`, they default to size 1.
    """

    if not isinstance(arr, np.ndarray):
        raise TypeError("from_numpy expects a NumPy ndarray.")

    dims = dim_order.upper()
    if "Y" not in dims or "X" not in dims:
        raise ValueError("dim_order must include 'Y' and 'X' axes.")

    # Map current axes -> indices
    axis_to_idx: Dict[str, int] = {ax: i for i, ax in enumerate(dims)}

    # Extract sizes with defaults for missing axes
    size_x = int(arr.shape[axis_to_idx["X"]])
    size_y = int(arr.shape[axis_to_idx["Y"]])
    size_z = int(arr.shape[axis_to_idx["Z"]]) if "Z" in axis_to_idx else 1
    size_c = int(arr.shape[axis_to_idx["C"]]) if "C" in axis_to_idx else 1
    size_t = int(arr.shape[axis_to_idx["T"]]) if "T" in axis_to_idx else 1

    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dimensions.")

    # Reorder to a standard (T, C, Z, Y, X) view for plane extraction
    desired_axes = ["T", "C", "Z", "Y", "X"]
    current_axes = list(dims)
    # Insert absent axes with size 1 using np.expand_dims
    view = arr
    for ax in desired_axes:
        if ax not in axis_to_idx:
            # Append a new singleton axis at the end, then we'll permute
            view = np.expand_dims(view, axis=-1)
            # Pretend this new axis now exists at the last index
            current_axes.append(ax)
            axis_to_idx = {a: i for i, a in enumerate(current_axes)}

    # Permute to TCZYX
    perm = [axis_to_idx[a] for a in desired_axes]
    tczyx = np.transpose(view, axes=perm)

    # Validate final shape
    if tuple(tczyx.shape) != (size_t, size_c, size_z, size_y, size_x):
        # This should not happen, but guard just in case
        raise ValueError(
            "Internal axis reordering mismatch: "
            f"got {tczyx.shape} vs expected {(size_t, size_c, size_z, size_y, size_x)}"
        )

    # Clamp/cast
    if clamp_to_uint16 and tczyx.dtype != np.uint16:
        tczyx = np.clip(tczyx, 0, 65535).astype(np.uint16, copy=False)

    # Channel names
    if not channel_names or len(channel_names) != size_c:
        channel_names = [f"C{i}" for i in range(size_c)]
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": channel_names[i],
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i in range(size_c)
    ]

    # Build planes: flatten YX per (t, c, z)
    planes: List[Dict[str, Any]] = []
    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                plane = tczyx[t, c, z]
                planes.append(
                    {"z": z, "t": t, "c": c, "pixels": plane.ravel().tolist()}
                )

    # Meta dimension_order: mirror the other ingest paths
    meta_dim_order = "XYCT" if size_z == 1 else "XYZCT"

    # Pixel dtype in metadata
    dtype_str = dtype_meta or np.dtype(tczyx.dtype).name

    return to_ome_arrow(
        image_id=str(image_id or "unnamed"),
        name=str(name or "unknown"),
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=meta_dim_order,
        dtype=dtype_str,
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=float(physical_size_x),
        physical_size_y=float(physical_size_y),
        physical_size_z=float(physical_size_z),
        physical_size_unit=str(physical_size_unit),
        channels=channels,
        planes=planes,
        masks=None,
    )

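A usage sketch for from_numpy (not part of ingest.py; the array contents and channel names are illustrative). Axes absent from dim_order default to size 1, so a CYX array becomes a single-timepoint, single-Z record:

    import numpy as np

    from ome_arrow.ingest import from_numpy

    data = np.random.randint(0, 4096, size=(3, 64, 64), dtype=np.uint16)  # (C, Y, X)
    record = from_numpy(
        data,
        dim_order="CYX",
        image_id="demo",
        channel_names=["DAPI", "GFP", "RFP"],
    )
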
def from_tiff(
    tiff_path: str | Path,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
) -> pa.StructScalar:
    """
    Read a TIFF and return a typed OME-Arrow StructScalar.

    Uses bioio to read TCZYX (or XY) data, flattens each YX plane, and
    delegates struct creation to `to_ome_arrow`.

    Args:
        tiff_path: Path to a TIFF readable by bioio.
        image_id: Optional stable image identifier (defaults to stem).
        name: Optional human label (defaults to file name).
        channel_names: Optional channel names; defaults to C0..C{n-1}.
        acquisition_datetime: Optional acquisition time (UTC now if None).
        clamp_to_uint16: If True, clamp/cast planes to uint16.

    Returns:
        pa.StructScalar validated against OME_ARROW_STRUCT.
    """

    p = Path(tiff_path)

    img = BioImage(
        image=str(p),
        reader=(
            bioio_ome_tiff.Reader
            if str(p).lower().endswith(("ome.tif", "ome.tiff"))
            else bioio_tifffile.Reader
        ),
    )

    arr = np.asarray(img.data)  # (T, C, Z, Y, X)
    dims = img.dims
    size_t = int(dims.T or 1)
    size_c = int(dims.C or 1)
    size_z = int(dims.Z or 1)
    size_y = int(dims.Y or arr.shape[-2])
    size_x = int(dims.X or arr.shape[-1])
    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dims.")

    pps = getattr(img, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    # Coerce top-level strings
    img_id = str(image_id or p.stem)
    display_name = str(name or p.name)

    # Ensure channel_names is a list[str]
    if not channel_names or len(channel_names) != size_c:
        channel_names = [f"C{i}" for i in range(size_c)]
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": channel_names[i],
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i in range(size_c)
    ]

    planes: List[Dict[str, Any]] = []
    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                plane = arr[t, c, z]
                if clamp_to_uint16 and plane.dtype != np.uint16:
                    plane = np.clip(plane, 0, 65535).astype(np.uint16)
                planes.append(
                    {"z": z, "t": t, "c": c, "pixels": plane.ravel().tolist()}
                )

    dim_order = "XYCT" if size_z == 1 else "XYZCT"

    return to_ome_arrow(
        image_id=img_id,
        name=display_name,
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=dim_order,
        dtype="uint16",
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels,
        planes=planes,
        masks=None,
    )

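A usage sketch for from_tiff (the file path and channel names are hypothetical). Files whose names end in ome.tif/ome.tiff are routed to bioio_ome_tiff.Reader, everything else to bioio_tifffile.Reader:

    from ome_arrow.ingest import from_tiff

    record = from_tiff("cells.ome.tiff", channel_names=["DAPI", "GFP"])
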
def from_stack_pattern_path(
    pattern_path: str | Path,
    default_dim_for_unspecified: str = "C",
    map_series_to: Optional[str] = "T",
    clamp_to_uint16: bool = True,
    channel_names: Optional[List[str]] = None,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
) -> pa.StructScalar:
    """
    Build an OME-Arrow StructScalar from a filename pattern describing a stack.

    The directory part of `pattern_path` is scanned. The filename part either
    contains `<...>` placeholders (comma lists such as `<DAPI,GFP>` or numeric
    ranges such as `<01-05>` / `<0-10:2>`), whose dimension (C/T/Z/S) is
    inferred from the token immediately before the bracket (e.g. "ch", "t",
    "z", "s"), or it is treated as a regular expression whose matches are
    stacked along Z. Placeholders with no recognizable token map to
    `default_dim_for_unspecified`, series placeholders are remapped via
    `map_series_to`, and missing planes are zero-filled.
    """
    path = Path(pattern_path)
    folder = path.parent
    line = path.name.strip()
    if not line:
        raise ValueError("Pattern path string is empty or malformed")

    DIM_TOKENS = {
        "C": {"c", "ch", "w", "wavelength"},
        "T": {"t", "tl", "tp", "timepoint"},
        "Z": {"z", "zs", "sec", "fp", "focal", "focalplane"},
        "S": {"s", "sp", "series"},
    }
    NUM_RANGE_RE = re.compile(r"^(?P<a>\d+)-(?P<b>\d+)(?::(?P<step>\d+))?$")

    def detect_dim(before_text: str) -> Optional[str]:
        m = re.search(r"([A-Za-z]+)$", before_text)
        if not m:
            return None
        token = m.group(1).lower()
        for dim, names in DIM_TOKENS.items():
            if token in names:
                return dim
        return None

    def expand_raw_token(raw: str) -> Tuple[List[str], bool]:
        raw = raw.strip()
        if "," in raw and not NUM_RANGE_RE.match(raw):
            parts = [p.strip() for p in raw.split(",")]
            return parts, all(p.isdigit() for p in parts)
        m = NUM_RANGE_RE.match(raw)
        if m:
            a, b = m.group("a"), m.group("b")
            step = int(m.group("step") or "1")
            start, stop = int(a), int(b)
            if stop < start:
                raise ValueError(f"Inverted range not supported: <{raw}>")
            width = max(len(a), len(b))
            nums = [str(v).zfill(width) for v in range(start, stop + 1, step)]
            return nums, True
        return [raw], raw.isdigit()

    def parse_bracket_pattern(s: str) -> Tuple[str, List[Dict[str, Any]]]:
        placeholders, out = [], []
        i = ph_i = 0
        while i < len(s):
            if s[i] == "<":
                j = s.find(">", i + 1)
                if j == -1:
                    raise ValueError("Unclosed '<' in pattern.")
                raw_inside = s[i + 1 : j]
                before = "".join(out)
                dim = detect_dim(before) or "?"
                choices, is_num = expand_raw_token(raw_inside)
                placeholders.append(
                    {
                        "idx": ph_i,
                        "raw": raw_inside,
                        "choices": choices,
                        "dim": dim,
                        "is_numeric": is_num,
                    }
                )
                out.append(f"{{{ph_i}}}")
                ph_i += 1
                i = j + 1
            else:
                out.append(s[i])
                i += 1
        return "".join(out), placeholders

    def regex_match(folder: Path, regex: str) -> List[Path]:
        r = re.compile(regex)
        return sorted(
            [p for p in folder.iterdir() if p.is_file() and r.fullmatch(p.name)]
        )

    matched: Dict[Tuple[int, int, int], Path] = {}
    literal_channel_names: Optional[List[str]] = None

    if "<" in line and ">" in line:
        template, placeholders = parse_bracket_pattern(line)
        for ph in placeholders:
            ph["dim"] = (ph["dim"] or "?").upper()
            if ph["dim"] == "?":
                ph["dim"] = default_dim_for_unspecified.upper()

        for combo in itertools.product(*[ph["choices"] for ph in placeholders]):
            fname = template.format(*combo)
            fpath = folder / fname
            if not fpath.exists():
                continue

            t = c = z = 0
            for ph, val in zip(placeholders, combo):
                idx = ph["choices"].index(val)
                dim = ph["dim"]
                if dim == "S":
                    if not map_series_to:
                        raise ValueError("Encountered 'series' but map_series_to=None")
                    dim = map_series_to.upper()
                if dim == "T":
                    t = idx
                elif dim == "C":
                    c = idx
                elif dim == "Z":
                    z = idx

            if literal_channel_names is None:
                for ph in placeholders:
                    dim_eff = ph["dim"] if ph["dim"] != "S" else (map_series_to or "S")
                    if dim_eff == "C" and not ph["is_numeric"]:
                        literal_channel_names = ph["choices"]
                        break

            matched[(t, c, z)] = fpath
    else:
        for z, p in enumerate(regex_match(folder, line)):
            matched[(0, 0, z)] = p

    if not matched:
        raise FileNotFoundError(f"No files matched pattern: {pattern_path}")

    size_t = max(k[0] for k in matched) + 1
    size_c = max(k[1] for k in matched) + 1
    size_z = max(k[2] for k in matched) + 1

    if channel_names and len(channel_names) != size_c:
        raise ValueError(
            f"channel_names length {len(channel_names)} != size_c {size_c}"
        )
    if not channel_names:
        channel_names = literal_channel_names or [f"C{i}" for i in range(size_c)]

    # Probe shape: accept TCZYX and squeeze singleton axes
    sample = next(iter(matched.values()))
    is_ome = sample.name.lower().endswith((".ome.tif", ".ome.tiff"))
    img0 = BioImage(
        image=str(sample),
        reader=(bioio_ome_tiff.Reader if is_ome else bioio_tifffile.Reader),
    )
    a0 = np.asarray(img0.data)
    # bioio returns TCZYX or YX; normalize to TCZYX
    if a0.ndim == 2:
        _T0, _C0, _Z0, Y0, X0 = 1, 1, 1, a0.shape[0], a0.shape[1]
    else:
        # Heuristic: last two are (Y, X); leading dims are (T, C, Z), possibly singleton
        Y0, X0 = a0.shape[-2], a0.shape[-1]
        lead = a0.shape[:-2]
        # Pad leading dims to T, C, Z (left-aligned)
        _T0, _C0, _Z0 = ([*list(lead), 1, 1, 1])[:3]
    size_y, size_x = Y0, X0

    # physical pixel sizes
    pps = getattr(img0, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    # Build planes (supports Z-stacks within a single file when T=C=1)
    planes: List[Dict[str, Any]] = []

    def _ensure_u16(arr: np.ndarray) -> np.ndarray:
        if clamp_to_uint16 and arr.dtype != np.uint16:
            arr = np.clip(arr, 0, 65535).astype(np.uint16)
        return arr

    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                fpath = matched.get((t, c, z))
                if fpath is None:
                    # missing plane: zero-fill
                    planes.append(
                        {"z": z, "t": t, "c": c, "pixels": [0] * (size_x * size_y)}
                    )
                    continue

                reader = (
                    bioio_ome_tiff.Reader
                    if fpath.name.lower().endswith((".ome.tif", ".ome.tiff"))
                    else bioio_tifffile.Reader
                )
                im = BioImage(image=str(fpath), reader=reader)
                arr = np.asarray(im.data)

                if arr.ndim == 2:
                    # Direct YX
                    if arr.shape != (size_y, size_x):
                        raise ValueError(
                            f"Shape mismatch for {fpath.name}:"
                            f" {arr.shape} vs {(size_y, size_x)}"
                        )
                    arr = _ensure_u16(arr)
                    planes.append(
                        {"z": z, "t": t, "c": c, "pixels": arr.ravel().tolist()}
                    )
                else:
                    # Treat as TCZYX; extract dims
                    Y, X = arr.shape[-2], arr.shape[-1]
                    lead = arr.shape[:-2]
                    Tn, Cn, Zn = ([*list(lead), 1, 1, 1])[:3]
                    if (size_y, size_x) != (Y, X):
                        raise ValueError(
                            f"Shape mismatch for {fpath.name}:"
                            f" {(Y, X)} vs {(size_y, size_x)}"
                        )

                    # Case A: singleton TCZ -> squeeze to YX
                    if Tn == 1 and Cn == 1 and Zn == 1:
                        plane2d = _ensure_u16(arr.reshape(Y, X))
                        planes.append(
                            {"z": z, "t": t, "c": c, "pixels": plane2d.ravel().tolist()}
                        )
                    # Case B: multi-Z only (expand across Z)
                    elif Tn == 1 and Cn == 1 and Zn > 1:
                        # spill Z pages starting at this z index
                        for z_local in range(Zn):
                            plane2d = _ensure_u16(
                                arr.reshape(1, 1, Zn, Y, X)[0, 0, z_local]
                            )
                            z_idx = z + z_local
                            planes.append(
                                {
                                    "z": z_idx,
                                    "t": t,
                                    "c": c,
                                    "pixels": plane2d.ravel().tolist(),
                                }
                            )
                        # bump global size_z if we exceeded it
                        size_z = max(size_z, z + Zn)
                    else:
                        # For now, multi-T/C pages must be expressed by the
                        # filename pattern, not embedded inside a single file.
                        raise ValueError(
                            f"{fpath.name} contains "
                            f"multiple pages across T/C/Z={Tn, Cn, Zn}; "
                            "only Z>1 with T=C=1 is supported inside one file. "
                            "Please express T/C via the filename pattern."
                        )

    # Adjust channels (meta)
    channels_meta = [
        {
            "id": f"ch-{i}",
            "name": str((channel_names or [f"C{i}" for i in range(size_c)])[i]),
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i in range(size_c)
    ]

    dim_order = "XYZCT" if size_z > 1 else "XYCT"
    display_name = name or str(pattern_path)
    img_id = image_id or path.stem

    return to_ome_arrow(
        image_id=str(img_id),
        name=str(display_name),
        acquisition_datetime=None,
        dimension_order=dim_order,
        dtype="uint16",
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels_meta,
        planes=planes,
        masks=None,
    )

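A usage sketch for from_stack_pattern_path (the folder and filenames are hypothetical). The token before each <...> selects the dimension ("ch" -> C, "z" -> Z), a comma list enumerates literal values, and a numeric range such as 01-05 is expanded with zero padding:

    from ome_arrow.ingest import from_stack_pattern_path

    # Matches files such as img_chDAPI_z01.tif ... img_chGFP_z05.tif in /data/stack
    record = from_stack_pattern_path("/data/stack/img_ch<DAPI,GFP>_z<01-05>.tif")
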
def from_ome_zarr(
    zarr_path: str | Path,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
) -> pa.StructScalar:
    """
    Read an OME-Zarr directory and return a typed OME-Arrow StructScalar.

    Uses BioIO with the OMEZarrReader backend to read TCZYX (or XY) data,
    flattens each YX plane into OME-Arrow planes, and builds a validated
    StructScalar via `to_ome_arrow`.

    Args:
        zarr_path:
            Path to the OME-Zarr directory (e.g., "image.ome.zarr").
        image_id:
            Optional stable image identifier (defaults to directory stem).
        name:
            Optional display name (defaults to directory name).
        channel_names:
            Optional list of channel names. Defaults to C0, C1, ...
        acquisition_datetime:
            Optional datetime (defaults to UTC now).
        clamp_to_uint16:
            If True, cast pixels to uint16.

    Returns:
        pa.StructScalar: Validated OME-Arrow struct for this image.
    """
    p = Path(zarr_path)

    img = BioImage(image=str(p), reader=OMEZarrReader)

    arr = np.asarray(img.data)  # shape (T, C, Z, Y, X)
    dims = img.dims

    size_t = int(dims.T or 1)
    size_c = int(dims.C or 1)
    size_z = int(dims.Z or 1)
    size_y = int(dims.Y or arr.shape[-2])
    size_x = int(dims.X or arr.shape[-1])

    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dimensions.")

    pps = getattr(img, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    img_id = str(image_id or p.stem)
    display_name = str(name or p.name)

    # Infer or assign channel names
    if not channel_names or len(channel_names) != size_c:
        try:
            chs = getattr(img, "channel_names", None)
            if chs is None:
                chs = [getattr(ch, "name", None) for ch in getattr(img, "channels", [])]
            if chs and len(chs) == size_c and all(c is not None for c in chs):
                channel_names = [str(c) for c in chs]
            else:
                channel_names = [f"C{i}" for i in range(size_c)]
        except Exception:
            channel_names = [f"C{i}" for i in range(size_c)]
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": channel_names[i],
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i in range(size_c)
    ]

    planes: List[Dict[str, Any]] = []
    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                plane = arr[t, c, z]
                if clamp_to_uint16 and plane.dtype != np.uint16:
                    plane = np.clip(plane, 0, 65535).astype(np.uint16)
                planes.append(
                    {"z": z, "t": t, "c": c, "pixels": plane.ravel().tolist()}
                )

    dim_order = "XYCT" if size_z == 1 else "XYZCT"

    return to_ome_arrow(
        image_id=img_id,
        name=display_name,
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=dim_order,
        dtype="uint16",
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels,
        planes=planes,
        masks=None,
    )

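A usage sketch for from_ome_zarr (the store path is hypothetical). Channel names are taken from the Zarr metadata when available, otherwise they default to C0..C{n-1}:

    from ome_arrow.ingest import from_ome_zarr

    record = from_ome_zarr("image.ome.zarr")
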
def from_ome_parquet(
    parquet_path: str | Path,
    *,
    column_name: Optional[str] = "ome_arrow",
    row_index: int = 0,
    strict_schema: bool = False,
) -> pa.StructScalar:
    """
    Read an OME-Arrow record from a Parquet file and return a typed StructScalar.

    Expected layout (as produced by `to_ome_parquet`):
      - single Parquet file
      - a single column (default name "ome_arrow") of `OME_ARROW_STRUCT` type
      - one row (row_index=0)

    This function is forgiving:
      - If `column_name` is None or not found, it will auto-detect a struct column
        that matches the OME-Arrow field names.
      - If the table has multiple rows, you can choose which record to read
        via `row_index`.

    Parameters
    ----------
    parquet_path : str | Path
        Path to the .parquet file.
    column_name : Optional[str], default "ome_arrow"
        Name of the column that stores the OME-Arrow struct. If None, auto-detect.
    row_index : int, default 0
        Which row to read if the table contains multiple rows.
    strict_schema : bool, default False
        If True, require the column's type to equal `OME_ARROW_STRUCT` exactly.
        If False, only require the column to be a Struct with the same field
        names (order can vary).

    Returns
    -------
    pa.StructScalar
        A validated OME-Arrow struct scalar.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If a suitable column/row cannot be found or schema checks fail.
    """
    p = Path(parquet_path)
    if not p.exists():
        raise FileNotFoundError(f"No such file: {p}")

    table = pq.read_table(p)

    if table.num_rows == 0:
        raise ValueError("Parquet file contains 0 rows; expected at least 1.")
    if not (0 <= row_index < table.num_rows):
        raise ValueError(f"row_index {row_index} out of range [0, {table.num_rows}).")

    # 1) Locate the OME-Arrow column
    def _struct_matches_ome_fields(t: pa.StructType) -> bool:
        ome_fields = {f.name for f in OME_ARROW_STRUCT}
        col_fields = {f.name for f in t}
        return ome_fields == col_fields

    candidate_col = None

    if column_name is not None and column_name in table.column_names:
        arr = table[column_name]
        if not pa.types.is_struct(arr.type):
            raise ValueError(f"Column '{column_name}' is not a Struct; got {arr.type}.")
        if strict_schema and arr.type != OME_ARROW_STRUCT:
            raise ValueError(
                f"Column '{column_name}' schema != OME_ARROW_STRUCT.\n"
                f"Got: {arr.type}\n"
                f"Expect: {OME_ARROW_STRUCT}"
            )
        if not strict_schema and not _struct_matches_ome_fields(arr.type):
            raise ValueError(
                f"Column '{column_name}' does not have the expected OME-Arrow fields."
            )
        candidate_col = arr
    else:
        # Auto-detect a struct column that matches OME-Arrow fields
        for name in table.column_names:
            arr = table[name]
            if pa.types.is_struct(arr.type):
                if strict_schema and arr.type == OME_ARROW_STRUCT:
                    candidate_col = arr
                    column_name = name
                    break
                if not strict_schema and _struct_matches_ome_fields(arr.type):
                    candidate_col = arr
                    column_name = name
                    break
        if candidate_col is None:
            if column_name is None:
                hint = "no struct column with OME-Arrow fields was found."
            else:
                hint = f"column '{column_name}' not found and auto-detection failed."
            raise ValueError(f"Could not locate an OME-Arrow struct column: {hint}")

    # 2) Extract the row as a Python dict
    #    (Using to_pylist() for the single-element slice is simple & reliable.)
    record_dict: Dict[str, Any] = candidate_col.slice(row_index, 1).to_pylist()[0]

    # 3) Reconstruct a typed StructScalar using the canonical schema
    #    (this validates field names/types and normalizes order)
    scalar = pa.scalar(record_dict, type=OME_ARROW_STRUCT)

    # Optional: soft validation via file-level metadata (if present)
    try:
        meta = table.schema.metadata or {}
        tag_ok = meta.get(b"ome.arrow.type", b"").decode() == str(
            OME_ARROW_TAG_TYPE
        ) and meta.get(b"ome.arrow.version", b"").decode() == str(OME_ARROW_TAG_VERSION)
        # A warning could be logged if tag_ok is False, but reading does not fail.
    except Exception:
        pass

    return scalar
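A usage sketch for from_ome_parquet (the file name is hypothetical). By default it reads row 0 of a column named "ome_arrow" and re-validates the record against the canonical schema:

    from ome_arrow.ingest import from_ome_parquet
    from ome_arrow.meta import OME_ARROW_STRUCT

    record = from_ome_parquet("image.parquet")
    assert record.type == OME_ARROW_STRUCT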