ome-arrow 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ome_arrow/ingest.py ADDED
@@ -0,0 +1,932 @@
1
+ """
2
+ Converting to and from OME-Arrow formats.
3
+ """
4
+
5
+ import itertools
6
+ import re
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
10
+
11
+ import bioio_ome_tiff
12
+ import bioio_tifffile
13
+ import numpy as np
14
+ import pyarrow as pa
15
+ import pyarrow.parquet as pq
16
+ from bioio import BioImage
17
+ from bioio_ome_zarr import Reader as OMEZarrReader
18
+
19
+ from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION
20
+
21
+
22
def to_ome_arrow(
    type_: str = OME_ARROW_TAG_TYPE,
    version: str = OME_ARROW_TAG_VERSION,
    image_id: str = "unnamed",
    name: str = "unknown",
    acquisition_datetime: Optional[datetime] = None,
    dimension_order: str = "XYZCT",
    dtype: str = "uint16",
    size_x: int = 1,
    size_y: int = 1,
    size_z: int = 1,
    size_c: int = 1,
    size_t: int = 1,
    physical_size_x: float = 1.0,
    physical_size_y: float = 1.0,
    physical_size_z: float = 1.0,
    physical_size_unit: str = "µm",
    channels: Optional[List[Dict[str, Any]]] = None,
    planes: Optional[List[Dict[str, Any]]] = None,
    masks: Any = None,
) -> pa.StructScalar:
    """
    Create a typed OME-Arrow StructScalar with sensible defaults.

    Assembles a nested record dict and converts it with
    ``pa.scalar(record, type=OME_ARROW_STRUCT)``. Any field can be
    overridden explicitly; the others fall back to the defaults below.

    Args:
        type_: Top-level type string (defaults to OME_ARROW_TAG_TYPE).
        version: Specification version string (defaults to
            OME_ARROW_TAG_VERSION).
        image_id: Unique image identifier.
        name: Human-friendly name.
        acquisition_datetime: Datetime of acquisition (UTC "now" if None).
        dimension_order: Dimension order string stored in the pixel
            metadata (e.g. "XYZCT" or "XYCT").
        dtype: Pixel data type string (e.g., "uint16").
        size_x, size_y, size_z, size_c, size_t: Axis sizes.
        physical_size_x/y/z: Physical pixel sizes, expressed in
            `physical_size_unit`.
        physical_size_unit: Unit string used for all three axes,
            default "µm".
        channels: List of channel dicts. A single default channel is
            generated if None. The "id", "name" and "illumination" values
            are coerced to str on a copy; the caller's dicts are not
            modified.
        planes: List of plane dicts. If None, a single zero-filled plane
            of ``size_x * size_y`` pixels at z=t=c=0 is used.
        masks: Stored as-is in the record's "masks" field.

    Returns:
        pa.StructScalar: A scalar of type OME_ARROW_STRUCT.

    Example:
        >>> s = to_ome_arrow(image_id="img001")
        >>> s.type == OME_ARROW_STRUCT
        True
    """

    # Coerce top-level text fields so non-str inputs (e.g. Path, numpy str)
    # are stored as plain Python strings.
    type_ = str(type_)
    version = str(version)
    image_id = str(image_id)
    name = str(name)
    dimension_order = str(dimension_order)
    dtype = str(dtype)
    physical_size_unit = str(physical_size_unit)

    if channels is None:
        channels = [
            {
                "id": "ch-0",
                "name": "default",
                "emission_um": 0.0,
                "excitation_um": 0.0,
                "illumination": "Unknown",
                "color_rgba": 0xFFFFFFFF,
            }
        ]
    else:
        # Coerce channel text fields to str on shallow copies, so the
        # caller's dicts are left untouched.
        text_keys = ("id", "name", "illumination")
        channels = [
            {key: (str(val) if key in text_keys else val) for key, val in ch.items()}
            for ch in channels
        ]

    if planes is None:
        planes = [{"z": 0, "t": 0, "c": 0, "pixels": [0] * (size_x * size_y)}]

    record = {
        "type": type_,
        "version": version,
        "id": image_id,
        "name": name,
        "acquisition_datetime": acquisition_datetime or datetime.now(timezone.utc),
        "pixels_meta": {
            "dimension_order": dimension_order,
            "type": dtype,
            "size_x": size_x,
            "size_y": size_y,
            "size_z": size_z,
            "size_c": size_c,
            "size_t": size_t,
            "physical_size_x": physical_size_x,
            "physical_size_y": physical_size_y,
            "physical_size_z": physical_size_z,
            # The same unit is applied to all three spatial axes.
            "physical_size_x_unit": physical_size_unit,
            "physical_size_y_unit": physical_size_unit,
            "physical_size_z_unit": physical_size_unit,
            "channels": channels,
        },
        "planes": planes,
        "masks": masks,
    }

    return pa.scalar(record, type=OME_ARROW_STRUCT)
134
+
135
+
136
def from_numpy(
    arr: np.ndarray,
    *,
    dim_order: str = "TCZYX",
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
    # meta
    physical_size_x: float = 1.0,
    physical_size_y: float = 1.0,
    physical_size_z: float = 1.0,
    physical_size_unit: str = "µm",
    dtype_meta: Optional[str] = None,  # if None, inferred from output dtype
) -> pa.StructScalar:
    """
    Build an OME-Arrow StructScalar from a NumPy array.

    Parameters
    ----------
    arr : np.ndarray
        Image data with axes described by `dim_order`.
    dim_order : str, default "TCZYX"
        Axis labels for `arr` (case-insensitive), one letter per array
        axis. Must include "Y" and "X"; only T, C, Z, Y, X are allowed and
        no letter may repeat.
        Supported examples: "YX", "ZYX", "CYX", "CZYX", "TYX", "TCYX", "TCZYX".
    image_id, name : Optional[str]
        Identifiers to embed in the record ("unnamed" / "unknown" if None
        or empty).
    channel_names : Optional[Sequence[str]]
        Names for channels; replaced by C0..C{n-1} when missing or when the
        length does not equal the channel count.
    acquisition_datetime : Optional[datetime]
        Defaults to now (UTC) if None.
    clamp_to_uint16 : bool, default True
        If True, clip values to [0, 65535] and cast to uint16 before
        serialization.
    physical_size_x/y/z : float
        Spatial pixel sizes, expressed in `physical_size_unit`.
    physical_size_unit : str
        Unit string for spatial axes (default "µm").
    dtype_meta : Optional[str]
        Pixel dtype string to place in metadata; if None, inferred from the
        (possibly cast) array's dtype.

    Returns
    -------
    pa.StructScalar
        Record produced by `to_ome_arrow`.

    Raises
    ------
    TypeError
        If `arr` is not a NumPy ndarray.
    ValueError
        If `dim_order` is missing Y/X, repeats an axis, contains an
        unsupported axis letter, does not have one letter per array axis,
        or the image has a zero-sized Y or X axis.

    Notes
    -----
    - The meta dimension_order is "XYCT" whenever the Z size is 1 (which
      includes Z being absent from `dim_order`); otherwise "XYZCT".
    - If T/C/Z are absent in `dim_order`, they default to size 1.
    """

    if not isinstance(arr, np.ndarray):
        raise TypeError("from_numpy expects a NumPy ndarray.")

    desired_axes = "TCZYX"
    dims = dim_order.upper()
    if "Y" not in dims or "X" not in dims:
        raise ValueError("dim_order must include 'Y' and 'X' axes.")

    # Validate the axis string up front so malformed input fails with a
    # clear message instead of an IndexError or an opaque transpose error.
    if len(set(dims)) != len(dims):
        raise ValueError(f"dim_order must not repeat axes; got {dim_order!r}.")
    unsupported = sorted(set(dims) - set(desired_axes))
    if unsupported:
        raise ValueError(
            f"dim_order contains unsupported axes {unsupported}; "
            f"allowed axes are {list(desired_axes)}."
        )
    if len(dims) != arr.ndim:
        raise ValueError(
            f"dim_order {dim_order!r} has {len(dims)} axes but the array "
            f"has {arr.ndim} dimensions."
        )

    # Sizes of each axis, with 1 for any axis not present in the input.
    shape_of = dict(zip(dims, arr.shape))
    size_x = int(shape_of["X"])
    size_y = int(shape_of["Y"])
    size_z = int(shape_of.get("Z", 1))
    size_c = int(shape_of.get("C", 1))
    size_t = int(shape_of.get("T", 1))

    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dimensions.")

    # Append a trailing singleton axis for each missing dimension, then
    # permute everything into the canonical (T, C, Z, Y, X) layout.
    current_axes = list(dims)
    view = arr
    for ax in desired_axes:
        if ax not in current_axes:
            view = np.expand_dims(view, axis=-1)
            current_axes.append(ax)
    perm = [current_axes.index(ax) for ax in desired_axes]
    tczyx = np.transpose(view, axes=perm)

    # Defensive check: should be unreachable after the validation above.
    expected_shape = (size_t, size_c, size_z, size_y, size_x)
    if tuple(tczyx.shape) != expected_shape:
        raise ValueError(
            "Internal axis reordering mismatch: "
            f"got {tczyx.shape} vs expected {expected_shape}"
        )

    # Clamp/cast
    if clamp_to_uint16 and tczyx.dtype != np.uint16:
        tczyx = np.clip(tczyx, 0, 65535).astype(np.uint16, copy=False)

    # Channel names: fall back to generated names on absence or length mismatch.
    if not channel_names or len(channel_names) != size_c:
        channel_names = [f"C{i}" for i in range(size_c)]
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": ch_name,
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i, ch_name in enumerate(channel_names)
    ]

    # Build planes: one flattened YX plane per (t, c, z), in T-major order.
    planes: List[Dict[str, Any]] = [
        {"z": z, "t": t, "c": c, "pixels": tczyx[t, c, z].ravel().tolist()}
        for t in range(size_t)
        for c in range(size_c)
        for z in range(size_z)
    ]

    # Meta dimension_order mirrors the other ingest functions in this module.
    meta_dim_order = "XYCT" if size_z == 1 else "XYZCT"

    # Pixel dtype in metadata
    dtype_str = dtype_meta or np.dtype(tczyx.dtype).name

    return to_ome_arrow(
        image_id=str(image_id or "unnamed"),
        name=str(name or "unknown"),
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=meta_dim_order,
        dtype=dtype_str,
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=float(physical_size_x),
        physical_size_y=float(physical_size_y),
        physical_size_z=float(physical_size_z),
        physical_size_unit=str(physical_size_unit),
        channels=channels,
        planes=planes,
        masks=None,
    )
291
+
292
+
293
def from_tiff(
    tiff_path: str | Path,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
) -> pa.StructScalar:
    """
    Read a TIFF and return a typed OME-Arrow StructScalar.

    Opens the file with bioio (the OME-TIFF reader when the path ends in
    "ome.tif"/"ome.tiff", the tifffile reader otherwise), flattens each YX
    plane, and delegates struct creation to `to_ome_arrow`.

    Args:
        tiff_path: Path to a TIFF readable by bioio.
        image_id: Optional stable image identifier (defaults to stem).
        name: Optional human label (defaults to file name).
        channel_names: Optional channel names; replaced by C0..C{n-1} when
            missing or when the length does not equal the channel count.
        acquisition_datetime: Optional acquisition time (UTC now if None).
        clamp_to_uint16: If True, clip each plane to [0, 65535] and cast to
            uint16. If False, pixels keep the source dtype and the metadata
            dtype reports that dtype.

    Returns:
        pa.StructScalar produced by `to_ome_arrow`.

    Raises:
        ValueError: If the image has a non-positive Y or X size.
    """

    p = Path(tiff_path)

    img = BioImage(
        image=str(p),
        reader=(
            bioio_ome_tiff.Reader
            if str(p).lower().endswith(("ome.tif", "ome.tiff"))
            else bioio_tifffile.Reader
        ),
    )

    # Assumed to be 5-D (T, C, Z, Y, X): it is indexed as arr[t, c, z] below.
    arr = np.asarray(img.data)
    dims = img.dims
    size_t = int(dims.T or 1)
    size_c = int(dims.C or 1)
    size_z = int(dims.Z or 1)
    size_y = int(dims.Y or arr.shape[-2])
    size_x = int(dims.X or arr.shape[-1])
    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dims.")

    # Physical pixel sizes are best-effort: anything missing, falsy or
    # unconvertible falls back to 1.0.
    pps = getattr(img, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    img_id = str(image_id or p.stem)
    display_name = str(name or p.name)

    # Ensure channel_names is a list[str] of exactly size_c entries.
    if not channel_names or len(channel_names) != size_c:
        channel_names = [f"C{i}" for i in range(size_c)]
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": ch_name,
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i, ch_name in enumerate(channel_names)
    ]

    planes: List[Dict[str, Any]] = []
    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                plane = arr[t, c, z]
                if clamp_to_uint16 and plane.dtype != np.uint16:
                    plane = np.clip(plane, 0, 65535).astype(np.uint16)
                planes.append(
                    {"z": z, "t": t, "c": c, "pixels": plane.ravel().tolist()}
                )

    dim_order = "XYCT" if size_z == 1 else "XYZCT"

    # Report the dtype the pixels actually have: uint16 after clamping,
    # otherwise the source array's dtype (same convention as from_numpy).
    dtype_str = "uint16" if clamp_to_uint16 else np.dtype(arr.dtype).name

    return to_ome_arrow(
        image_id=img_id,
        name=display_name,
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=dim_order,
        dtype=dtype_str,
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels,
        planes=planes,
        masks=None,
    )
401
+
402
+
403
def from_stack_pattern_path(
    pattern_path: str | Path,
    default_dim_for_unspecified: str = "C",
    map_series_to: Optional[str] = "T",
    clamp_to_uint16: bool = True,
    channel_names: Optional[List[str]] = None,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
) -> pa.StructScalar:
    """
    Build an OME-Arrow StructScalar from a set of TIFFs named by a pattern.

    The final path component of `pattern_path` is the pattern; its parent
    is the folder that is searched. Two pattern forms are handled:

    - Bracket patterns: each ``<...>`` placeholder holds either a
      comma-separated list of choices (``<DAPI,GFP>``) or a numeric range
      ``<a-b>`` / ``<a-b:step>`` (zero-padded to the wider of a and b).
      The dimension of a placeholder is inferred from the letters directly
      before it (e.g. "c"/"ch"/"w" -> C, "t"/"tp" -> T, "z"/"fp" -> Z,
      "s"/"series" -> S); when none is recognised,
      `default_dim_for_unspecified` is used. The position of a choice
      within its placeholder becomes the plane index along that dimension.
    - Otherwise the name is used as a regular expression that must fully
      match file names in the folder; matches are sorted and assigned
      successive Z indices.

    Args:
        pattern_path: Folder plus pattern, as described above.
        default_dim_for_unspecified: Dimension letter for placeholders
            whose dimension cannot be inferred.
        map_series_to: Dimension letter that "series" placeholders are
            mapped to; if falsy, encountering a series placeholder raises.
        clamp_to_uint16: If True, clip planes to [0, 65535] and cast to
            uint16. If False, the metadata dtype is taken from the first
            matched file.
        channel_names: Optional channel names; must have exactly one entry
            per channel. Defaults to the literal choices of a non-numeric
            channel placeholder, else C0..C{n-1}.
        image_id: Optional identifier (defaults to the stem of the pattern
            path).
        name: Optional display name (defaults to ``str(pattern_path)``).

    Returns:
        pa.StructScalar produced by `to_ome_arrow`.

    Raises:
        ValueError: For an empty/malformed pattern, an unclosed ``<``, an
            inverted or zero-step range, a series placeholder with
            ``map_series_to`` unset, a `channel_names` length mismatch, a
            plane shape mismatch, or a file holding multiple T/C pages.
        FileNotFoundError: If no file matches the pattern.
    """
    path = Path(pattern_path)
    folder = path.parent
    line = path.name.strip()
    if not line:
        raise ValueError("Pattern path string is empty or malformed")

    DIM_TOKENS = {
        "C": {"c", "ch", "w", "wavelength"},
        "T": {"t", "tl", "tp", "timepoint"},
        "Z": {"z", "zs", "sec", "fp", "focal", "focalplane"},
        "S": {"s", "sp", "series"},
    }
    NUM_RANGE_RE = re.compile(r"^(?P<a>\d+)\-(?P<b>\d+)(?::(?P<step>\d+))?$")

    def detect_dim(before_text: str) -> Optional[str]:
        """Map the trailing run of letters in `before_text` to a dimension."""
        m = re.search(r"([A-Za-z]+)$", before_text)
        if not m:
            return None
        token = m.group(1).lower()
        for dim, names in DIM_TOKENS.items():
            if token in names:
                return dim
        return None

    def expand_raw_token(raw: str) -> Tuple[List[str], bool]:
        """Expand a placeholder body into (choices, all_choices_numeric)."""
        raw = raw.strip()
        if "," in raw:
            parts = [p.strip() for p in raw.split(",")]
            return parts, all(p.isdigit() for p in parts)
        m = NUM_RANGE_RE.match(raw)
        if m:
            a, b = m.group("a"), m.group("b")
            step = int(m.group("step") or "1")
            start, stop = int(a), int(b)
            if stop < start:
                raise ValueError(f"Inverted range not supported: <{raw}>")
            if step == 0:
                raise ValueError(f"Range step must be positive: <{raw}>")
            # Preserve zero padding, e.g. <001-010> -> 001, 002, ...
            width = max(len(a), len(b))
            nums = [str(v).zfill(width) for v in range(start, stop + 1, step)]
            return nums, True
        return [raw], raw.isdigit()

    def parse_bracket_pattern(s: str) -> Tuple[str, List[Dict[str, Any]]]:
        """Turn `s` into a str.format template plus placeholder descriptors."""
        placeholders, out = [], []
        i = ph_i = 0
        while i < len(s):
            if s[i] == "<":
                j = s.find(">", i + 1)
                if j == -1:
                    raise ValueError("Unclosed '<' in pattern.")
                raw_inside = s[i + 1 : j]
                before = "".join(out)
                dim = detect_dim(before) or "?"
                choices, is_num = expand_raw_token(raw_inside)
                placeholders.append(
                    {
                        "idx": ph_i,
                        "raw": raw_inside,
                        "choices": choices,
                        "dim": dim,
                        "is_numeric": is_num,
                    }
                )
                out.append(f"{{{ph_i}}}")
                ph_i += 1
                i = j + 1
            else:
                ch = s[i]
                # Double literal braces so str.format does not read them
                # as replacement fields.
                out.append(ch * 2 if ch in "{}" else ch)
                i += 1
        return "".join(out), placeholders

    def regex_match(folder: Path, regex: str) -> List[Path]:
        """Return the sorted files in `folder` whose names fully match `regex`."""
        r = re.compile(regex)
        return sorted(
            [p for p in folder.iterdir() if p.is_file() and r.fullmatch(p.name)]
        )

    def reader_for(fpath: Path):
        """Pick the bioio reader class based on the file name."""
        # Path.suffix only yields the last suffix (".tif"), so the
        # double extension has to be tested on the whole name.
        if fpath.name.lower().endswith((".ome.tif", ".ome.tiff")):
            return bioio_ome_tiff.Reader
        return bioio_tifffile.Reader

    # (t, c, z) plane index -> file providing that plane
    matched: Dict[Tuple[int, int, int], Path] = {}
    literal_channel_names: Optional[List[str]] = None

    if "<" in line and ">" in line:
        template, placeholders = parse_bracket_pattern(line)
        series_target = map_series_to.upper() if map_series_to else None

        for ph in placeholders:
            if ph["dim"] == "?":
                ph["dim"] = default_dim_for_unspecified.upper()

        # A non-numeric channel placeholder (e.g. <DAPI,GFP>) supplies
        # default channel names; the first such placeholder wins.
        for ph in placeholders:
            dim_eff = ph["dim"] if ph["dim"] != "S" else (series_target or "S")
            if dim_eff == "C" and not ph["is_numeric"]:
                literal_channel_names = ph["choices"]
                break

        for combo in itertools.product(*[ph["choices"] for ph in placeholders]):
            fpath = folder / template.format(*combo)
            if not fpath.exists():
                continue

            index = {"T": 0, "C": 0, "Z": 0}
            for ph, val in zip(placeholders, combo):
                dim = ph["dim"]
                if dim == "S":
                    if not series_target:
                        raise ValueError("Encountered 'series' but map_series_to=None")
                    dim = series_target
                # Dimensions other than T/C/Z are ignored for indexing.
                if dim in index:
                    index[dim] = ph["choices"].index(val)

            matched[(index["T"], index["C"], index["Z"])] = fpath
    else:
        for z, p in enumerate(regex_match(folder, line)):
            matched[(0, 0, z)] = p

    if not matched:
        raise FileNotFoundError(f"No files matched pattern: {pattern_path}")

    size_t = max(k[0] for k in matched) + 1
    size_c = max(k[1] for k in matched) + 1
    size_z = max(k[2] for k in matched) + 1

    if channel_names and len(channel_names) != size_c:
        raise ValueError(
            f"channel_names length {len(channel_names)} != size_c {size_c}"
        )
    if not channel_names:
        channel_names = literal_channel_names or [f"C{i}" for i in range(size_c)]

    # ---- Probe the first matched file for plane shape and pixel sizes ----
    sample = next(iter(matched.values()))
    img0 = BioImage(image=str(sample), reader=reader_for(sample))
    a0 = np.asarray(img0.data)
    # The last two axes are taken as (Y, X), whatever the leading axes are.
    size_y, size_x = a0.shape[-2], a0.shape[-1]

    # Physical pixel sizes are best-effort; fall back to 1.0.
    pps = getattr(img0, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    # ---- Build planes (supports a Z-stack inside one file when T=C=1) ----
    planes: List[Dict[str, Any]] = []

    def _ensure_u16(arr: np.ndarray) -> np.ndarray:
        """Clip and cast to uint16 when clamping is enabled."""
        if clamp_to_uint16 and arr.dtype != np.uint16:
            arr = np.clip(arr, 0, 65535).astype(np.uint16)
        return arr

    for t in range(size_t):
        for c in range(size_c):
            # range(size_z) is evaluated once, so the size_z bump in the
            # multi-Z branch below does not extend this loop.
            for z in range(size_z):
                fpath = matched.get((t, c, z))
                if fpath is None:
                    # missing plane: zero-fill
                    planes.append(
                        {"z": z, "t": t, "c": c, "pixels": [0] * (size_x * size_y)}
                    )
                    continue

                im = BioImage(image=str(fpath), reader=reader_for(fpath))
                arr = np.asarray(im.data)

                if arr.ndim == 2:
                    # Direct YX
                    if arr.shape != (size_y, size_x):
                        raise ValueError(
                            f"Shape mismatch for {fpath.name}:"
                            f" {arr.shape} vs {(size_y, size_x)}"
                        )
                    arr = _ensure_u16(arr)
                    planes.append(
                        {"z": z, "t": t, "c": c, "pixels": arr.ravel().tolist()}
                    )
                    continue

                # Treat as (T, C, Z, Y, X): leading dims are read left to
                # right as T, C, Z and padded with 1s when fewer than three.
                Y, X = arr.shape[-2], arr.shape[-1]
                lead = arr.shape[:-2]
                Tn, Cn, Zn = ([*list(lead), 1, 1, 1])[:3]
                if (size_y, size_x) != (Y, X):
                    raise ValueError(
                        f"Shape mismatch for {fpath.name}:"
                        f" {(Y, X)} vs {(size_y, size_x)}"
                    )

                if Tn == 1 and Cn == 1 and Zn == 1:
                    # Case A: singleton T/C/Z -> squeeze to YX
                    plane2d = _ensure_u16(arr.reshape(Y, X))
                    planes.append(
                        {"z": z, "t": t, "c": c, "pixels": plane2d.ravel().tolist()}
                    )
                elif Tn == 1 and Cn == 1 and Zn > 1:
                    # Case B: multi-Z only -> spill the Z pages starting at
                    # this z index.
                    # NOTE(review): spilled planes are not de-duplicated
                    # against planes at the same (t, c, z) that come from
                    # other files or from zero-filling — confirm intended.
                    stack = arr.reshape(1, 1, Zn, Y, X)[0, 0]
                    for z_local in range(Zn):
                        plane2d = _ensure_u16(stack[z_local])
                        planes.append(
                            {
                                "z": z + z_local,
                                "t": t,
                                "c": c,
                                "pixels": plane2d.ravel().tolist(),
                            }
                        )
                    # bump global size_z if we exceeded it
                    size_z = max(size_z, z + Zn)
                else:
                    # Multi-T/C pages must be expressed by the filename
                    # pattern, not embedded inside a single file.
                    raise ValueError(
                        f"{fpath.name} contains "
                        f"multiple pages across T/C/Z={Tn, Cn, Zn}; "
                        f"only Z>1 with T=C=1 is supported inside one file. "
                        f"Please express T/C via the filename pattern."
                    )

    # Channel metadata: channel_names is non-empty at this point.
    channels_meta = [
        {
            "id": f"ch-{i}",
            "name": str(channel_names[i]),
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i in range(size_c)
    ]

    dim_order = "XYZCT" if size_z > 1 else "XYCT"
    display_name = name or str(pattern_path)
    img_id = image_id or path.stem

    # uint16 after clamping; otherwise report the probed file's dtype
    # (assumes all files share it — only the first file is inspected).
    dtype_str = "uint16" if clamp_to_uint16 else np.dtype(a0.dtype).name

    return to_ome_arrow(
        image_id=str(img_id),
        name=str(display_name),
        acquisition_datetime=None,
        dimension_order=dim_order,
        dtype=dtype_str,
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels_meta,
        planes=planes,
        masks=None,
    )
693
+
694
+
695
def from_ome_zarr(
    zarr_path: str | Path,
    image_id: Optional[str] = None,
    name: Optional[str] = None,
    channel_names: Optional[Sequence[str]] = None,
    acquisition_datetime: Optional[datetime] = None,
    clamp_to_uint16: bool = True,
) -> pa.StructScalar:
    """
    Read an OME-Zarr directory and return a typed OME-Arrow StructScalar.

    Uses BioIO with the OMEZarrReader backend, flattens each YX plane into
    OME-Arrow planes, and builds the StructScalar via `to_ome_arrow`.

    Args:
        zarr_path:
            Path to the OME-Zarr directory (e.g., "image.ome.zarr").
        image_id:
            Optional stable image identifier (defaults to ``Path.stem`` of
            the path).
        name:
            Optional display name (defaults to the final path component).
        channel_names:
            Optional list of channel names. When missing or of the wrong
            length, names are taken from the image if it exposes one per
            channel, else C0, C1, ...
        acquisition_datetime:
            Optional datetime (defaults to UTC now).
        clamp_to_uint16:
            If True, clip pixels to [0, 65535] and cast to uint16. If
            False, pixels keep the source dtype and the metadata dtype
            reports that dtype.

    Returns:
        pa.StructScalar: OME-Arrow struct for this image.

    Raises:
        ValueError: If the image has a non-positive Y or X size.
    """
    p = Path(zarr_path)

    img = BioImage(image=str(p), reader=OMEZarrReader)

    # Assumed to be 5-D (T, C, Z, Y, X): it is indexed as arr[t, c, z] below.
    arr = np.asarray(img.data)
    dims = img.dims

    size_t = int(dims.T or 1)
    size_c = int(dims.C or 1)
    size_z = int(dims.Z or 1)
    size_y = int(dims.Y or arr.shape[-2])
    size_x = int(dims.X or arr.shape[-1])

    if size_x <= 0 or size_y <= 0:
        raise ValueError("Image must have positive Y and X dimensions.")

    # Physical pixel sizes are best-effort; fall back to 1.0.
    pps = getattr(img, "physical_pixel_sizes", None)
    try:
        psize_x = float(getattr(pps, "X", None) or 1.0)
        psize_y = float(getattr(pps, "Y", None) or 1.0)
        psize_z = float(getattr(pps, "Z", None) or 1.0)
    except Exception:
        psize_x = psize_y = psize_z = 1.0

    img_id = str(image_id or p.stem)
    display_name = str(name or p.name)

    # Infer or assign channel names
    if not channel_names or len(channel_names) != size_c:
        fallback_names = [f"C{i}" for i in range(size_c)]
        try:
            chs = getattr(img, "channel_names", None)
            if chs is None:
                chs = [getattr(ch, "name", None) for ch in getattr(img, "channels", [])]
            if chs and len(chs) == size_c and all(c is not None for c in chs):
                channel_names = [str(c) for c in chs]
            else:
                channel_names = fallback_names
        except Exception:
            # Reading channel metadata is best-effort; use generated names.
            channel_names = fallback_names
    channel_names = [str(x) for x in channel_names]

    channels = [
        {
            "id": f"ch-{i}",
            "name": ch_name,
            "emission_um": 0.0,
            "excitation_um": 0.0,
            "illumination": "Unknown",
            "color_rgba": 0xFFFFFFFF,
        }
        for i, ch_name in enumerate(channel_names)
    ]

    planes: List[Dict[str, Any]] = []
    for t in range(size_t):
        for c in range(size_c):
            for z in range(size_z):
                plane = arr[t, c, z]
                if clamp_to_uint16 and plane.dtype != np.uint16:
                    plane = np.clip(plane, 0, 65535).astype(np.uint16)
                planes.append(
                    {"z": z, "t": t, "c": c, "pixels": plane.ravel().tolist()}
                )

    dim_order = "XYCT" if size_z == 1 else "XYZCT"

    # Report the dtype the pixels actually have: uint16 after clamping,
    # otherwise the source array's dtype (same convention as from_numpy).
    dtype_str = "uint16" if clamp_to_uint16 else np.dtype(arr.dtype).name

    return to_ome_arrow(
        image_id=img_id,
        name=display_name,
        acquisition_datetime=acquisition_datetime or datetime.now(timezone.utc),
        dimension_order=dim_order,
        dtype=dtype_str,
        size_x=size_x,
        size_y=size_y,
        size_z=size_z,
        size_c=size_c,
        size_t=size_t,
        physical_size_x=psize_x,
        physical_size_y=psize_y,
        physical_size_z=psize_z,
        physical_size_unit="µm",
        channels=channels,
        planes=planes,
        masks=None,
    )
812
+
813
+
814
def from_ome_parquet(
    parquet_path: str | Path,
    *,
    column_name: Optional[str] = "ome_arrow",
    row_index: int = 0,
    strict_schema: bool = False,
) -> pa.StructScalar:
    """
    Read an OME-Arrow record from a Parquet file and return a typed StructScalar.

    Expected layout:
      - single Parquet file
      - a single column (default name "ome_arrow") of `OME_ARROW_STRUCT` type
      - one row (row_index=0)

    This function is forgiving:
      - If `column_name` is None or not found, it will auto-detect the first
        struct column that matches the OME-Arrow field names (or, with
        `strict_schema`, the exact `OME_ARROW_STRUCT` type).
      - If the table has multiple rows, you can choose which record to read
        via `row_index`.
      - If the file-level metadata carries "ome.arrow.type" or
        "ome.arrow.version" tags that differ from this package's tags, a
        warning is emitted but the record is still returned.

    Parameters
    ----------
    parquet_path : str | Path
        Path to the .parquet file.
    column_name : Optional[str], default "ome_arrow"
        Name of the column that stores the OME-Arrow struct. If None, auto-detect.
    row_index : int, default 0
        Which row to read if the table contains multiple rows.
    strict_schema : bool, default False
        If True, require the column's type to equal `OME_ARROW_STRUCT` exactly.
        If False, we only require the column to be a Struct with the same
        top-level field names (order can vary).

    Returns
    -------
    pa.StructScalar
        The record rebuilt against `OME_ARROW_STRUCT`.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If a suitable column/row cannot be found or schema checks fail.
    """
    import warnings  # local import: only needed for the soft tag check

    p = Path(parquet_path)
    if not p.exists():
        raise FileNotFoundError(f"No such file: {p}")

    table = pq.read_table(p)

    if table.num_rows == 0:
        raise ValueError("Parquet file contains 0 rows; expected at least 1.")
    if not (0 <= row_index < table.num_rows):
        raise ValueError(f"row_index {row_index} out of range [0, {table.num_rows}).")

    # 1) Locate the OME-Arrow column
    ome_field_names = {f.name for f in OME_ARROW_STRUCT}

    def _struct_matches_ome_fields(t: pa.StructType) -> bool:
        """True when `t` has exactly the top-level OME-Arrow field names."""
        return ome_field_names == {f.name for f in t}

    candidate_col = None

    if column_name is not None and column_name in table.column_names:
        # Explicitly named column: validate it and fail loudly on mismatch.
        arr = table[column_name]
        if not pa.types.is_struct(arr.type):
            raise ValueError(f"Column '{column_name}' is not a Struct; got {arr.type}.")
        if strict_schema and arr.type != OME_ARROW_STRUCT:
            raise ValueError(
                f"Column '{column_name}' schema != OME_ARROW_STRUCT.\n"
                f"Got:   {arr.type}\n"
                f"Expect:{OME_ARROW_STRUCT}"
            )
        if not strict_schema and not _struct_matches_ome_fields(arr.type):
            raise ValueError(
                f"Column '{column_name}' does not have the expected OME-Arrow fields."
            )
        candidate_col = arr
    else:
        # Auto-detect: take the first struct column that passes the check
        # selected by strict_schema.
        for col in table.column_names:
            arr = table[col]
            if not pa.types.is_struct(arr.type):
                continue
            if strict_schema:
                is_match = arr.type == OME_ARROW_STRUCT
            else:
                is_match = _struct_matches_ome_fields(arr.type)
            if is_match:
                candidate_col = arr
                column_name = col
                break

        if candidate_col is None:
            if column_name is None:
                hint = "no struct column with OME-Arrow fields was found."
            else:
                hint = f"column '{column_name}' not found and auto-detection failed."
            raise ValueError(f"Could not locate an OME-Arrow struct column: {hint}")

    # 2) Extract the row as a Python dict
    #    (Using to_pylist() for the single element slice is simple & reliable.)
    record_dict: Dict[str, Any] = candidate_col.slice(row_index, 1).to_pylist()[0]

    # 3) Reconstruct a typed StructScalar using the canonical schema
    #    (this validates field names/types and normalizes order)
    scalar = pa.scalar(record_dict, type=OME_ARROW_STRUCT)

    # 4) Soft validation via file-level metadata: warn, never fail, and only
    #    when a tag is actually present and differs from the expected value.
    meta = table.schema.metadata or {}
    expected_tags = {
        b"ome.arrow.type": str(OME_ARROW_TAG_TYPE),
        b"ome.arrow.version": str(OME_ARROW_TAG_VERSION),
    }
    for key, expected in expected_tags.items():
        raw = meta.get(key)
        if raw is None:
            continue
        found = raw.decode("utf-8", errors="replace")
        if found != expected:
            warnings.warn(
                f"Parquet metadata tag {key.decode()}={found!r} does not match "
                f"expected {expected!r}; reading the record anyway.",
                stacklevel=2,
            )

    return scalar