ome_arrow-0.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ome_arrow/export.py ADDED
@@ -0,0 +1,422 @@
+ """
+ Module for exporting OME-Arrow data to other formats.
+ """
+
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION
+
+
+ def to_numpy(
+     data: Dict[str, Any] | pa.StructScalar,
+     dtype: np.dtype = np.uint16,
+     strict: bool = True,
+     clamp: bool = False,
+ ) -> np.ndarray:
+     """
+     Convert an OME-Arrow record into a NumPy array shaped (T,C,Z,Y,X).
+
+     The OME-Arrow "planes" are flattened YX slices indexed by (z, t, c).
+     This function reconstitutes them into a dense TCZYX ndarray.
+
+     Args:
+         data:
+             OME-Arrow data as a Python dict or a `pa.StructScalar`.
+         dtype:
+             Output dtype (default: np.uint16). If it differs from the plane
+             values' dtype, a cast (and optional clamp) is applied.
+         strict:
+             When True, raise if a plane has the wrong pixel length. When
+             False, truncate or zero-pad that plane to the expected length.
+         clamp:
+             If True, clamp values to the valid range of the target
+             dtype before casting.
+
+     Returns:
+         np.ndarray: Dense array with shape (T, C, Z, Y, X).
+
+     Raises:
+         KeyError: If required OME-Arrow fields are missing.
+         ValueError: If dimensions are invalid or planes are malformed.
+
+     Examples:
+         >>> arr = to_numpy(my_row)  # (T, C, Z, Y, X)
+         >>> arr.shape
+         (1, 2, 1, 512, 512)
+     """
+     # Unwrap Arrow scalar to plain Python dict if needed.
+     if isinstance(data, pa.StructScalar):
+         data = data.as_py()
+
+     pm = data["pixels_meta"]
+     sx, sy = int(pm["size_x"]), int(pm["size_y"])
+     sz, sc, st = int(pm["size_z"]), int(pm["size_c"]), int(pm["size_t"])
+     if sx <= 0 or sy <= 0 or sz <= 0 or sc <= 0 or st <= 0:
+         raise ValueError("All size_* fields must be positive integers.")
+
+     expected_len = sx * sy
+
+     # Prepare target array (T,C,Z,Y,X), zero-filled by default.
+     out = np.zeros((st, sc, sz, sy, sx), dtype=dtype)
+
+     # Helper: cast (with optional clamp) to the output dtype.
+     if np.issubdtype(dtype, np.integer):
+         info = np.iinfo(dtype)
+         lo, hi = info.min, info.max
+     elif np.issubdtype(dtype, np.floating):
+         lo, hi = -np.inf, np.inf
+     else:
+         # Rare dtypes: no clamping logic; rely on astype.
+         lo, hi = -np.inf, np.inf
+
+     def _cast_plane(a: np.ndarray) -> np.ndarray:
+         if clamp:
+             a = np.clip(a, lo, hi)
+         return a.astype(dtype, copy=False)
+
+     # Fill planes.
+     for i, p in enumerate(data.get("planes", [])):
+         z = int(p["z"])
+         t = int(p["t"])
+         c = int(p["c"])
+
+         if not (0 <= z < sz and 0 <= t < st and 0 <= c < sc):
+             raise ValueError(f"planes[{i}] index out of range: (z,t,c)=({z},{t},{c})")
+
+         pix = p["pixels"]
+         # Ensure sequence-like and correct length.
+         try:
+             n = len(pix)
+         except Exception as e:
+             raise ValueError(f"planes[{i}].pixels is not a sequence") from e
+
+         if n != expected_len:
+             if strict:
+                 raise ValueError(
+                     f"planes[{i}].pixels length {n} != size_x*size_y {expected_len}"
+                 )
+             # Lenient mode: fix length by truncation or zero-pad.
+             if n > expected_len:
+                 pix = pix[:expected_len]
+             else:
+                 pix = list(pix) + [0] * (expected_len - n)
+
+         # Reshape to (Y,X) and cast.
+         arr2d = np.asarray(pix).reshape(sy, sx)
+         arr2d = _cast_plane(arr2d)
+         out[t, c, z] = arr2d
+
+     return out
+
+
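+ # A minimal usage sketch for `to_numpy` (editor's example, not part of the
+ # released module). `record` is a hypothetical OME-Arrow dict carrying only
+ # the fields this function actually reads (`pixels_meta` and `planes`):
+ #
+ #     record = {
+ #         "pixels_meta": {"size_x": 2, "size_y": 2, "size_z": 1,
+ #                         "size_c": 1, "size_t": 1},
+ #         "planes": [{"z": 0, "t": 0, "c": 0, "pixels": [0, 1, 2, 3]}],
+ #     }
+ #     arr = to_numpy(record)  # shape (1, 1, 1, 2, 2), dtype uint16
+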
+ def to_ome_tiff(
+     data: Dict[str, Any] | pa.StructScalar,
+     out_path: str,
+     *,
+     dtype: np.dtype = np.uint16,
+     clamp: bool = False,
+     dim_order: str = "TCZYX",
+     compression: Optional[str] = "zlib",  # "zlib", "lzma", "jpegxl", or None
+     compression_level: int = 6,
+     tile: Optional[Tuple[int, int]] = None,  # (Y, X)
+     use_channel_colors: bool = False,
+ ) -> None:
+     """
+     Export an OME-Arrow record to OME-TIFF using BioIO's OmeTiffWriter.
+
+     Notes
+     -----
+     - No 'bigtiff' kwarg is passed (invalid for tifffile.TiffWriter.write()).
+       BigTIFF selection is automatic based on file size.
+     """
+     # Import the writer lazily so bioio is only required when exporting.
+     try:
+         from bioio.writers import OmeTiffWriter
+     except Exception:
+         from bioio_ome_tiff.writers import OmeTiffWriter  # type: ignore
+
+     # PhysicalPixelSizes (robust import or shim)
+     try:
+         from bioio import PhysicalPixelSizes  # modern bioio
+     except Exception:
+         try:
+             from bioio.types import PhysicalPixelSizes
+         except Exception:
+             try:
+                 from aicsimageio.types import PhysicalPixelSizes
+             except Exception:
+                 from typing import NamedTuple
+                 from typing import Optional as _Opt
+
+                 class PhysicalPixelSizes(NamedTuple):  # type: ignore
+                     Z: _Opt[float] = None
+                     Y: _Opt[float] = None
+                     X: _Opt[float] = None
+
+     # 1) Dense array (T,C,Z,Y,X), via to_numpy defined above
+     arr = to_numpy(data, dtype=dtype, clamp=clamp)
+
+     # 2) Metadata
+     row = data.as_py() if isinstance(data, pa.StructScalar) else data
+     pm = row["pixels_meta"]
+     _st, sc, _sz, _sy, _sx = arr.shape
+
+     # Channel names
+     chs: Sequence[Dict[str, Any]] = pm.get("channels", []) or []
+     channel_names = [f"C{i}" for i in range(sc)]
+     if len(chs) == sc:
+         for i, ch in enumerate(chs):
+             nm = ch.get("name")
+             if nm is not None:
+                 channel_names[i] = str(nm)
+
+     # Optional channel colors (guarded)
+     channel_colors_for_writer = None
+     if use_channel_colors and len(chs) == sc:
+
+         def _rgba_to_rgb(rgba: int) -> int:
+             r = (rgba >> 24) & 0xFF
+             g = (rgba >> 16) & 0xFF
+             b = (rgba >> 8) & 0xFF
+             return (r << 16) | (g << 8) | b
+
+         flat_colors: list[int] = []
+         for ch in chs:
+             rgba = ch.get("color_rgba")
+             flat_colors.append(
+                 _rgba_to_rgb(int(rgba)) if isinstance(rgba, int) else 0xFFFFFF
+             )
+         if len(flat_colors) == sc:
+             channel_colors_for_writer = [flat_colors]  # list-per-image
+
+     # Physical sizes (µm) in Z, Y, X order for BioIO
+     p_dx = float(pm.get("physical_size_x", 1.0) or 1.0)
+     p_dy = float(pm.get("physical_size_y", 1.0) or 1.0)
+     p_dz = float(pm.get("physical_size_z", 1.0) or 1.0)
+     pps_list = [PhysicalPixelSizes(Z=p_dz, Y=p_dy, X=p_dx)]
+
+     # tifffile passthrough (NO 'bigtiff' here)
+     tifffile_kwargs: Dict[str, Any] = {}
+     if compression is not None:
+         tifffile_kwargs["compression"] = compression
+         if compression == "zlib":
+             tifffile_kwargs["compressionargs"] = {"level": int(compression_level)}
+     if tile is not None:
+         tifffile_kwargs["tile"] = (int(tile[0]), int(tile[1]))
+
+     # list-per-image payloads
+     data_list = [arr]
+     dim_order_list = [dim_order]
+     image_name_list = [str(row.get("name") or row.get("id") or "image")]
+     ch_names_list = [channel_names]
+
+     # 3) Write
+     OmeTiffWriter.save(
+         data_list,
+         out_path,
+         dim_order=dim_order_list,
+         image_name=image_name_list,
+         channel_names=ch_names_list,
+         channel_colors=channel_colors_for_writer,  # None or [flat list len=sc]
+         physical_pixel_sizes=pps_list,
+         tifffile_kwargs=tifffile_kwargs,
+     )
+
+
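+ # A minimal usage sketch for `to_ome_tiff` (editor's example, not part of the
+ # released module), assuming bioio or bioio-ome-tiff is installed and `record`
+ # is an OME-Arrow dict as in the `to_numpy` sketch above:
+ #
+ #     to_ome_tiff(record, "image.ome.tiff", compression="zlib", tile=(256, 256))
+ #
+ # With tile=(256, 256), tifffile writes tiled pages, which helps readers that
+ # fetch sub-regions of large planes.
+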
+ def to_ome_zarr(
+     data: Dict[str, Any] | pa.StructScalar,
+     out_path: str,
+     *,
+     dtype: np.dtype = np.uint16,
+     clamp: bool = False,
+     # Axes order for the on-disk array; must match arr shape (T,C,Z,Y,X)
+     dim_order: str = "TCZYX",
+     # NGFF / multiscale
+     multiscale_levels: int = 1,  # 1 = no pyramid; >1 builds levels
+     downscale_spatial_by: int = 2,  # per-level factor for Z,Y,X
+     zarr_format: int = 3,  # 3 (NGFF 0.5) or 2 (NGFF 0.4)
+     # Storage knobs
+     chunks: Optional[Tuple[int, int, int, int, int]] = None,  # (T,C,Z,Y,X) or None
+     shards: Optional[Tuple[int, int, int, int, int]] = None,  # v3 only, optional
+     compressor: Optional[str] = "zstd",  # "zstd", "lz4", "gzip", or None
+     compressor_level: int = 3,
+     # Optional display metadata (carried through if channels/rdefs are enriched later)
+     image_name: Optional[str] = None,
+ ) -> None:
+     """
+     Write OME-Zarr using bioio-ome-zarr's `OMEZarrWriter` (instance API).
+
+     - Builds arr as (T,C,Z,Y,X) using `to_numpy`.
+     - Creates level shapes for a multiscale pyramid (if multiscale_levels > 1).
+     - Chooses a Blosc codec compatible with zarr_format (v2 vs v3).
+     - Populates axes names/types/units and physical pixel sizes from pixels_meta.
+     """
+     # Local import to avoid hard deps at module import time.
+     from bioio_ome_zarr.writers import OMEZarrWriter
+
+     # Optional compressors for v2 vs v3
+     compressor_obj = None
+     if compressor is not None:
+         if zarr_format == 2:
+             # numcodecs Blosc (v2 path)
+             from numcodecs import Blosc as BloscV2
+
+             cname = {"zstd": "zstd", "lz4": "lz4", "gzip": "zlib"}.get(
+                 compressor, "zstd"
+             )
+             compressor_obj = BloscV2(
+                 cname=cname, clevel=int(compressor_level), shuffle=BloscV2.BITSHUFFLE
+             )
+         else:
+             # zarr v3 codec
+             from zarr.codecs import BloscCodec, BloscShuffle
+
+             cname = {"zstd": "zstd", "lz4": "lz4", "gzip": "zlib"}.get(
+                 compressor, "zstd"
+             )
+             compressor_obj = BloscCodec(
+                 cname=cname,
+                 clevel=int(compressor_level),
+                 shuffle=BloscShuffle.bitshuffle,
+             )
+
+     # 1) Dense pixel data (T,C,Z,Y,X), via to_numpy defined above
+     arr = to_numpy(data, dtype=dtype, clamp=clamp)
+
+     # 2) Unwrap OME-Arrow metadata
+     row = data.as_py() if isinstance(data, pa.StructScalar) else data
+     pm = row["pixels_meta"]
+     st, sc, sz, sy, sx = arr.shape
+
+     # 3) Axis metadata (names/types/units aligned with T,C,Z,Y,X)
+     axes_names = [a.lower() for a in dim_order]  # ["t","c","z","y","x"]
+     axes_types = ["time", "channel", "space", "space", "space"]
+     # Units: micrometers for spatial axes; leave T/C as None
+     axes_units = [
+         None,
+         None,
+         pm.get("physical_size_z_unit") or "µm",
+         pm.get("physical_size_y_unit") or "µm",
+         pm.get("physical_size_x_unit") or "µm",
+     ]
+     # Physical pixel sizes at level 0 in axis order
+     p_dx = float(pm.get("physical_size_x", 1.0) or 1.0)
+     p_dy = float(pm.get("physical_size_y", 1.0) or 1.0)
+     p_dz = float(pm.get("physical_size_z", 1.0) or 1.0)
+     physical_pixel_size = [1.0, 1.0, p_dz, p_dy, p_dx]  # T,C,Z,Y,X
+
+     # 4) Multiscale level shapes (level 0 first). Only spatial dims are downscaled.
+     def _down(a: int, f: int) -> int:
+         return max(1, a // f)
+
+     def _level_shapes_tczyx(levels: int) -> List[Tuple[int, int, int, int, int]]:
+         shapes = [(st, sc, sz, sy, sx)]
+         for _ in range(levels - 1):
+             t, c, z, y, x = shapes[-1]
+             shapes.append(
+                 (
+                     t,
+                     c,
+                     _down(z, downscale_spatial_by),
+                     _down(y, downscale_spatial_by),
+                     _down(x, downscale_spatial_by),
+                 )
+             )
+         return shapes
+
+     multiscale_levels = max(1, int(multiscale_levels))
+     level_shapes: List[Tuple[int, int, int, int, int]] = _level_shapes_tczyx(
+         multiscale_levels
+     )
+
+     # 5) Chunking / shards (can be single-shape or per-level;
+     #    a single shape, repeated per level, is passed if provided)
+     chunk_shape: Optional[List[Tuple[int, ...]]] = None
+     if chunks is not None:
+         chunk_shape = [tuple(int(v) for v in chunks)] * multiscale_levels
+
+     shard_shape: Optional[List[Tuple[int, ...]]] = None
+     if shards is not None and zarr_format == 3:
+         shard_shape = [tuple(int(v) for v in shards)] * multiscale_levels
+
+     # 6) Image name default
+     img_name = image_name or str(row.get("name") or row.get("id") or "Image")
+
+     # 7) Instantiate the writer
+     writer = OMEZarrWriter(
+         store=out_path,
+         level_shapes=level_shapes,
+         dtype=dtype,
+         chunk_shape=chunk_shape,
+         shard_shape=shard_shape,
+         compressor=compressor_obj,
+         zarr_format=3 if int(zarr_format) == 3 else 2,
+         image_name=img_name,
+         channels=None,  # channel metadata could be mapped here later
+         rdefs=None,  # optional OMERO display metadata
+         creator_info=None,  # optional "creator" block
+         root_transform=None,  # optional NGFF root transform
+         axes_names=axes_names,
+         axes_types=axes_types,
+         axes_units=axes_units,
+         physical_pixel_size=physical_pixel_size,
+     )
+
+     # 8) Write full resolution; the writer builds & fills lower levels
+     writer.write_full_volume(arr)
+
+
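+ # A minimal usage sketch for `to_ome_zarr` (editor's example, not part of the
+ # released module), assuming bioio-ome-zarr is installed; it builds a 3-level
+ # pyramid, halving Z/Y/X at each level:
+ #
+ #     to_ome_zarr(record, "image.ome.zarr", multiscale_levels=3,
+ #                 chunks=(1, 1, 1, 256, 256), compressor="zstd")
+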
+ def to_ome_parquet(
+     data: Dict[str, Any] | pa.StructScalar,
+     out_path: str,
+     column_name: str = "image",
+     file_metadata: Optional[Dict[str, str]] = None,
+     compression: Optional[str] = "zstd",
+     row_group_size: Optional[int] = None,
+ ) -> None:
+     """
+     Export an OME-Arrow record to a Parquet file as a single-row, single-column
+     table. The single column holds a struct with the OME-Arrow schema.
+     """
+     # 1) Normalize to a plain Python dict (works better with pyarrow builders,
+     #    especially when the struct has a `null`-typed field like "masks").
+     if isinstance(data, pa.StructScalar):
+         record_dict = data.as_py()
+     else:
+         # Validate by round-tripping through a typed scalar, then back to dict.
+         record_dict = pa.scalar(data, type=OME_ARROW_STRUCT).as_py()
+
+     # 2) Build a single-row struct array from the dict, explicitly passing the schema
+     struct_array = pa.array([record_dict], type=OME_ARROW_STRUCT)  # len=1
+
+     # 3) Wrap into a one-column table
+     table = pa.table({column_name: struct_array})
+
+     # 4) Attach optional file-level metadata
+     meta: Dict[bytes, bytes] = dict(table.schema.metadata or {})
+     try:
+         meta[b"ome.arrow.type"] = str(OME_ARROW_TAG_TYPE).encode("utf-8")
+         meta[b"ome.arrow.version"] = str(OME_ARROW_TAG_VERSION).encode("utf-8")
+     except Exception:
+         pass
+     if file_metadata:
+         for k, v in file_metadata.items():
+             meta[str(k).encode("utf-8")] = str(v).encode("utf-8")
+     table = table.replace_schema_metadata(meta)
+
+     # 5) Write Parquet (single row, single column)
+     pq.write_table(
+         table,
+         out_path,
+         compression=compression,
+         row_group_size=row_group_size,
+     )
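+
+ # A minimal usage sketch for `to_ome_parquet` (editor's example, not part of
+ # the released module), including a read-back with pyarrow. Unlike the earlier
+ # sketches, `record` here must be a complete OME-Arrow record that validates
+ # against OME_ARROW_STRUCT:
+ #
+ #     to_ome_parquet(record, "image.parquet", file_metadata={"source": "demo"})
+ #     tbl = pq.read_table("image.parquet")
+ #     row = tbl.column("image")[0]  # pa.StructScalar
+ #     arr = to_numpy(row)           # back to a dense (T,C,Z,Y,X) array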