microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
microarray/io/_cel.py ADDED
@@ -0,0 +1,591 @@
1
+ import gzip
2
+ import os
3
+ import struct
4
+ from io import FileIO
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+ from anndata import AnnData
9
+
10
+ if TYPE_CHECKING:
11
+ from microarray.io._cdf import CdfFile
12
+
13
# Section names the version 3 (text) CEL parser understands; data under any
# other section header is ignored.
__supported_sections = ["HEADER", "INTENSITY", "MASKS", "OUTLIERS", "MODIFIED"]

# [HEADER] keys mapped to CelFile attribute names, grouped by value type.
# Integer-valued header keys.
__header_integer_keys = {
    "Cols": "ncols",
    "Rows": "nrows",
    "TotalX": "total_x",
    "TotalY": "total_y",
    "OffsetX": "offset_x",
    "OffsetY": "offset_y",
}

# Keys whose value is a pair of integers separated by a space.
__header_integer_tuple_keys = {
    "GridCornerUL": "grid_corner_ul",
    "GridCornerUR": "grid_corner_ur",
    "GridCornerLL": "grid_corner_ll",
    "GridCornerLR": "grid_corner_lr",
}

# Boolean-valued keys (stored as 0/1 in the file). Note the inconsistent
# spelling of "Axis-invertX" vs "AxisInvertY" follows the file format itself.
__header_boolean_keys = {
    "Axis-invertX": "axis_invert_x",
    "AxisInvertY": "axis_invert_y",
    "swapXY": "swap_xy",
}

# Keys kept verbatim as strings.
__header_string_keys = {
    "Algorithm": "algorithm",
}
46
+
47
+
48
class CelFile:
    """In-memory representation of the contents of an Affymetrix CEL file.

    Instances are populated by :func:`parse_cel`; a freshly constructed
    object has every field at its ``None``/empty/``False`` default.
    """

    def __init__(self):
        # Format version (3 = text, 4 = binary). Initialized to ``None`` so
        # that str()/summary() on an unparsed instance do not raise
        # AttributeError (previously this was an annotation with no value).
        self.version: int | None = None
        # Array dimensions and grid geometry from the header.
        self.nrows: int | None = None
        self.ncols: int | None = None
        self.total_x: int | None = None
        self.total_y: int | None = None
        self.offset_x: int | None = None
        self.offset_y: int | None = None
        self.grid_corner_ul: tuple[int, int] | None = None
        self.grid_corner_ur: tuple[int, int] | None = None
        self.grid_corner_ll: tuple[int, int] | None = None
        self.grid_corner_lr: tuple[int, int] | None = None

        # Name of the cell-summarization algorithm recorded in the file.
        self.algorithm: str | None = None

        self.axis_invert_x: bool = False
        self.axis_invert_y: bool = False
        self.swap_xy: bool = False

        # Parsed DAT header fields and algorithm parameters (values may be
        # coerced to int/float by the parsers despite the str annotation).
        self.dat_header: dict[str, str] = {}
        self.algorithm_parameters: dict[str, str] = {}

        # Numpy arrays - initialized by the parsers once dimensions are known.
        self.intensities: np.ndarray | None = None
        self.stdevs: np.ndarray | None = None
        self.npixels: np.ndarray | None = None
        self.masks: np.ndarray | None = None
        self.outliers: np.ndarray | None = None
        self.modified: np.ndarray | None = None

        # Probe annotation array - populated by apply_probe_annotation()
        self.probe_annotation: np.ndarray | None = None

    def summary(self):
        """Print a human-readable summary of the CEL file contents."""
        print(f"Version: {self.version}")
        print(f"Dimensions: {self.ncols} x {self.nrows}")
        print(f"Algorithm: {self.algorithm}")
        print(f"Algorithm Parameters: {self.algorithm_parameters}")

        # The parsers allocate all data arrays together, so checking
        # intensities alone is sufficient to guard the block below.
        if self.intensities is not None:
            print("\nData Summary:")
            print(f"\tIntensity array shape: {self.intensities.shape}")
            print(f"\tNon-zero intensity cells: {np.count_nonzero(self.intensities)}")
            print(f"\tMasked cells: {np.sum(self.masks)}")
            print(f"\tOutlier cells: {np.sum(self.outliers)}")
            print(f"\tModified cells: {np.sum(~np.isnan(self.modified))}")

            print("\nIntensity statistics:")
            print(f"\tMean: {np.mean(self.intensities):.2f}")
            print(f"\tStd: {np.std(self.intensities):.2f}")
            print(f"\tMin: {np.min(self.intensities):.2f}")
            print(f"\tMax: {np.max(self.intensities):.2f}")

    def __str__(self):
        return f"CelFile(version={self.version}, dimensions=({self.ncols} x {self.nrows}), algorithm={self.algorithm})"

    def __repr__(self):
        return self.__str__()
110
+
111
+
112
def read_cel(
    path: str,
    cdf_path: str | None = None,
    as_anndata: bool = True,
    **kwargs,
) -> AnnData | CelFile:
    """Read a CEL file and return a probe-level AnnData object or CelFile object.

    Parameters
    ----------
    path : str
        Path to the CEL file.
    cdf_path : str | None
        Path to the CDF file (required if as_anndata=True).
    as_anndata : bool
        If ``True`` (default), return a probe-level AnnData object (requires
        *cdf_path*). If ``False``, return a :class:`CelFile` object.
    **kwargs
        Additional keyword arguments forwarded to
        :func:`microarray.io.cel_to_anndata` (``sample_name``).

    Returns
    -------
    AnnData | CelFile
        If ``as_anndata=True``: probe-level AnnData with shape (1, n_cells).
        If ``as_anndata=False``: :class:`CelFile` object.

    Raises
    ------
    ValueError
        If ``as_anndata=True`` but *cdf_path* is not provided.
    """
    # Raw-object path: no CDF needed, just parse and return.
    if not as_anndata:
        return parse_cel(path)

    if cdf_path is None:
        raise ValueError(
            "cdf_path is required when as_anndata=True. Set as_anndata=False to get a CelFile object instead."
        )

    # Imported lazily to avoid a circular import at module load time.
    from microarray.io._anndata_converter import cel_to_anndata

    # Forward only the keyword arguments cel_to_anndata actually accepts.
    accepted = {"sample_name"}
    forwarded = {name: arg for name, arg in kwargs.items() if name in accepted}
    return cel_to_anndata(path, cdf_path, **forwarded)
157
+
158
+
159
def apply_probe_annotation(cel_file: "CelFile", cdf_file: "CdfFile") -> None:
    """Apply probe annotation from a CDF file to a CelFile instance.

    Populates the ``probe_annotation`` attribute of *cel_file* with a
    2-D numpy array of shape ``(nrows, ncols)`` and dtype ``object``. Each
    element contains the probeset (unit) name for that array position as
    defined in the CDF file, or ``None`` for positions not covered by the CDF.

    Parameters
    ----------
    cel_file : CelFile
        A parsed CEL file instance. Must have valid ``nrows`` and ``ncols``
        attributes (i.e. :func:`parse_cel` must have been called first).
    cdf_file : CdfFile
        A parsed CDF file instance.

    Returns
    -------
    None
        The ``probe_annotation`` attribute of *cel_file* is updated in-place.

    Raises
    ------
    ValueError
        If *cel_file* does not have valid dimensions, or if the CDF
        annotation grid does not match the CEL file dimensions.
    """
    if cel_file.nrows is None or cel_file.ncols is None:
        raise ValueError(
            "CelFile does not have valid dimensions. "
            "Make sure parse_cel was called successfully before applying annotation."
        )

    probe_annotation = cdf_file.get_annotated_array()

    # Raise a real exception (not `assert`) so the dimension check survives
    # running under `python -O`, where asserts are stripped.
    if probe_annotation.shape != (cel_file.nrows, cel_file.ncols):
        raise ValueError(
            f"Probe annotation shape {probe_annotation.shape} does not match CEL file dimensions "
            f"({cel_file.nrows}, {cel_file.ncols})."
        )

    cel_file.probe_annotation = probe_annotation
199
+
200
+
201
+ def parse_cel(path: str | os.PathLike | FileIO) -> CelFile:
202
+ """Parse a CEL file and return a CelFile object.
203
+
204
+ Supports both version 3 (text) and version 4 (binary) CEL formats.
205
+
206
+ Parameters
207
+ ----------
208
+ path : str | os.PathLike | FileIO
209
+ Path to the CEL file.
210
+
211
+ Returns:
212
+ --------
213
+ CelFile
214
+ A CelFile object containing the parsed contents of the CEL file.
215
+ """
216
+ # Read the entire file as binary
217
+ if isinstance(path, (str, os.PathLike)):
218
+ if str(path).endswith(".gz"):
219
+ with gzip.open(path, "rb") as f:
220
+ file_data = f.read()
221
+ else:
222
+ with open(path, "rb") as f:
223
+ file_data = f.read()
224
+ elif isinstance(path, FileIO):
225
+ file_data = path.read()
226
+ else:
227
+ raise TypeError("Unsupported path type")
228
+
229
+ # Detect version
230
+ is_v4 = False
231
+ if len(file_data) >= 8:
232
+ try:
233
+ magic, version = struct.unpack("<II", file_data[:8])
234
+ if magic == 64:
235
+ is_v4 = True
236
+ except struct.error:
237
+ pass
238
+
239
+ if is_v4:
240
+ return _parse_cel_v4(file_data)
241
+
242
+ # Default to version 3 (text format)
243
+ return _parse_cel_v3(file_data)
244
+
245
+
246
def _coerce_number(text: str) -> int | float | str:
    """Convert *text* to int or float when it looks numeric, else return it unchanged."""
    if text.isdecimal():
        return int(text)
    if "." in text and text.replace(".", "", 1).isdecimal():
        return float(text)
    return text


def _parse_cel_v3(file_data: bytes) -> CelFile:
    """Parse a version 3 (text format) CEL file.

    Parameters
    ----------
    file_data : bytes
        Raw contents of the CEL file.

    Returns
    -------
    CelFile
        Parsed CEL file object.

    Raises
    ------
    ValueError
        If the file does not start with a ``[CEL]`` header, or a data line
        appears before any section header.
    """
    try:
        content = file_data.decode("utf-8").splitlines()
    except UnicodeDecodeError:
        content = file_data.decode("latin-1").splitlines()

    # Raise real exceptions (not `assert`) so validation survives `python -O`.
    if content[0].strip() != "[CEL]":
        raise ValueError(f"Expected '[CEL]' header, got: {content[0].strip()}")

    def _parse_line_key_value(line: str) -> tuple[str, str]:
        """Parse a line in the format 'key=value' and return the key and value as a tuple."""
        key, value = line.strip().split("=", 1)
        return key, value

    cel_file = CelFile()

    # Parse version from the second line (e.g. "Version=3").
    cel_file.version = int(_parse_line_key_value(content[1])[1])

    last_section = None
    for line in content[2:]:
        clean_line = line.strip("\r\n")

        if not clean_line:
            continue

        if clean_line.startswith("[") and clean_line.endswith("]"):
            # Track *every* section header. Previously only supported
            # sections updated the tracker, so data lines under an unknown
            # section were silently misattributed to the preceding
            # supported section.
            last_section = clean_line[1:-1]
            continue

        if last_section is None:
            raise ValueError("Data line found before any section header")
        if last_section not in __supported_sections:
            # Skip data belonging to sections we do not understand.
            continue

        if last_section == "HEADER":
            key, value = _parse_line_key_value(clean_line)
            if key in __header_integer_keys:
                setattr(cel_file, __header_integer_keys[key], int(value))
                # Allocate the data arrays once both dimensions are known
                # (and only once).
                if cel_file.ncols is not None and cel_file.nrows is not None and cel_file.intensities is None:
                    shape = (cel_file.nrows, cel_file.ncols)
                    cel_file.intensities = np.zeros(shape, dtype=np.float32)
                    cel_file.stdevs = np.zeros(shape, dtype=np.float32)
                    cel_file.npixels = np.zeros(shape, dtype=np.int32)
                    cel_file.masks = np.zeros(shape, dtype=bool)
                    cel_file.outliers = np.zeros(shape, dtype=bool)
                    cel_file.modified = np.full(shape, np.nan, dtype=np.float32)
            elif key in __header_integer_tuple_keys:
                tuple_value = tuple(map(int, value.split(" ")))
                setattr(cel_file, __header_integer_tuple_keys[key], tuple_value)
            elif key in __header_boolean_keys:
                setattr(cel_file, __header_boolean_keys[key], bool(int(value)))
            elif key in __header_string_keys:
                setattr(cel_file, __header_string_keys[key], value)
            elif key == "DatHeader":
                # DatHeader packs scanner metadata into one line. Keep the
                # raw value and extract any "key=value" tokens found after
                # the first colon.
                cel_file.dat_header["_raw"] = value
                if ":" in value:
                    data_part = value.split(":", 1)[1]
                    for token in data_part.split():
                        if "=" in token:
                            dat_key, dat_value = token.split("=", 1)
                            cel_file.dat_header[dat_key] = _coerce_number(dat_value)
            elif key == "AlgorithmParameters":
                # Format: "key1:value1;key2:value2;..."
                for pair in value.split(";"):
                    if ":" in pair:
                        param_key, param_value = pair.split(":", 1)
                        cel_file.algorithm_parameters[param_key] = _coerce_number(param_value)
            # Unknown header keys are ignored.

        elif last_section == "INTENSITY":
            # Skip section headers like "NumberCells=..." and "CellHeader=..."
            if "=" in clean_line and not clean_line[0].isdigit():
                continue
            # Parse intensity data: X Y MEAN STDV NPIXELS
            parts = clean_line.split()
            if len(parts) == 5:
                x, y, mean, stdv, npix = parts
                x, y = int(x), int(y)
                cel_file.intensities[y, x] = float(mean)
                cel_file.stdevs[y, x] = float(stdv)
                cel_file.npixels[y, x] = int(npix)

        elif last_section == "MASKS":
            # Skip section headers; data lines are "X Y" coordinate pairs.
            if "=" in clean_line and not clean_line[0].isdigit():
                continue
            parts = clean_line.split()
            if len(parts) == 2:
                x, y = int(parts[0]), int(parts[1])
                cel_file.masks[y, x] = True

        elif last_section == "OUTLIERS":
            # Skip section headers; data lines are "X Y" coordinate pairs.
            if "=" in clean_line and not clean_line[0].isdigit():
                continue
            parts = clean_line.split()
            if len(parts) == 2:
                x, y = int(parts[0]), int(parts[1])
                cel_file.outliers[y, x] = True

        elif last_section == "MODIFIED":
            # Skip section headers; data lines are "X Y ORIGMEAN".
            if "=" in clean_line and not clean_line[0].isdigit():
                continue
            parts = clean_line.split()
            if len(parts) == 3:
                x, y = int(parts[0]), int(parts[1])
                cel_file.modified[y, x] = float(parts[2])

    return cel_file
386
+
387
+
388
def _parse_cel_v4(file_data: bytes) -> CelFile:
    """Parse version 4 (binary format) CEL file.

    Layout assumed by this reader (little-endian throughout):
    - 24-byte fixed header: magic (64), version, ncols, nrows, ncells,
      and the length of the following text metadata section
    - Text metadata section (key=value lines)
    - Binary algorithm name and parameters (length-prefixed strings)
    - Masks section (count + coordinate pairs)
    - Outliers section (count + coordinate pairs)
    - Optional modified section (count + x/y/orig-value entries)
    - Intensity data (mean, stdv, npixels per cell in row-major order)

    NOTE(review): the published Affymetrix GCOS v4 layout places the
    intensity entries *before* the mask/outlier coordinate lists; this
    reader expects masks/outliers/modified first — confirm against real
    v4 files before relying on the data sections.

    Parameters
    ----------
    file_data : bytes
        The raw binary data of the CEL file

    Returns
    -------
    CelFile
        Parsed CEL file object
    """
    cel_file = CelFile()

    # Fixed header: six little-endian uint32 values in the first 24 bytes.
    # `magic` is not re-checked here; parse_cel() already dispatched on it.
    magic, version, ncols, nrows, ncells, header_len = struct.unpack("<IIIIII", file_data[:24])

    cel_file.version = version
    cel_file.ncols = ncols
    cel_file.nrows = nrows
    cel_file.total_x = ncols
    cel_file.total_y = nrows

    # Pre-allocate all data arrays now that dimensions are known.
    cel_file.intensities = np.zeros((nrows, ncols), dtype=np.float32)
    cel_file.stdevs = np.zeros((nrows, ncols), dtype=np.float32)
    cel_file.npixels = np.zeros((nrows, ncols), dtype=np.int32)
    cel_file.masks = np.zeros((nrows, ncols), dtype=bool)
    cel_file.outliers = np.zeros((nrows, ncols), dtype=bool)
    cel_file.modified = np.full((nrows, ncols), np.nan, dtype=np.float32)

    # Text metadata section starts at byte 24 and is header_len bytes long.
    # latin-1 with errors="ignore" tolerates any byte values.
    text_metadata = file_data[24 : 24 + header_len].decode("latin-1", errors="ignore")

    # Parse key=value pairs from the text metadata, reusing the same header
    # key tables as the v3 parser.
    for line in text_metadata.splitlines():
        line = line.strip()
        if not line or "=" not in line:
            continue

        try:
            key, value = line.split("=", 1)

            if key in __header_integer_keys:
                # Skip: dimensions already come from the binary header.
                pass
            elif key in __header_integer_tuple_keys:
                tuple_value = tuple(map(int, value.split(" ")))
                setattr(cel_file, __header_integer_tuple_keys[key], tuple_value)
            elif key in __header_boolean_keys:
                setattr(cel_file, __header_boolean_keys[key], bool(int(value)))
            elif key in __header_string_keys:
                setattr(cel_file, __header_string_keys[key], value)
            elif key == "DatHeader":
                # DatHeader is a single line of packed scanner metadata.
                # Keep the raw value, then best-effort extract fields.
                cel_file.dat_header["_raw"] = value

                # Local import kept as-is; Python caches modules, so the
                # repeated import inside the loop is cheap after first use.
                import re

                # Leading "[min..max]" pixel intensity range, if present.
                range_match = re.match(r"\[(\d+)\.\.(\d+)\]", value)
                if range_match:
                    cel_file.dat_header["pixel_min"] = int(range_match.group(1))
                    cel_file.dat_header["pixel_max"] = int(range_match.group(2))

                # Split at the first colon to separate scanner info from the rest.
                if ":" in value:
                    parts = value.split(":", 1)
                    # Scanner info sits between the closing "]" and the colon.
                    scanner_part = parts[0]
                    if "]" in scanner_part:
                        scanner_info = scanner_part.split("]", 1)[1].strip()
                        if scanner_info:
                            cel_file.dat_header["scanner_info"] = scanner_info

                    data_part = parts[1] if len(parts) > 1 else ""
                    tokens = data_part.split()
                    # Tokens without "=" are candidates for date/time/chip type.
                    non_kv_tokens = []
                    for token in tokens:
                        if "=" in token:
                            try:
                                dat_key, dat_value = token.split("=", 1)
                                # Best-effort numeric coercion of the value.
                                if dat_value.isdecimal():
                                    dat_value = int(dat_value)
                                elif "." in dat_value and dat_value.replace(".", "", 1).isdecimal():
                                    dat_value = float(dat_value)
                                cel_file.dat_header[dat_key] = dat_value
                            except ValueError:
                                # Skip malformed pairs.
                                pass
                        else:
                            non_kv_tokens.append(token)

                    # Heuristics: "M/D/YY" dates, "H:MM:SS" times, and a
                    # token containing "_" or "." plus letters is presumed
                    # to be the chip type — TODO confirm on more files.
                    date_pattern = re.compile(r"\d{1,2}/\d{1,2}/\d{2,4}")
                    time_pattern = re.compile(r"\d{1,2}:\d{2}:\d{2}")

                    for i, token in enumerate(non_kv_tokens):
                        if date_pattern.match(token):
                            cel_file.dat_header["scan_date"] = token
                            # Time is expected to immediately follow the date.
                            if i + 1 < len(non_kv_tokens) and time_pattern.match(non_kv_tokens[i + 1]):
                                cel_file.dat_header["scan_time"] = non_kv_tokens[i + 1]
                        elif "_" in token or "." in token:
                            if any(c.isalpha() for c in token):
                                cel_file.dat_header["chip_type"] = token
            elif key == "Algorithm":
                cel_file.algorithm = value
            elif key == "AlgorithmParameters":
                # Format: "key1:value1;key2:value2;..."
                pairs = value.split(";")
                for pair in pairs:
                    if ":" in pair:
                        param_key, param_value = pair.split(":", 1)
                        try:
                            # Best-effort numeric coercion of the value.
                            if param_value.isdecimal():
                                param_value = int(param_value)
                            elif "." in param_value and param_value.replace(".", "", 1).isdecimal():
                                param_value = float(param_value)
                            cel_file.algorithm_parameters[param_key] = param_value
                        except ValueError:
                            # Keep the raw string on coercion failure.
                            cel_file.algorithm_parameters[param_key] = param_value
        except ValueError:
            # Skip malformed metadata lines entirely.
            continue

    # Byte cursor positioned just past the text metadata.
    pos = 24 + header_len

    # Algorithm name: uint32 length followed by that many latin-1 bytes.
    algo_len = struct.unpack("<I", file_data[pos : pos + 4])[0]
    pos += 4
    # Only use the binary name if the text section didn't provide one; the
    # `algo_len < 1000` bound guards against a corrupt length field.
    if cel_file.algorithm is None and algo_len < 1000:
        cel_file.algorithm = file_data[pos : pos + algo_len].decode("latin-1", errors="ignore")
    pos += algo_len

    # Algorithm parameters: same length-prefixed encoding; skipped because
    # they were already parsed from the text section above.
    param_len = struct.unpack("<I", file_data[pos : pos + 4])[0]
    pos += 4
    pos += param_len

    # Masks section: uint32 count, then (uint16 x, uint16 y) per entry.
    nmasks = struct.unpack("<I", file_data[pos : pos + 4])[0]
    pos += 4
    for _ in range(nmasks):
        if pos + 4 <= len(file_data):
            x, y = struct.unpack("<HH", file_data[pos : pos + 4])
            # Drop out-of-range coordinates rather than raising.
            if y < nrows and x < ncols:
                cel_file.masks[y, x] = True
            pos += 4

    # Outliers section: same encoding as masks.
    noutliers = struct.unpack("<I", file_data[pos : pos + 4])[0]
    pos += 4
    for _ in range(noutliers):
        if pos + 4 <= len(file_data):
            x, y = struct.unpack("<HH", file_data[pos : pos + 4])
            if y < nrows and x < ncols:
                cel_file.outliers[y, x] = True
            pos += 4

    # Modified section (if present): uint32 count, then 8-byte entries of
    # (uint16 x, uint16 y, float32 original value).
    # NOTE(review): this section is probed heuristically — a count that
    # fails the sanity check leaves `pos` unchanged, so the following
    # intensity read would start at the count word; verify on real files.
    if pos + 4 <= len(file_data):
        nmodified = struct.unpack("<I", file_data[pos : pos + 4])[0]
        # Sanity check: nmodified must be plausible and fit in the file.
        if 0 < nmodified < ncells and pos + 4 + nmodified * 8 < len(file_data):
            pos += 4
            for _ in range(nmodified):
                if pos + 8 <= len(file_data):
                    x, y, orig_val = struct.unpack("<HHf", file_data[pos : pos + 8])
                    if y < nrows and x < ncols:
                        cel_file.modified[y, x] = orig_val
                    pos += 8
                else:
                    break
        elif nmodified == 0:
            pos += 4  # Skip the zero count
    
    # Intensity data: 10-byte entries (float32 mean, float32 stdv,
    # uint16 npixels) per cell, row-major (cell 0 = (0,0), cell 1 = (0,1), ...).
    for i in range(ncells):
        if pos + 10 <= len(file_data):
            mean, stdv, npix = struct.unpack("<ffH", file_data[pos : pos + 10])
            y = i // ncols
            x = i % ncols
            cel_file.intensities[y, x] = mean
            cel_file.stdevs[y, x] = stdv
            cel_file.npixels[y, x] = npix
            pos += 10
        else:
            # Truncated file: keep whatever was read so far.
            break

    return cel_file