spacr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. spacr/__init__.py +2 -2
  2. spacr/core.py +52 -10
  3. spacr/deep_spacr.py +2 -3
  4. spacr/gui.py +0 -1
  5. spacr/gui_core.py +247 -41
  6. spacr/gui_elements.py +133 -2
  7. spacr/gui_utils.py +22 -17
  8. spacr/io.py +624 -149
  9. spacr/ml.py +141 -258
  10. spacr/plot.py +76 -34
  11. spacr/resources/MEDIAR/__pycache__/SetupDict.cpython-39.pyc +0 -0
  12. spacr/resources/MEDIAR/__pycache__/evaluate.cpython-39.pyc +0 -0
  13. spacr/resources/MEDIAR/__pycache__/generate_mapping.cpython-39.pyc +0 -0
  14. spacr/resources/MEDIAR/__pycache__/main.cpython-39.pyc +0 -0
  15. spacr/resources/MEDIAR/core/Baseline/__pycache__/Predictor.cpython-39.pyc +0 -0
  16. spacr/resources/MEDIAR/core/Baseline/__pycache__/Trainer.cpython-39.pyc +0 -0
  17. spacr/resources/MEDIAR/core/Baseline/__pycache__/__init__.cpython-39.pyc +0 -0
  18. spacr/resources/MEDIAR/core/Baseline/__pycache__/utils.cpython-39.pyc +0 -0
  19. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/EnsemblePredictor.cpython-39.pyc +0 -0
  20. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Predictor.cpython-39.pyc +0 -0
  21. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Trainer.cpython-39.pyc +0 -0
  22. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/__init__.cpython-39.pyc +0 -0
  23. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/utils.cpython-39.pyc +0 -0
  24. spacr/resources/MEDIAR/core/__pycache__/BasePredictor.cpython-39.pyc +0 -0
  25. spacr/resources/MEDIAR/core/__pycache__/BaseTrainer.cpython-39.pyc +0 -0
  26. spacr/resources/MEDIAR/core/__pycache__/__init__.cpython-39.pyc +0 -0
  27. spacr/resources/MEDIAR/core/__pycache__/utils.cpython-39.pyc +0 -0
  28. spacr/resources/MEDIAR/train_tools/__pycache__/__init__.cpython-39.pyc +0 -0
  29. spacr/resources/MEDIAR/train_tools/__pycache__/measures.cpython-39.pyc +0 -0
  30. spacr/resources/MEDIAR/train_tools/__pycache__/utils.cpython-39.pyc +0 -0
  31. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/__init__.cpython-39.pyc +0 -0
  32. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/datasetter.cpython-39.pyc +0 -0
  33. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/transforms.cpython-39.pyc +0 -0
  34. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/utils.cpython-39.pyc +0 -0
  35. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/CellAware.cpython-39.pyc +0 -0
  36. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/LoadImage.cpython-39.pyc +0 -0
  37. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/NormalizeImage.cpython-39.pyc +0 -0
  38. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/__init__.cpython-39.pyc +0 -0
  39. spacr/resources/MEDIAR/train_tools/models/__pycache__/MEDIARFormer.cpython-39.pyc +0 -0
  40. spacr/resources/MEDIAR/train_tools/models/__pycache__/__init__.cpython-39.pyc +0 -0
  41. spacr/sequencing.py +73 -38
  42. spacr/settings.py +161 -135
  43. spacr/submodules.py +618 -215
  44. spacr/timelapse.py +197 -29
  45. spacr/toxo.py +23 -23
  46. spacr/utils.py +186 -128
  47. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/METADATA +5 -2
  48. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/RECORD +53 -24
  49. spacr/stats.py +0 -221
  50. /spacr/{cellpose.py → spacr_cellpose.py} +0 -0
  51. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/LICENSE +0 -0
  52. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/WHEEL +0 -0
  53. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/entry_points.txt +0 -0
  54. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/top_level.txt +0 -0
spacr/io.py CHANGED
@@ -1,4 +1,4 @@
- import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue, tifffile, czifile, atexit, datetime
+ import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue, tifffile, czifile, atexit, datetime, traceback
  import numpy as np
  import pandas as pd
  from PIL import Image, ImageOps
@@ -23,6 +23,8 @@ import seaborn as sns
  from nd2reader import ND2Reader
  from torchvision import transforms
  from sklearn.model_selection import train_test_split
+ import readlif
+ from pylibCZIrw import czi as pyczi

  def process_non_tif_non_2D_images(folder):
      """Processes all images in the folder and splits them into grayscale channels, preserving bit depth."""
@@ -131,58 +133,61 @@ def process_non_tif_non_2D_images(folder):

  def _load_images_and_labels(image_files, label_files, invert=False):

-     from .utils import invert_image, apply_mask
+     from .utils import invert_image

      images = []
      labels = []
-
-     if not image_files is None:
-         image_names = sorted([os.path.basename(f) for f in image_files])
-     else:
-         image_names = []
-
-     if not label_files is None:
-         label_names = sorted([os.path.basename(f) for f in label_files])
-     else:
-         label_names = []

-     if not image_files is None and not label_files is None:
+     image_names = sorted([os.path.basename(f) for f in image_files]) if image_files else []
+     label_names = sorted([os.path.basename(f) for f in label_files]) if label_files else []
+
+     if image_files and label_files:
          for img_file, lbl_file in zip(image_files, label_files):
              image = cellpose.io.imread(img_file)
+             if image is None:
+                 print(f"WARNING: Could not load image: {img_file}")
+                 continue
              if invert:
                  image = invert_image(image)
-             label = cellpose.io.imread(lbl_file)
              if image.max() > 1:
                  image = image / image.max()
+
+             label = cellpose.io.imread(lbl_file)
+             if label is None:
+                 print(f"WARNING: Could not load label: {lbl_file}")
+                 continue
+
              images.append(image)
              labels.append(label)
-     elif not image_files is None:
+
+     elif image_files:
          for img_file in image_files:
              image = cellpose.io.imread(img_file)
+             if image is None:
+                 print(f"WARNING: Could not load image: {img_file}")
+                 continue
              if invert:
                  image = invert_image(image)
              if image.max() > 1:
                  image = image / image.max()
              images.append(image)
-     elif not image_files is None:
-         for lbl_file in label_files:
-             label = cellpose.io.imread(lbl_file)
+
+     elif label_files:
+         for lbl_file in label_files:
+             label = cellpose.io.imread(lbl_file)
+             if label is None:
+                 print(f"WARNING: Could not load label: {lbl_file}")
+                 continue
              labels.append(label)
-
-     if not image_files is None:
-         image_dir = os.path.dirname(image_files[0])
-     else:
-         image_dir = None
-
-     if not label_files is None:
-         label_dir = os.path.dirname(label_files[0])
-     else:
-         label_dir = None
-
-     # Log the number of loaded images and labels
+
+     image_dir = os.path.dirname(image_files[0]) if image_files else None
+     label_dir = os.path.dirname(label_files[0]) if label_files else None
+
      print(f'Loaded {len(images)} images and {len(labels)} labels from {image_dir} and {label_dir}')
-     if len(labels) > 0 and len(images) > 0:
-         print(f'image shape: {images[0].shape}, image type: images[0].shape mask shape: {labels[0].shape}, image type: labels[0].shape')
+     if images and labels:
+         print(f'image shape: {images[0].shape}, image type: {images[0].dtype}; '
+               f'label shape: {labels[0].shape}, label type: {labels[0].dtype}')
+
      return images, labels, image_names, label_names

  def _load_normalized_images_and_labels(image_files, label_files, channels=None, percentiles=None,
@@ -647,8 +652,8 @@ def load_images_from_paths(images_by_key):

      return images_dict

- #@log_function_call
- def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=False, skip_mode='01', metadata_type='', img_format='.tif'):
+ #@log_function_call
+ def _rename_and_organize_image_files(src, regex, batch_size=100, metadata_type='', img_format='.tif', timelapse=False):
      """
      Convert z-stack images to maximum intensity projection (MIP) images.

@@ -656,24 +661,26 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
          src (str): The source directory containing the z-stack images.
          regex (str): The regular expression pattern used to match the filenames of the z-stack images.
          batch_size (int, optional): The number of images to process in each batch. Defaults to 100.
-         pick_slice (bool, optional): Whether to pick a specific slice based on the provided skip mode. Defaults to False.
-         skip_mode (str, optional): The skip mode used to filter out specific slices. Defaults to '01'.
          metadata_type (str, optional): The type of metadata associated with the images. Defaults to ''.

      Returns:
          None
      """

+     if isinstance(img_format, str):
+         img_format = [img_format]
+
      from .utils import _extract_filename_metadata, print_progress

      regular_expression = re.compile(regex)
      stack_path = os.path.join(src, 'stack')
      files_processed = 0
      if not os.path.exists(stack_path) or (os.path.isdir(stack_path) and len(os.listdir(stack_path)) == 0):
-         all_filenames = [filename for filename in os.listdir(src) if filename.endswith(img_format)]
+         all_filenames = [filename for filename in os.listdir(src) if any(filename.endswith(ext) for ext in img_format)]
          print(f'All files: {len(all_filenames)} in {src}')
+         all_filenames = [f for f in all_filenames if not f.startswith('.')] #Exclude hidden files
          time_ls = []
-         image_paths_by_key = _extract_filename_metadata(all_filenames, src, regular_expression, metadata_type, pick_slice, skip_mode)
+         image_paths_by_key = _extract_filename_metadata(all_filenames, src, regular_expression, metadata_type)
          # Convert dictionary keys to a list for batching
          batching_keys = list(image_paths_by_key.keys())
          print(f'All unique FOV: {len(image_paths_by_key)} in {src}')
@@ -684,56 +691,43 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
              batch_keys = batching_keys[idx:idx+batch_size]
              batch_images_by_key = {key: image_paths_by_key[key] for key in batch_keys}
              images_by_key = load_images_from_paths(batch_images_by_key)
-
-             if pick_slice:
-                 for i, key in enumerate(images_by_key):
-                     plate, well, field, channel, mode = key
-                     max_intensity_slice = max(images_by_key[key], key=lambda x: np.percentile(x, 90))
-                     mip_image = Image.fromarray(max_intensity_slice)
-                     output_dir = os.path.join(src, channel)
-                     os.makedirs(output_dir, exist_ok=True)
-                     output_filename = f'{plate}_{well}_{field}.tif'
-                     output_path = os.path.join(output_dir, output_filename)
-                     files_processed += 1
-                     stop = time.time()
-                     duration = stop - start
-                     time_ls.append(duration)
-                     files_to_process = len(all_filenames)
-                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-
-                     if not os.path.exists(output_path):
-                         mip_image.save(output_path)
-                     else:
-                         print(f'WARNING: A file with the same name already exists at location {output_filename}')
-             else:
-                 for i, (key, images) in enumerate(images_by_key.items()):
-                     plate, well, field, channel = key[:4]
-                     output_dir = os.path.join(src, channel)
-                     mip = np.max(np.stack(images), axis=0)
-                     mip_image = Image.fromarray(mip)
-                     os.makedirs(output_dir, exist_ok=True)
+
+             # Process each batch of images
+             for i, (key, images) in enumerate(images_by_key.items()):
+
+                 plate, well, field, channel, timeID, sliceID = key
+
+                 if timelapse:
                      output_filename = f'{plate}_{well}_{field}.tif'
-                     output_path = os.path.join(output_dir, output_filename)
-                     files_processed += 1
-                     stop = time.time()
-                     duration = stop - start
-                     time_ls.append(duration)
-                     files_to_process = len(all_filenames)
-                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-
-                     if not os.path.exists(output_path):
-                         mip_image.save(output_path)
-                     else:
-                         print(f'WARNING: A file with the same name already exists at location {output_filename}')
+                 else:
+                     output_filename = f'{plate}_{well}_{field}_{timeID}.tif'
+
+                 output_dir = os.path.join(src, channel)
+                 os.makedirs(output_dir, exist_ok=True)
+                 output_path = os.path.join(output_dir, output_filename)
+                 mip = np.max(np.stack(images), axis=0)
+                 mip_image = Image.fromarray(mip)
+
+                 files_processed += 1
+                 stop = time.time()
+                 duration = stop - start
+                 time_ls.append(duration)
+                 files_to_process = len(all_filenames)
+                 print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
+
+                 if not os.path.exists(output_path):
+                     mip_image.save(output_path)
+                 else:
+                     print(f'WARNING: A file with the same name already exists at location {output_filename}')

              images_by_key.clear()

          # Move original images to a new directory
-         valid_exts = [img_format]
          newpath = os.path.join(src, 'orig')
          os.makedirs(newpath, exist_ok=True)
          for filename in os.listdir(src):
-             if os.path.splitext(filename)[1] in valid_exts:
+             #print(f"{filename}: {os.path.splitext(filename)[1]}")
+             if os.path.splitext(filename)[1] in img_format:
                  move = os.path.join(newpath, filename)
                  if os.path.exists(move):
                      print(f'WARNING: A file with the same name already exists at location {move}')
@@ -1236,7 +1230,11 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
      files_processed = 0
      for i, path in enumerate(paths):
          start = time.time()
-         array = np.load(path)
+         try:
+             array = np.load(path)
+         except Exception as e:
+             print(f"Error loading file {path}: {e}")
+             continue
          stack_ls.append(array)
          filenames_batch.append(os.path.basename(path))
          stop = time.time()
@@ -1564,30 +1562,34 @@ def preprocess_img_data(settings):
          save_dtype (type, optional): The data type used for saving the preprocessed images. Defaults to np.float32.
          randomize (bool, optional): Whether to randomize the order of the images. Defaults to True.
          all_to_mip (bool, optional): Whether to convert all images to MIP. Defaults to False.
-         pick_slice (bool, optional): Whether to pick a specific slice based on the provided skip mode. Defaults to False.
-         skip_mode (str, optional): The skip mode used to filter out specific slices. Defaults to '01'.
          settings (dict, optional): Additional settings for preprocessing. Defaults to {}.

      Returns:
          None
      """
-
+
      src = settings['src']
-     valid_ext = ['tif', 'tiff', 'png', 'jpeg']
+     delete_empty_subdirectories(src)
      files = os.listdir(src)
-     extensions = [file.split('.')[-1] for file in files]
-     extension_counts = Counter(extensions)
-     most_common_extension = extension_counts.most_common(1)[0][0]
-     img_format = None

-     delete_empty_subdirectories(src)
+     valid_ext = ['tif', 'tiff', 'png', 'jpg', 'jpeg', 'bmp', 'nd2', 'czi', 'lif']
+     extensions = [file.split('.')[-1].lower() for file in files]
+     # Filter only valid extensions
+     valid_extensions = [ext for ext in extensions if ext in valid_ext]

-     # Check if the most common extension is one of the specified image formats
-     if most_common_extension in valid_ext:
-         img_format = f'.{most_common_extension}'
-         print(f'Found {extension_counts[most_common_extension]} {most_common_extension} files')
+     # Determine most common valid extension
+     img_format = None
+     if valid_extensions:
+         extension_counts = Counter(valid_extensions)
+         most_common_extension = Counter(valid_extensions).most_common(1)[0][0]
+         img_format = most_common_extension
+
+         print(f"Found {extension_counts[most_common_extension]} {most_common_extension} files")
+
      else:
-         print(f'Could not find any {valid_ext} files in {src} only found {extension_counts[0]}')
+         print(f"Could not find any {valid_ext} files in {src} only found {extension_counts[0]}")
+         print(f"{files} in {src}")
+         print(f"Please check the folder and try again")

      if os.path.exists(os.path.join(src,'stack')):
          print('Found existing stack folder.')
@@ -1598,23 +1600,24 @@ def preprocess_img_data(settings):
          return settings, src

      mask_channels = [settings['nucleus_channel'], settings['cell_channel'], settings['pathogen_channel']]
-     backgrounds = [settings['nucleus_background'], settings['cell_background'], settings['pathogen_background']]

-     settings, metadata_type, custom_regex, nr, plot, batch_size, timelapse, lower_percentile, randomize, all_to_mip, pick_slice, skip_mode, cmap, figuresize, normalize, save_dtype, test_mode, test_images, random_test = set_default_settings_preprocess_img_data(settings)
+     settings = set_default_settings_preprocess_img_data(settings)

-     regex = _get_regex(metadata_type, img_format, custom_regex)
-
-     if test_mode:
+     regex = _get_regex(settings['metadata_type'], img_format, settings['custom_regex'])
+
+     if settings['test_mode']:

-         print(f'Running spacr in test mode')
+         print(f"Running spacr in test mode")
          settings['plot'] = True
          try:
              os.rmdir(os.path.join(src, 'test'))
              print(f"Deleted test directory: {os.path.join(src, 'test')}")
          except OSError as e:
+             print(f"Error deleting test directory: {e}")
+             print(f"Delete manually before running test mode")
              pass

-         src = _run_test_mode(settings['src'], regex, timelapse, test_images, random_test)
+         src = _run_test_mode(settings['src'], regex, settings['timelapse'], settings['test_images'], settings['random_test'])
          settings['src'] = src

      stack_path = os.path.join(src, 'stack')
@@ -1625,46 +1628,45 @@ def preprocess_img_data(settings):
      if not os.path.exists(stack_path):
          try:
              if not img_format == None:
-                 if timelapse:
-                     _move_to_chan_folder(src, regex, timelapse, metadata_type)
-                 else:
-                     _rename_and_organize_image_files(src, regex, batch_size, pick_slice, skip_mode, metadata_type, img_format)
-
-                 #Make sure no batches will be of only one image
-                 all_imgs = len(stack_path)
-                 full_batches = all_imgs // batch_size
-                 last_batch_size = all_imgs % batch_size
-
-                 # Check if the last batch is of size 1
-                 if last_batch_size == 1:
-                     # If there's only one batch and its size is 1, it's also an issue
-                     if full_batches == 0:
-                         raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
-                     # If the last batch is of size 1, merge it with the second last batch
-                     elif full_batches > 0:
-                         print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
-                         raise ValueError("Last batch of size 1 detected. Adjust the batch size.")
+                 img_format = ['.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.nd2', '.czi', '.lif']
+                 _rename_and_organize_image_files(src, regex, settings['batch_size'], settings['metadata_type'], img_format)
+
+             #Make sure no batches will be of only one image
+             all_imgs = len(stack_path)
+             full_batches = all_imgs // settings['batch_size']
+             last_batch_size = all_imgs % settings['batch_size']
+
+             # Check if the last batch is of size 1
+             if last_batch_size == 1:
+                 # If there's only one batch and its size is 1, it's also an issue
+                 if full_batches == 0:
+                     raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
+                 # If the last batch is of size 1, merge it with the second last batch
+                 elif full_batches > 0:
+                     print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
+                     raise ValueError("Last batch of size 1 detected. Adjust the batch size.")

              nr_channel_folders = _merge_channels(src, plot=False)

              if len(settings['channels']) != nr_channel_folders:
                  print(f"Number of channels does not match number of channel folders. channels: {settings['channels']} channel folders: {nr_channel_folders}")
                  new_channels = list(range(nr_channel_folders))
-                 print(f"Setting channels to {new_channels}")
+                 print(f"Changing channels from {settings['channels']} to {new_channels}")
                  settings['channels'] = new_channels

-             if timelapse:
-                 _create_movies_from_npy_per_channel(stack_path, fps=2)
+             if settings['timelapse']:
+                 _create_movies_from_npy_per_channel(stack_path, fps=settings['fps'])

-             if plot:
-                 print(f'plotting {nr} images from {src}/stack')
-                 plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
+             if settings['plot']:
+                 print(f"plotting {settings['nr']} images from {src}/stack")
+                 plot_arrays(stack_path, settings['figuresize'], settings['cmap'], nr=settings['nr'], normalize=settings['normalize'])

-             if all_to_mip:
+             if settings['all_to_mip']:
                  _mip_all(stack_path)
-                 if plot:
-                     print(f'plotting {nr} images from {src}/stack')
-                     plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
+                 if settings['plot']:
+                     print(f"plotting {settings['nr']} images from {src}/stack")
+                     plot_arrays(stack_path, settings['figuresize'], settings['cmap'], nr=settings['nr'], normalize=settings['normalize'])
+
          except Exception as e:
              print(f"Error: {e}")

@@ -1673,9 +1675,6 @@ def preprocess_img_data(settings):
                                save_dtype=np.float32,
                                settings=settings)

-     #if plot:
-     #    _plot_4D_arrays(src+'/norm_channel_stack', nr_npz=1, nr=nr)
-
      return settings, src

  def _check_masks(batch, batch_filenames, output_folder):
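
The preprocess_img_data() refactor above reads every option from the settings dict returned by set_default_settings_preprocess_img_data() instead of unpacking a long tuple. A minimal call sketch, untested; only the keys shown in the hunks above come from the diff, the values are illustrative assumptions and any omitted keys are filled in by the defaults helper:

    from spacr.io import preprocess_img_data

    settings = {
        "src": "/data/plate1",        # assumed path to the raw exports
        "metadata_type": "custom",    # assumed value; must name a preset understood by _get_regex()
        "custom_regex": None,
        "batch_size": 100,
        "timelapse": False,
        "test_mode": False,
        "channels": [0, 1, 2],
        "nucleus_channel": 0,
        "cell_channel": 1,
        "pathogen_channel": 2,
    }

    # Returns the (possibly updated) settings dict and the working source folder.
    settings, src = preprocess_img_data(settings)
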
@@ -1780,11 +1779,11 @@ def _read_and_join_tables(db_path, table_names=['cell', 'cytoplasm', 'nucleus',
              print(e)
      conn.close()
      if 'png_list' in dataframes:
-         png_list_df = dataframes['png_list'][['cell_id', 'png_path', 'plate', 'row_name', 'column_name', 'field']].copy()
+         png_list_df = dataframes['png_list'][['cell_id', 'png_path', 'plateID', 'rowID', 'columnID', 'fieldID']].copy()
          png_list_df['cell_id'] = png_list_df['cell_id'].str[1:].astype(int)
          png_list_df.rename(columns={'cell_id': 'object_label'}, inplace=True)
          if 'cell' in dataframes:
-             join_cols = ['object_label', 'plate', 'row_name', 'column_name','field']
+             join_cols = ['object_label', 'plateID', 'rowID', 'columnID','fieldID']
              dataframes['cell'] = pd.merge(dataframes['cell'], png_list_df, on=join_cols, how='left')
          else:
              print("Cell table not found in database tables.")
@@ -2085,14 +2084,18 @@ def _read_db(db_loc, tables):
      Returns:
      - dfs (list): A list of pandas DataFrames, each containing the data from a table.
      """
-     from .utils import rename_columns_in_db
+     from .utils import rename_columns_in_db, correct_metadata
+
      rename_columns_in_db(db_loc)
      conn = sqlite3.connect(db_loc)
      dfs = []
+
      for table in tables:
          query = f'SELECT * FROM {table}'
          df = pd.read_sql_query(query, conn)
+         df = correct_metadata(df)
          dfs.append(df)
+
      conn.close()
      return dfs

@@ -2271,7 +2274,7 @@ def _copy_missclassified(df):

  def _read_db(db_loc, tables):

-     from .utils import rename_columns_in_db
+     from .utils import rename_columns_in_db, correct_metadata

      rename_columns_in_db(db_loc)
      conn = sqlite3.connect(db_loc) # Create a connection to the database
@@ -2279,12 +2282,13 @@ def _read_db(db_loc, tables):
      for table in tables:
          query = f'SELECT * FROM {table}' # Write a SQL query to get the data from the database
          df = pd.read_sql_query(query, conn) # Use the read_sql_query function to get the data and save it as a DataFrame
+         df = correct_metadata(df)
          dfs.append(df)
      conn.close() # Close the connection
      return dfs

  def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10, change_plate=False):
-     from .io import _read_db
+
      from .utils import _split_data

      # Initialize an empty dictionary to store DataFrames by table name
@@ -2294,8 +2298,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      for idx, loc in enumerate(locs):
          db_dfs = _read_db(loc, tables)
          if change_plate:
-             db_dfs['plate'] = f'plate{idx+1}'
-             db_dfs['prc'] = db_dfs['plate'].astype(str) + '_' + db_dfs['row_name'].astype(str) + '_' + db_dfs['column_name'].astype(str)
+             db_dfs['plateID'] = f'plate{idx+1}'
+             db_dfs['prc'] = db_dfs['plateID'].astype(str) + '_' + db_dfs['rowID'].astype(str) + '_' + db_dfs['columnID'].astype(str)
          for table, df in zip(tables, db_dfs):
              data_dict[table].append(df)

@@ -2303,6 +2307,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      for table, dfs in data_dict.items():
          if dfs:
              data_dict[table] = pd.concat(dfs, axis=0)
+
              if verbose:
                  print(f"{table}: {len(data_dict[table])}")

@@ -2389,18 +2394,18 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      if 'png_list' in data_dict:
          png_list = data_dict['png_list'].copy()
          png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
-         png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
+         png_list_g_df_non_numeric.drop(columns=['plateID','rowID','columnID','fieldID','file_name','cell_id', 'prcf'], inplace=True)
          if verbose:
              print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
              print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
          merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
          merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
-
+
      # Add prc (plate row column) and prcfo (plate row column field object) columns
-     metadata = metadata.assign(prc=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'])
+     metadata = metadata.assign(prc=lambda x: x['plateID'] + '_' + x['rowID'] + '_' + x['columnID'])
      cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
      metadata = metadata.merge(cells_well, on='prc')
-     metadata = metadata.assign(prcfo=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'] + '_' + x['field'] + '_' + x['object_label'])
+     metadata = metadata.assign(prcfo=lambda x: x['plateID'] + '_' + x['rowID'] + '_' + x['columnID'] + '_' + x['fieldID'] + '_' + x['object_label'])
      metadata.set_index('prcfo', inplace=True)

      # Merge metadata with final merged DataFrame
@@ -2988,7 +2993,7 @@ def training_dataset_from_annotation(db_path, dst, annotation_column='test', ann

      return class_paths

- def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='column_name', class_metadata=['c1','c2']):
+ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='columnID', class_metadata=['c1','c2']):
      all_paths = []

      # Connect to the database and retrieve the image paths and annotations
@@ -3010,9 +3015,9 @@ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='t

      # Filter all_paths by metadata_type_by and class_metadata
      filtered_paths = []
-     metadata_index = {'row_name': 2, 'column_name': 3}.get(metadata_type_by, None)
+     metadata_index = {'rowID': 2, 'columnID': 3}.get(metadata_type_by, None)
      if metadata_index is None:
-         raise ValueError(f"Invalid metadata_type_by value: {metadata_type_by}. Must be 'row_name' or 'column_name'. {class_metadata} must be a list formatted as ['c1', 'c2'] or ['r1', 'r2']")
+         raise ValueError(f"Invalid metadata_type_by value: {metadata_type_by}. Must be 'rowID' or 'columnID'. {class_metadata} must be a list formatted as ['c1', 'c2'] or ['r1', 'r2']")

      for row in all_paths:
          if row[metadata_index] in class_metadata:
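
The hunks above consistently rename the metadata columns plate, row_name, column_name and field to plateID, rowID, columnID and fieldID. For a DataFrame exported with spacr 0.4.x, the equivalent rename is a one-liner; the mapping below is taken from the diff, while the helper name itself is only illustrative:

    import pandas as pd

    OLD_TO_NEW = {"plate": "plateID", "row_name": "rowID",
                  "column_name": "columnID", "field": "fieldID"}

    def upgrade_metadata_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Rename only the 0.4.x-style columns that are actually present.
        return df.rename(columns={old: new for old, new in OLD_TO_NEW.items() if old in df.columns})
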
@@ -3102,4 +3107,474 @@ def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
          test_class_dir = os.path.join(dst, f'test/{cls}')
          print(f'Train class {cls}: {len(os.listdir(train_class_dir))}, Test class {cls}: {len(os.listdir(test_class_dir))}')

-     return os.path.join(dst, 'train'), os.path.join(dst, 'test')
+     return os.path.join(dst, 'train'), os.path.join(dst, 'test')
+
+ def convert_separate_files_to_yokogawa(folder, regex):
+
+     ROWS = "ABCDEFGHIJKLMNOP"
+     COLS = [f"{i:02d}" for i in range(1, 25)]
+     WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+     def _get_next_well(used_wells):
+         plate = 1
+         for well in WELLS:
+             well_name = f"plate{plate}_{well}"
+             if well_name not in used_wells:
+                 return well_name
+             if well == "P24":
+                 plate += 1
+         return f"plate{plate}_A01"
+
+     pattern = re.compile(regex, re.I)
+
+     files_by_region = {}
+     rename_log = []
+     csv_path = os.path.join(folder, "rename_log.csv")
+     used_wells = set()
+     region_to_well = {}
+
+     # Group files by (plateID, wellID, fieldID, timeID, chanID)
+     for file in os.listdir(folder):
+         match = pattern.match(file)
+         if not match:
+             print(f"Skipping {file}: does not match regex.")
+             continue
+
+         meta = match.groupdict()
+
+         # Mandatory metadata
+         if 'wellID' not in meta or meta['wellID'] is None:
+             print(f"Skipping {file}: missing mandatory wellID.")
+             continue
+         wellID = meta['wellID']
+
+         # Optional metadata with defaults
+         plateID = meta.get('plateID', '1') or '1'
+         fieldID = meta.get('fieldID', '1') or '1'
+         timeID = int(meta.get('timeID', 1) or 1)
+         chanID = int(meta.get('chanID', 1) or 1)
+         sliceID = meta.get('sliceID')
+         sliceID = int(sliceID) if sliceID is not None else None
+
+         region_key = (plateID, wellID, fieldID, timeID, chanID)
+
+         files_by_region.setdefault(region_key, []).append((file, sliceID))
+
+     # Assign wells and process files per region
+     for region, file_list in files_by_region.items():
+         if region[:3] not in region_to_well:
+             next_well = _get_next_well(used_wells)
+             region_to_well[region[:3]] = next_well
+             used_wells.add(next_well)
+
+         assigned_well = region_to_well[region[:3]]
+         plateID, wellID, fieldID, timeID, chanID = region
+
+         # Check if multiple slices exist and are meaningful
+         slice_ids = [sid for _, sid in file_list if sid is not None]
+         unique_slices = set(slice_ids)
+
+         images = []
+         for filename, _ in sorted(file_list, key=lambda x: x[1] or 1):
+             img = tifffile.imread(os.path.join(folder, filename))
+             images.append(img)
+
+         # Perform MIP only if multiple unique slices are present
+         if len(unique_slices) > 1:
+             img_to_save = np.max(np.stack(images), axis=0)
+         else:
+             img_to_save = images[0]
+
+         dtype = img_to_save.dtype
+
+         new_filename = f"{assigned_well}_T{timeID:04d}F{int(fieldID):03d}L01C{chanID:02d}.tif"
+         new_filepath = os.path.join(folder, new_filename)
+         tifffile.imwrite(new_filepath, img_to_save.astype(dtype))
+
+         # Log original filenames involved in MIP or single file rename
+         original_files = ";".join(f[0] for f in file_list)
+         rename_log.append({"Original File(s)": original_files, "Renamed TIFF": new_filename})
+
+     pd.DataFrame(rename_log).to_csv(csv_path, index=False)
+     print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
+
+ def convert_to_yokogawa(folder):
+     """
+     Detects file type in the folder and converts them
+     to Yokogawa-style naming with Maximum Intensity Projection (MIP).
+     """
+
+     def _get_next_well(used_wells):
+         """
+         Determines the next available well position across multiple 384-well plates.
+         """
+         ROWS = "ABCDEFGHIJKLMNOP"
+         COLS = [f"{i:02d}" for i in range(1, 25)]
+         WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+         plate = 1
+         while True:
+             for well in WELLS:
+                 well_name = f"plate{plate}_{well}"
+                 if well_name not in used_wells:
+                     used_wells.add(well_name)
+                     return well_name
+             plate += 1  # All wells exhausted in current plate, increment to next plate
+
+     # Define 384-well plate format
+     ROWS = "ABCDEFGHIJKLMNOP"
+     COLS = [f"{i:02d}" for i in range(1, 25)]
+     WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+     filenames = []
+     rename_log = []
+     csv_path = os.path.join(folder, "rename_log.csv")
+     used_wells = set()
+
+     # **Dictionary to store well assignments per original file**
+     file_to_well = {}
+
+     for file in os.listdir(folder):
+         path = os.path.join(folder, file)
+         ext = file.lower().split('.')[-1]
+
+         # **Assign a well only once per original file**
+         if file not in file_to_well:
+             file_to_well[file] = _get_next_well(used_wells)
+             #used_wells.add(file_to_well[file]) # Mark it as used
+
+         well = file_to_well[file] # Use the same well for all channels/times
+
+         ### **Process Nikon ND2 Files**
+         if ext == 'nd2':
+             try:
+                 nd2 = ND2Reader(path)
+                 metadata = nd2.metadata
+
+                 timepoints = list(range(len(metadata.get("frames", [0])))) or [0]
+                 fields = list(range(len(metadata.get("fields_of_view", [0])))) or [0]
+                 z_levels = list(metadata.get("z_levels", range(1))) if metadata.get("z_levels") else [0]
+                 channels = metadata.get("channels", [])
+
+                 for t_idx in timepoints:
+                     for f_idx in fields:
+                         for c_idx, channel in enumerate(channels):
+                             try:
+                                 mip_image = np.max.reduce([
+                                     nd2.get_frame_2D(t=t_idx, v=f_idx, z=z_idx, c=c_idx)
+                                     for z_idx in z_levels
+                                 ], axis=0)
+
+                                 dtype = mip_image.dtype
+                                 filename = f"{well}_T{t_idx+1:04d}F{f_idx+1:03d}L01C{c_idx+1:02d}.tif"
+                                 filepath = os.path.join(folder, filename)
+
+                                 tifffile.imwrite(filepath, mip_image.astype(dtype))
+                                 rename_log.append({"Original File": file,
+                                                    "Renamed TIFF": filename,
+                                                    "ext": ext,
+                                                    "time": t_idx,
+                                                    "field": f_idx,
+                                                    "channel": channel,
+                                                    "z": z_levels})
+
+                             except IndexError:
+                                 print(f"Warning: ND2 file {file} has an incomplete data structure. Skipping.")
+
+             except Exception as e:
+                 print(f"Error processing ND2 file {file}: {e}")
+
+         elif ext == 'czi':
+             try:
+                 # Open the CZI in streaming mode
+                 with pyczi.open_czi(path) as czidoc:
+
+                     # 1) Global dimension ranges
+                     bbox = czidoc.total_bounding_box
+                     _, tlen = bbox.get('T', (0,1))
+                     _, clen = bbox.get('C', (0,1))
+                     _, zlen = bbox.get('Z', (0,1))
+
+                     # 2) Scene → list of scene indices
+                     scenes_bb = czidoc.scenes_bounding_rectangle
+                     scenes = sorted(scenes_bb.keys()) if scenes_bb else [None]
+
+                     # 3) Output folder (same as .czi)
+                     folder = os.path.dirname(path)
+
+                     # 4) Loop scene × time × channel × Z
+                     for scene in scenes:
+                         # *** assign a unique well for this scene ***
+                         scene_well = _get_next_well(used_wells)
+
+                         # Field index = scene+1 (or 1 if no scene)
+                         F_idx = scene + 1 if scene is not None else 1
+                         # Scene index for “A”
+                         A_idx = scene + 1 if scene is not None else 1
+
+                         for t in range(tlen):
+                             for c in range(clen):
+                                 for z in range(zlen):
+                                     # Read exactly one 2D plane
+                                     arr = czidoc.read(
+                                         plane={'T': t, 'C': c, 'Z': z},
+                                         scene=scene
+                                     )
+                                     plane = np.squeeze(arr)
+
+                                     # Build Yokogawa‐style filename:
+                                     fn = (
+                                         f"{scene_well}_"
+                                         f"T{t+1:04d}"
+                                         f"F{F_idx:03d}"
+                                         f"L01"
+                                         f"A{A_idx:02d}"
+                                         f"Z{z+1:02d}"
+                                         f"C{c+1:02d}.tif"
+                                     )
+                                     outpath = os.path.join(folder, fn)
+
+                                     # Write with lossless compression
+                                     tifffile.imwrite(
+                                         outpath,
+                                         plane.astype(plane.dtype),
+                                         compression='zlib'
+                                     )
+
+                                     # Log it
+                                     rename_log.append({
+                                         "Original File": file,
+                                         "Renamed TIFF": fn,
+                                         "ext": ext,
+                                         "scene": scene,
+                                         "time": t,
+                                         "slice": z,
+                                         "field": F_idx,
+                                         "channel": c,
+                                         "well": scene_well
+                                     })
+
+             except Exception as e:
+                 print(f"Error processing CZI file {file}: {e}")
+
+         ### **Process Leica LIF Files**
+         elif ext == 'lif':
+             try:
+                 lif_file = readlif.Reader(path)
+
+                 for image_idx, image in enumerate(lif_file.getIterImage()):
+                     timepoints = range(getattr(image.dims, 't', 1))
+                     z_levels = range(getattr(image.dims, 'z', 1))
+                     channels = range(getattr(image.dims, 'c', 1))
+
+                     for t_idx in timepoints:
+                         for c_idx in channels:
+                             z_stack = []
+                             for z_idx in z_levels:
+                                 try:
+                                     frame = image.getFrame(z=z_idx, t=t_idx, c=c_idx)
+                                     z_stack.append(frame)
+                                 except IndexError:
+                                     print(f"Missing frame: T{t_idx}, Z{z_idx}, C{c_idx} in {file}, skipping frame.")
+
+                             if z_stack:
+                                 mip_image = np.max(np.stack(z_stack), axis=0)
+                                 dtype = mip_image.dtype
+                                 filename = f"{well}_T{t_idx+1:04d}F{image_idx+1:03d}L01C{c_idx+1:02d}.tif"
+                                 filepath = os.path.join(folder, filename)
+
+                                 tifffile.imwrite(filepath, mip_image.astype(dtype))
+                                 rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+             except Exception as e:
+                 print(f"Error processing LIF file {file}: {e}")
+
+         ### **Process Standard Image Files (TIFF, PNG, JPEG, BMP)**
+         elif ext in ['tif', 'tiff', 'png', 'jpg', 'jpeg', 'bmp'] and not file.startswith("plate"):
+             try:
+                 with tifffile.TiffFile(path) as tif:
+                     images = tif.asarray()
+                     ndim = images.ndim
+
+                     # Defaults
+                     t_dim = z_dim = c_dim = 1
+
+                     # Determine dimensions more explicitly
+                     if ndim == 2:
+                         mip_image = images
+                         filename = f"{well}_T0001F001L01C01.tif"
+                         tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                         rename_log.append({"Original File": file, "Renamed TIFF": filename})
+                         continue
+
+                     elif ndim == 3:
+                         if images.shape[0] <= 4: # Likely channels
+                             c_dim = images.shape[0]
+                             for c in range(c_dim):
+                                 mip_image = images[c, :, :]
+                                 filename = f"{well}_T0001F001L01C{c+1:02d}.tif"
+                                 tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                                 rename_log.append({"Original File": file, "Renamed TIFF": filename})
+                         else: # Z-stack
+                             mip_image = np.max(images, axis=0)
+                             filename = f"{well}_T0001F001L01C01.tif"
+                             tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                             rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+                     elif ndim == 4:
+                         t_dim, z_dim, y_dim, x_dim = images.shape
+                         for t in range(t_dim):
+                             mip_image = np.max(images[t, :, :, :], axis=0)
+                             filename = f"{well}_T{t+1:04d}F001L01C01.tif"
+                             tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                             rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+                     else:
+                         raise ValueError(f"Unsupported TIFF dimensions: {images.shape}")
+
+             except Exception as e:
+                 print(f"Error processing standard image file {file}: {e}")
+
+     # Save rename log as CSV
+     pd.DataFrame(rename_log).to_csv(csv_path, index=False)
+     print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
+
+ def apply_augmentation(image, method):
+     if method == 'rotate90':
+         return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
+     elif method == 'rotate180':
+         return cv2.rotate(image, cv2.ROTATE_180)
+     elif method == 'rotate270':
+         return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
+     elif method == 'flip_h':
+         return cv2.flip(image, 1)
+     elif method == 'flip_v':
+         return cv2.flip(image, 0)
+     return image
+
+ def process_instruction(entry):
+     img = tifffile.imread(entry["src_img"])
+     msk = tifffile.imread(entry["src_msk"])
+     if entry["augment"]:
+         img = apply_augmentation(img, entry["augment"])
+         msk = apply_augmentation(msk, entry["augment"])
+     tifffile.imwrite(entry["dst_img"], img)
+     tifffile.imwrite(entry["dst_msk"], msk)
+     return 1
+
+ def prepare_cellpose_dataset(input_root, augment_data=False, train_fraction=0.8, n_jobs=None):
+
+     from .utils import print_progress
+
+     time_ls = []
+     input_root = os.path.abspath(input_root)
+     output_root = os.path.join(input_root, "cellpose_dataset")
+
+     def get_augmentations():
+         return ['rotate90', 'rotate180', 'rotate270', 'flip_h', 'flip_v']
+
+     def find_image_mask_pairs(dataset_path):
+         mask_dir = os.path.join(dataset_path, "masks")
+         pairs = []
+         for fname in os.listdir(dataset_path):
+             if fname.lower().endswith((".tif", ".tiff")):
+                 img_path = os.path.join(dataset_path, fname)
+                 msk_path = os.path.join(mask_dir, fname)
+                 if os.path.isfile(msk_path):
+                     pairs.append((img_path, msk_path))
+         return pairs
+
+     def prepare_output_folders(base):
+         for subset in ["train", "test"]:
+             os.makedirs(os.path.join(base, subset, "images"), exist_ok=True)
+             os.makedirs(os.path.join(base, subset, "masks"), exist_ok=True)
+
+     print("Scanning datasets...")
+     datasets = []
+     for subdir in os.listdir(input_root):
+         dataset_path = os.path.join(input_root, subdir)
+         if os.path.isdir(dataset_path) and os.path.isdir(os.path.join(dataset_path, "masks")):
+             pairs = find_image_mask_pairs(dataset_path)
+             if pairs:
+                 datasets.append(pairs)
+                 print(f"  Found {len(pairs)} images in {dataset_path}")
+
+     if not datasets:
+         raise ValueError("No valid datasets with images and masks found.")
+
+     prepare_output_folders(output_root)
+
+     min_size = min(len(pairs) for pairs in datasets)
+     target_size = min_size if not augment_data else max(len(pairs) for pairs in datasets)
+
+     print("\nPreparing instruction list...")
+     instructions = []
+     global_index = 0
+
+     for pairs in datasets:
+         dataset_len = len(pairs)
+
+         # --- Step 1: Sample or augment ---
+         sampled_pairs = []
+         if dataset_len >= target_size:
+             sampled_pairs = random.sample(pairs, target_size)
+         else:
+             sampled_pairs = pairs.copy()
+             if augment_data:
+                 needed = target_size - dataset_len
+                 aug_methods = get_augmentations()
+                 full_loops = needed // len(aug_methods)
+                 extra = needed % len(aug_methods)
+
+                 for _ in range(full_loops):
+                     for (img_path, msk_path), aug in zip(pairs, aug_methods * (dataset_len // len(aug_methods))):
+                         sampled_pairs.append((img_path, msk_path, aug))
+                 if extra > 0:
+                     subset = random.sample(pairs * ((extra // len(aug_methods)) + 1), extra)
+                     for (img_path, msk_path), aug in zip(subset, aug_methods[:extra]):
+                         sampled_pairs.append((img_path, msk_path, aug))
+
+         # Add "no augmentation" tag to original files
+         augmented_sampled = [
+             (tup[0], tup[1], None) if len(tup) == 2 else tup
+             for tup in sampled_pairs
+         ]
+
+         # --- Step 2: Split into train/test ---
+         random.shuffle(augmented_sampled)
+         split_idx = int(train_fraction * len(augmented_sampled))
+         split_sets = {
+             "train": augmented_sampled[:split_idx],
+             "test": augmented_sampled[split_idx:]
+         }
+
+         for subset, items in split_sets.items():
+             for img_path, msk_path, aug in items:
+                 dst_img = os.path.join(output_root, subset, "images", f"{global_index:05d}.tif")
+                 dst_msk = os.path.join(output_root, subset, "masks", f"{global_index:05d}.tif")
+                 instructions.append({
+                     "src_img": img_path,
+                     "src_msk": msk_path,
+                     "dst_img": dst_img,
+                     "dst_msk": dst_msk,
+                     "augment": aug
+                 })
+                 global_index += 1
+
+     print(f"Total files to process: {len(instructions)}")
+
+     # --- Step 3: Process with multiprocessing ---
+     print("Processing images with multiprocessing...")
+
+     if n_jobs is None:
+         n_jobs = max(1, cpu_count() - 1)
+     else:
+         n_jobs = int(n_jobs)
+
+     with Pool(n_jobs) as pool:
+         for i, _ in enumerate(pool.imap_unordered(process_instruction, instructions), 1):
+             print_progress(i, len(instructions), n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type="cellpose dataset")
+
+     print(f"Done. Dataset saved to: {output_root}")
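
The new io.py helpers are plain module-level functions, so they can be driven directly from Python. A minimal usage sketch, untested; the folder paths and the filename regex are illustrative assumptions, while the function names, signatures and the named regex groups come from the added code above:

    from spacr.io import (convert_separate_files_to_yokogawa,
                          convert_to_yokogawa, prepare_cellpose_dataset)

    # convert_separate_files_to_yokogawa() expects a regex with named groups;
    # wellID is mandatory, plateID/fieldID/timeID/chanID/sliceID are optional.
    # Assumed filename layout: "A01_f01_z03_ch1_t05.tif"
    regex = (r"(?P<wellID>[A-P]\d{2})"
             r"_f(?P<fieldID>\d+)"
             r"_z(?P<sliceID>\d+)"
             r"_ch(?P<chanID>\d+)"
             r"_t(?P<timeID>\d+)\.tif$")
    convert_separate_files_to_yokogawa("/data/plate1_raw", regex)

    # convert_to_yokogawa() instead detects nd2/czi/lif/standard image files by
    # extension and writes MIP TIFFs plus a rename_log.csv next to the originals.
    convert_to_yokogawa("/data/mixed_microscope_exports")

    # prepare_cellpose_dataset() builds train/test image and mask folders from
    # subfolders that each contain a "masks" directory.
    prepare_cellpose_dataset("/data/annotated", augment_data=True,
                             train_fraction=0.8, n_jobs=4)
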