spacr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. spacr/__init__.py +2 -2
  2. spacr/core.py +52 -10
  3. spacr/deep_spacr.py +2 -3
  4. spacr/gui.py +0 -1
  5. spacr/gui_core.py +247 -41
  6. spacr/gui_elements.py +133 -2
  7. spacr/gui_utils.py +22 -17
  8. spacr/io.py +624 -149
  9. spacr/ml.py +141 -258
  10. spacr/plot.py +76 -34
  11. spacr/resources/MEDIAR/__pycache__/SetupDict.cpython-39.pyc +0 -0
  12. spacr/resources/MEDIAR/__pycache__/evaluate.cpython-39.pyc +0 -0
  13. spacr/resources/MEDIAR/__pycache__/generate_mapping.cpython-39.pyc +0 -0
  14. spacr/resources/MEDIAR/__pycache__/main.cpython-39.pyc +0 -0
  15. spacr/resources/MEDIAR/core/Baseline/__pycache__/Predictor.cpython-39.pyc +0 -0
  16. spacr/resources/MEDIAR/core/Baseline/__pycache__/Trainer.cpython-39.pyc +0 -0
  17. spacr/resources/MEDIAR/core/Baseline/__pycache__/__init__.cpython-39.pyc +0 -0
  18. spacr/resources/MEDIAR/core/Baseline/__pycache__/utils.cpython-39.pyc +0 -0
  19. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/EnsemblePredictor.cpython-39.pyc +0 -0
  20. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Predictor.cpython-39.pyc +0 -0
  21. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Trainer.cpython-39.pyc +0 -0
  22. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/__init__.cpython-39.pyc +0 -0
  23. spacr/resources/MEDIAR/core/MEDIAR/__pycache__/utils.cpython-39.pyc +0 -0
  24. spacr/resources/MEDIAR/core/__pycache__/BasePredictor.cpython-39.pyc +0 -0
  25. spacr/resources/MEDIAR/core/__pycache__/BaseTrainer.cpython-39.pyc +0 -0
  26. spacr/resources/MEDIAR/core/__pycache__/__init__.cpython-39.pyc +0 -0
  27. spacr/resources/MEDIAR/core/__pycache__/utils.cpython-39.pyc +0 -0
  28. spacr/resources/MEDIAR/train_tools/__pycache__/__init__.cpython-39.pyc +0 -0
  29. spacr/resources/MEDIAR/train_tools/__pycache__/measures.cpython-39.pyc +0 -0
  30. spacr/resources/MEDIAR/train_tools/__pycache__/utils.cpython-39.pyc +0 -0
  31. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/__init__.cpython-39.pyc +0 -0
  32. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/datasetter.cpython-39.pyc +0 -0
  33. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/transforms.cpython-39.pyc +0 -0
  34. spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/utils.cpython-39.pyc +0 -0
  35. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/CellAware.cpython-39.pyc +0 -0
  36. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/LoadImage.cpython-39.pyc +0 -0
  37. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/NormalizeImage.cpython-39.pyc +0 -0
  38. spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/__init__.cpython-39.pyc +0 -0
  39. spacr/resources/MEDIAR/train_tools/models/__pycache__/MEDIARFormer.cpython-39.pyc +0 -0
  40. spacr/resources/MEDIAR/train_tools/models/__pycache__/__init__.cpython-39.pyc +0 -0
  41. spacr/sequencing.py +73 -38
  42. spacr/settings.py +161 -135
  43. spacr/submodules.py +618 -215
  44. spacr/timelapse.py +197 -29
  45. spacr/toxo.py +23 -23
  46. spacr/utils.py +186 -128
  47. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/METADATA +5 -2
  48. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/RECORD +53 -24
  49. spacr/stats.py +0 -221
  50. /spacr/{cellpose.py → spacr_cellpose.py} +0 -0
  51. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/LICENSE +0 -0
  52. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/WHEEL +0 -0
  53. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/entry_points.txt +0 -0
  54. {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/top_level.txt +0 -0
spacr/io.py CHANGED
@@ -1,4 +1,4 @@
- import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue, tifffile, czifile, atexit, datetime
+ import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue, tifffile, czifile, atexit, datetime, traceback
  import numpy as np
  import pandas as pd
  from PIL import Image, ImageOps
@@ -23,6 +23,8 @@ import seaborn as sns
  from nd2reader import ND2Reader
  from torchvision import transforms
  from sklearn.model_selection import train_test_split
+ import readlif
+ from pylibCZIrw import czi as pyczi

  def process_non_tif_non_2D_images(folder):
      """Processes all images in the folder and splits them into grayscale channels, preserving bit depth."""
@@ -131,58 +133,61 @@ def process_non_tif_non_2D_images(folder):

  def _load_images_and_labels(image_files, label_files, invert=False):

-     from .utils import invert_image, apply_mask
+     from .utils import invert_image

      images = []
      labels = []
-
-     if not image_files is None:
-         image_names = sorted([os.path.basename(f) for f in image_files])
-     else:
-         image_names = []
-
-     if not label_files is None:
-         label_names = sorted([os.path.basename(f) for f in label_files])
-     else:
-         label_names = []

-     if not image_files is None and not label_files is None:
+     image_names = sorted([os.path.basename(f) for f in image_files]) if image_files else []
+     label_names = sorted([os.path.basename(f) for f in label_files]) if label_files else []
+
+     if image_files and label_files:
          for img_file, lbl_file in zip(image_files, label_files):
              image = cellpose.io.imread(img_file)
+             if image is None:
+                 print(f"WARNING: Could not load image: {img_file}")
+                 continue
              if invert:
                  image = invert_image(image)
-             label = cellpose.io.imread(lbl_file)
              if image.max() > 1:
                  image = image / image.max()
+
+             label = cellpose.io.imread(lbl_file)
+             if label is None:
+                 print(f"WARNING: Could not load label: {lbl_file}")
+                 continue
+
              images.append(image)
              labels.append(label)
-     elif not image_files is None:
+
+     elif image_files:
          for img_file in image_files:
              image = cellpose.io.imread(img_file)
+             if image is None:
+                 print(f"WARNING: Could not load image: {img_file}")
+                 continue
              if invert:
                  image = invert_image(image)
              if image.max() > 1:
                  image = image / image.max()
              images.append(image)
-     elif not image_files is None:
-         for lbl_file in label_files:
-             label = cellpose.io.imread(lbl_file)
+
+     elif label_files:
+         for lbl_file in label_files:
+             label = cellpose.io.imread(lbl_file)
+             if label is None:
+                 print(f"WARNING: Could not load label: {lbl_file}")
+                 continue
              labels.append(label)
-
-     if not image_files is None:
-         image_dir = os.path.dirname(image_files[0])
-     else:
-         image_dir = None
-
-     if not label_files is None:
-         label_dir = os.path.dirname(label_files[0])
-     else:
-         label_dir = None
-
-     # Log the number of loaded images and labels
+
+     image_dir = os.path.dirname(image_files[0]) if image_files else None
+     label_dir = os.path.dirname(label_files[0]) if label_files else None
+
      print(f'Loaded {len(images)} images and {len(labels)} labels from {image_dir} and {label_dir}')
-     if len(labels) > 0 and len(images) > 0:
-         print(f'image shape: {images[0].shape}, image type: images[0].shape mask shape: {labels[0].shape}, image type: labels[0].shape')
+     if images and labels:
+         print(f'image shape: {images[0].shape}, image type: {images[0].dtype}; '
+               f'label shape: {labels[0].shape}, label type: {labels[0].dtype}')
+
      return images, labels, image_names, label_names

  def _load_normalized_images_and_labels(image_files, label_files, channels=None, percentiles=None,
@@ -647,8 +652,8 @@ def load_images_from_paths(images_by_key):

      return images_dict

- #@log_function_call
- def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=False, skip_mode='01', metadata_type='', img_format='.tif'):
+ #@log_function_call
+ def _rename_and_organize_image_files(src, regex, batch_size=100, metadata_type='', img_format='.tif', timelapse=False):
      """
      Convert z-stack images to maximum intensity projection (MIP) images.

@@ -656,24 +661,26 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
          src (str): The source directory containing the z-stack images.
          regex (str): The regular expression pattern used to match the filenames of the z-stack images.
          batch_size (int, optional): The number of images to process in each batch. Defaults to 100.
-         pick_slice (bool, optional): Whether to pick a specific slice based on the provided skip mode. Defaults to False.
-         skip_mode (str, optional): The skip mode used to filter out specific slices. Defaults to '01'.
          metadata_type (str, optional): The type of metadata associated with the images. Defaults to ''.

      Returns:
          None
      """

+     if isinstance(img_format, str):
+         img_format = [img_format]
+
      from .utils import _extract_filename_metadata, print_progress

      regular_expression = re.compile(regex)
      stack_path = os.path.join(src, 'stack')
      files_processed = 0
      if not os.path.exists(stack_path) or (os.path.isdir(stack_path) and len(os.listdir(stack_path)) == 0):
-         all_filenames = [filename for filename in os.listdir(src) if filename.endswith(img_format)]
+         all_filenames = [filename for filename in os.listdir(src) if any(filename.endswith(ext) for ext in img_format)]
          print(f'All files: {len(all_filenames)} in {src}')
+         all_filenames = [f for f in all_filenames if not f.startswith('.')] #Exclude hidden files
          time_ls = []
-         image_paths_by_key = _extract_filename_metadata(all_filenames, src, regular_expression, metadata_type, pick_slice, skip_mode)
+         image_paths_by_key = _extract_filename_metadata(all_filenames, src, regular_expression, metadata_type)
          # Convert dictionary keys to a list for batching
          batching_keys = list(image_paths_by_key.keys())
          print(f'All unique FOV: {len(image_paths_by_key)} in {src}')
@@ -684,56 +691,43 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
              batch_keys = batching_keys[idx:idx+batch_size]
              batch_images_by_key = {key: image_paths_by_key[key] for key in batch_keys}
              images_by_key = load_images_from_paths(batch_images_by_key)
-
-             if pick_slice:
-                 for i, key in enumerate(images_by_key):
-                     plate, well, field, channel, mode = key
-                     max_intensity_slice = max(images_by_key[key], key=lambda x: np.percentile(x, 90))
-                     mip_image = Image.fromarray(max_intensity_slice)
-                     output_dir = os.path.join(src, channel)
-                     os.makedirs(output_dir, exist_ok=True)
-                     output_filename = f'{plate}_{well}_{field}.tif'
-                     output_path = os.path.join(output_dir, output_filename)
-                     files_processed += 1
-                     stop = time.time()
-                     duration = stop - start
-                     time_ls.append(duration)
-                     files_to_process = len(all_filenames)
-                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-
-                     if not os.path.exists(output_path):
-                         mip_image.save(output_path)
-                     else:
-                         print(f'WARNING: A file with the same name already exists at location {output_filename}')
-             else:
-                 for i, (key, images) in enumerate(images_by_key.items()):
-                     plate, well, field, channel = key[:4]
-                     output_dir = os.path.join(src, channel)
-                     mip = np.max(np.stack(images), axis=0)
-                     mip_image = Image.fromarray(mip)
-                     os.makedirs(output_dir, exist_ok=True)
+
+             # Process each batch of images
+             for i, (key, images) in enumerate(images_by_key.items()):
+
+                 plate, well, field, channel, timeID, sliceID = key
+
+                 if timelapse:
                      output_filename = f'{plate}_{well}_{field}.tif'
-                     output_path = os.path.join(output_dir, output_filename)
-                     files_processed += 1
-                     stop = time.time()
-                     duration = stop - start
-                     time_ls.append(duration)
-                     files_to_process = len(all_filenames)
-                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-
-                     if not os.path.exists(output_path):
-                         mip_image.save(output_path)
-                     else:
-                         print(f'WARNING: A file with the same name already exists at location {output_filename}')
+                 else:
+                     output_filename = f'{plate}_{well}_{field}_{timeID}.tif'
+
+                 output_dir = os.path.join(src, channel)
+                 os.makedirs(output_dir, exist_ok=True)
+                 output_path = os.path.join(output_dir, output_filename)
+                 mip = np.max(np.stack(images), axis=0)
+                 mip_image = Image.fromarray(mip)
+
+                 files_processed += 1
+                 stop = time.time()
+                 duration = stop - start
+                 time_ls.append(duration)
+                 files_to_process = len(all_filenames)
+                 print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
+
+                 if not os.path.exists(output_path):
+                     mip_image.save(output_path)
+                 else:
+                     print(f'WARNING: A file with the same name already exists at location {output_filename}')

              images_by_key.clear()

          # Move original images to a new directory
-         valid_exts = [img_format]
          newpath = os.path.join(src, 'orig')
          os.makedirs(newpath, exist_ok=True)
          for filename in os.listdir(src):
-             if os.path.splitext(filename)[1] in valid_exts:
+             #print(f"{filename}: {os.path.splitext(filename)[1]}")
+             if os.path.splitext(filename)[1] in img_format:
                  move = os.path.join(newpath, filename)
                  if os.path.exists(move):
                      print(f'WARNING: A file with the same name already exists at location {move}')
@@ -1236,7 +1230,11 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
      files_processed = 0
      for i, path in enumerate(paths):
          start = time.time()
-         array = np.load(path)
+         try:
+             array = np.load(path)
+         except Exception as e:
+             print(f"Error loading file {path}: {e}")
+             continue
          stack_ls.append(array)
          filenames_batch.append(os.path.basename(path))
          stop = time.time()
@@ -1564,30 +1562,34 @@ def preprocess_img_data(settings):
          save_dtype (type, optional): The data type used for saving the preprocessed images. Defaults to np.float32.
          randomize (bool, optional): Whether to randomize the order of the images. Defaults to True.
          all_to_mip (bool, optional): Whether to convert all images to MIP. Defaults to False.
-         pick_slice (bool, optional): Whether to pick a specific slice based on the provided skip mode. Defaults to False.
-         skip_mode (str, optional): The skip mode used to filter out specific slices. Defaults to '01'.
          settings (dict, optional): Additional settings for preprocessing. Defaults to {}.

      Returns:
          None
      """
-
+
      src = settings['src']
-     valid_ext = ['tif', 'tiff', 'png', 'jpeg']
+     delete_empty_subdirectories(src)
      files = os.listdir(src)
-     extensions = [file.split('.')[-1] for file in files]
-     extension_counts = Counter(extensions)
-     most_common_extension = extension_counts.most_common(1)[0][0]
-     img_format = None

-     delete_empty_subdirectories(src)
+     valid_ext = ['tif', 'tiff', 'png', 'jpg', 'jpeg', 'bmp', 'nd2', 'czi', 'lif']
+     extensions = [file.split('.')[-1].lower() for file in files]
+     # Filter only valid extensions
+     valid_extensions = [ext for ext in extensions if ext in valid_ext]

-     # Check if the most common extension is one of the specified image formats
-     if most_common_extension in valid_ext:
-         img_format = f'.{most_common_extension}'
-         print(f'Found {extension_counts[most_common_extension]} {most_common_extension} files')
+     # Determine most common valid extension
+     img_format = None
+     if valid_extensions:
+         extension_counts = Counter(valid_extensions)
+         most_common_extension = Counter(valid_extensions).most_common(1)[0][0]
+         img_format = most_common_extension
+
+         print(f"Found {extension_counts[most_common_extension]} {most_common_extension} files")
+
      else:
-         print(f'Could not find any {valid_ext} files in {src} only found {extension_counts[0]}')
+         print(f"Could not find any {valid_ext} files in {src} only found {extension_counts[0]}")
+         print(f"{files} in {src}")
+         print(f"Please check the folder and try again")

      if os.path.exists(os.path.join(src,'stack')):
          print('Found existing stack folder.')
@@ -1598,23 +1600,24 @@ def preprocess_img_data(settings):
          return settings, src

      mask_channels = [settings['nucleus_channel'], settings['cell_channel'], settings['pathogen_channel']]
-     backgrounds = [settings['nucleus_background'], settings['cell_background'], settings['pathogen_background']]

-     settings, metadata_type, custom_regex, nr, plot, batch_size, timelapse, lower_percentile, randomize, all_to_mip, pick_slice, skip_mode, cmap, figuresize, normalize, save_dtype, test_mode, test_images, random_test = set_default_settings_preprocess_img_data(settings)
+     settings = set_default_settings_preprocess_img_data(settings)

-     regex = _get_regex(metadata_type, img_format, custom_regex)
-
-     if test_mode:
+     regex = _get_regex(settings['metadata_type'], img_format, settings['custom_regex'])
+
+     if settings['test_mode']:

-         print(f'Running spacr in test mode')
+         print(f"Running spacr in test mode")
          settings['plot'] = True
          try:
              os.rmdir(os.path.join(src, 'test'))
              print(f"Deleted test directory: {os.path.join(src, 'test')}")
          except OSError as e:
+             print(f"Error deleting test directory: {e}")
+             print(f"Delete manually before running test mode")
              pass

-         src = _run_test_mode(settings['src'], regex, timelapse, test_images, random_test)
+         src = _run_test_mode(settings['src'], regex, settings['timelapse'], settings['test_images'], settings['random_test'])
          settings['src'] = src

      stack_path = os.path.join(src, 'stack')
@@ -1625,46 +1628,45 @@ def preprocess_img_data(settings):
      if not os.path.exists(stack_path):
          try:
              if not img_format == None:
-                 if timelapse:
-                     _move_to_chan_folder(src, regex, timelapse, metadata_type)
-                 else:
-                     _rename_and_organize_image_files(src, regex, batch_size, pick_slice, skip_mode, metadata_type, img_format)
-
-                 #Make sure no batches will be of only one image
-                 all_imgs = len(stack_path)
-                 full_batches = all_imgs // batch_size
-                 last_batch_size = all_imgs % batch_size
-
-                 # Check if the last batch is of size 1
-                 if last_batch_size == 1:
-                     # If there's only one batch and its size is 1, it's also an issue
-                     if full_batches == 0:
-                         raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
-                     # If the last batch is of size 1, merge it with the second last batch
-                     elif full_batches > 0:
-                         print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
-                         raise ValueError("Last batch of size 1 detected. Adjust the batch size.")
+                 img_format = ['.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.nd2', '.czi', '.lif']
+                 _rename_and_organize_image_files(src, regex, settings['batch_size'], settings['metadata_type'], img_format)
+
+             #Make sure no batches will be of only one image
+             all_imgs = len(stack_path)
+             full_batches = all_imgs // settings['batch_size']
+             last_batch_size = all_imgs % settings['batch_size']
+
+             # Check if the last batch is of size 1
+             if last_batch_size == 1:
+                 # If there's only one batch and its size is 1, it's also an issue
+                 if full_batches == 0:
+                     raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
+                 # If the last batch is of size 1, merge it with the second last batch
+                 elif full_batches > 0:
+                     print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
+                     raise ValueError("Last batch of size 1 detected. Adjust the batch size.")

              nr_channel_folders = _merge_channels(src, plot=False)

              if len(settings['channels']) != nr_channel_folders:
                  print(f"Number of channels does not match number of channel folders. channels: {settings['channels']} channel folders: {nr_channel_folders}")
                  new_channels = list(range(nr_channel_folders))
-                 print(f"Setting channels to {new_channels}")
+                 print(f"Changing channels from {settings['channels']} to {new_channels}")
                  settings['channels'] = new_channels

-             if timelapse:
-                 _create_movies_from_npy_per_channel(stack_path, fps=2)
+             if settings['timelapse']:
+                 _create_movies_from_npy_per_channel(stack_path, fps=settings['fps'])

-             if plot:
-                 print(f'plotting {nr} images from {src}/stack')
-                 plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
+             if settings['plot']:
+                 print(f"plotting {settings['nr']} images from {src}/stack")
+                 plot_arrays(stack_path, settings['figuresize'], settings['cmap'], nr=settings['nr'], normalize=settings['normalize'])

-             if all_to_mip:
+             if settings['all_to_mip']:
                  _mip_all(stack_path)
-                 if plot:
-                     print(f'plotting {nr} images from {src}/stack')
-                     plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
+                 if settings['plot']:
+                     print(f"plotting {settings['nr']} images from {src}/stack")
+                     plot_arrays(stack_path, settings['figuresize'], settings['cmap'], nr=settings['nr'], normalize=settings['normalize'])
+
          except Exception as e:
              print(f"Error: {e}")

@@ -1673,9 +1675,6 @@ def preprocess_img_data(settings):
                                save_dtype=np.float32,
                                settings=settings)

-     #if plot:
-     #    _plot_4D_arrays(src+'/norm_channel_stack', nr_npz=1, nr=nr)
-
      return settings, src

  def _check_masks(batch, batch_filenames, output_folder):
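
The preprocess_img_data() refactor above reads every option from the settings dict returned by set_default_settings_preprocess_img_data() instead of unpacking a long tuple. A minimal call sketch, untested; only the keys shown in the hunks above come from the diff, the values are illustrative assumptions and any omitted keys are filled in by the defaults helper:

    from spacr.io import preprocess_img_data

    settings = {
        "src": "/data/plate1",        # assumed path to the raw exports
        "metadata_type": "custom",    # assumed value; must name a preset understood by _get_regex()
        "custom_regex": None,
        "batch_size": 100,
        "timelapse": False,
        "test_mode": False,
        "channels": [0, 1, 2],
        "nucleus_channel": 0,
        "cell_channel": 1,
        "pathogen_channel": 2,
    }

    # Returns the (possibly updated) settings dict and the working source folder.
    settings, src = preprocess_img_data(settings)
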
@@ -1780,11 +1779,11 @@ def _read_and_join_tables(db_path, table_names=['cell', 'cytoplasm', 'nucleus',
              print(e)
      conn.close()
      if 'png_list' in dataframes:
-         png_list_df = dataframes['png_list'][['cell_id', 'png_path', 'plate', 'row_name', 'column_name', 'field']].copy()
+         png_list_df = dataframes['png_list'][['cell_id', 'png_path', 'plateID', 'rowID', 'columnID', 'fieldID']].copy()
          png_list_df['cell_id'] = png_list_df['cell_id'].str[1:].astype(int)
          png_list_df.rename(columns={'cell_id': 'object_label'}, inplace=True)
          if 'cell' in dataframes:
-             join_cols = ['object_label', 'plate', 'row_name', 'column_name','field']
+             join_cols = ['object_label', 'plateID', 'rowID', 'columnID','fieldID']
              dataframes['cell'] = pd.merge(dataframes['cell'], png_list_df, on=join_cols, how='left')
          else:
              print("Cell table not found in database tables.")
@@ -2085,14 +2084,18 @@ def _read_db(db_loc, tables):
      Returns:
      - dfs (list): A list of pandas DataFrames, each containing the data from a table.
      """
-     from .utils import rename_columns_in_db
+     from .utils import rename_columns_in_db, correct_metadata
+
      rename_columns_in_db(db_loc)
      conn = sqlite3.connect(db_loc)
      dfs = []
+
      for table in tables:
          query = f'SELECT * FROM {table}'
          df = pd.read_sql_query(query, conn)
+         df = correct_metadata(df)
          dfs.append(df)
+
      conn.close()
      return dfs

@@ -2271,7 +2274,7 @@ def _copy_missclassified(df):

  def _read_db(db_loc, tables):

-     from .utils import rename_columns_in_db
+     from .utils import rename_columns_in_db, correct_metadata

      rename_columns_in_db(db_loc)
      conn = sqlite3.connect(db_loc) # Create a connection to the database
@@ -2279,12 +2282,13 @@ def _read_db(db_loc, tables):
      for table in tables:
          query = f'SELECT * FROM {table}' # Write a SQL query to get the data from the database
          df = pd.read_sql_query(query, conn) # Use the read_sql_query function to get the data and save it as a DataFrame
+         df = correct_metadata(df)
          dfs.append(df)
      conn.close() # Close the connection
      return dfs

  def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10, change_plate=False):
-     from .io import _read_db
+
      from .utils import _split_data

      # Initialize an empty dictionary to store DataFrames by table name
@@ -2294,8 +2298,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      for idx, loc in enumerate(locs):
          db_dfs = _read_db(loc, tables)
          if change_plate:
-             db_dfs['plate'] = f'plate{idx+1}'
-             db_dfs['prc'] = db_dfs['plate'].astype(str) + '_' + db_dfs['row_name'].astype(str) + '_' + db_dfs['column_name'].astype(str)
+             db_dfs['plateID'] = f'plate{idx+1}'
+             db_dfs['prc'] = db_dfs['plateID'].astype(str) + '_' + db_dfs['rowID'].astype(str) + '_' + db_dfs['columnID'].astype(str)
          for table, df in zip(tables, db_dfs):
              data_dict[table].append(df)

@@ -2303,6 +2307,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      for table, dfs in data_dict.items():
          if dfs:
              data_dict[table] = pd.concat(dfs, axis=0)
+
              if verbose:
                  print(f"{table}: {len(data_dict[table])}")

@@ -2389,18 +2394,18 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
      if 'png_list' in data_dict:
          png_list = data_dict['png_list'].copy()
          png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
-         png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
+         png_list_g_df_non_numeric.drop(columns=['plateID','rowID','columnID','fieldID','file_name','cell_id', 'prcf'], inplace=True)
          if verbose:
              print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
              print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
          merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
          merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
-
+
      # Add prc (plate row column) and prcfo (plate row column field object) columns
-     metadata = metadata.assign(prc=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'])
+     metadata = metadata.assign(prc=lambda x: x['plateID'] + '_' + x['rowID'] + '_' + x['columnID'])
      cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
      metadata = metadata.merge(cells_well, on='prc')
-     metadata = metadata.assign(prcfo=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'] + '_' + x['field'] + '_' + x['object_label'])
+     metadata = metadata.assign(prcfo=lambda x: x['plateID'] + '_' + x['rowID'] + '_' + x['columnID'] + '_' + x['fieldID'] + '_' + x['object_label'])
      metadata.set_index('prcfo', inplace=True)

      # Merge metadata with final merged DataFrame
@@ -2988,7 +2993,7 @@ def training_dataset_from_annotation(db_path, dst, annotation_column='test', ann

      return class_paths

- def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='column_name', class_metadata=['c1','c2']):
+ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='columnID', class_metadata=['c1','c2']):
      all_paths = []

      # Connect to the database and retrieve the image paths and annotations
@@ -3010,9 +3015,9 @@ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='t

      # Filter all_paths by metadata_type_by and class_metadata
      filtered_paths = []
-     metadata_index = {'row_name': 2, 'column_name': 3}.get(metadata_type_by, None)
+     metadata_index = {'rowID': 2, 'columnID': 3}.get(metadata_type_by, None)
      if metadata_index is None:
-         raise ValueError(f"Invalid metadata_type_by value: {metadata_type_by}. Must be 'row_name' or 'column_name'. {class_metadata} must be a list formatted as ['c1', 'c2'] or ['r1', 'r2']")
+         raise ValueError(f"Invalid metadata_type_by value: {metadata_type_by}. Must be 'rowID' or 'columnID'. {class_metadata} must be a list formatted as ['c1', 'c2'] or ['r1', 'r2']")

      for row in all_paths:
          if row[metadata_index] in class_metadata:
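
The hunks above consistently rename the metadata columns plate, row_name, column_name and field to plateID, rowID, columnID and fieldID. For a DataFrame exported with spacr 0.4.x, the equivalent rename is a one-liner; the mapping below is taken from the diff, while the helper name itself is only illustrative:

    import pandas as pd

    OLD_TO_NEW = {"plate": "plateID", "row_name": "rowID",
                  "column_name": "columnID", "field": "fieldID"}

    def upgrade_metadata_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Rename only the 0.4.x-style columns that are actually present.
        return df.rename(columns={old: new for old, new in OLD_TO_NEW.items() if old in df.columns})
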
@@ -3102,4 +3107,474 @@ def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
          test_class_dir = os.path.join(dst, f'test/{cls}')
          print(f'Train class {cls}: {len(os.listdir(train_class_dir))}, Test class {cls}: {len(os.listdir(test_class_dir))}')

-     return os.path.join(dst, 'train'), os.path.join(dst, 'test')
+     return os.path.join(dst, 'train'), os.path.join(dst, 'test')
+
+ def convert_separate_files_to_yokogawa(folder, regex):
+
+     ROWS = "ABCDEFGHIJKLMNOP"
+     COLS = [f"{i:02d}" for i in range(1, 25)]
+     WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+     def _get_next_well(used_wells):
+         plate = 1
+         for well in WELLS:
+             well_name = f"plate{plate}_{well}"
+             if well_name not in used_wells:
+                 return well_name
+             if well == "P24":
+                 plate += 1
+         return f"plate{plate}_A01"
+
+     pattern = re.compile(regex, re.I)
+
+     files_by_region = {}
+     rename_log = []
+     csv_path = os.path.join(folder, "rename_log.csv")
+     used_wells = set()
+     region_to_well = {}
+
+     # Group files by (plateID, wellID, fieldID, timeID, chanID)
+     for file in os.listdir(folder):
+         match = pattern.match(file)
+         if not match:
+             print(f"Skipping {file}: does not match regex.")
+             continue
+
+         meta = match.groupdict()
+
+         # Mandatory metadata
+         if 'wellID' not in meta or meta['wellID'] is None:
+             print(f"Skipping {file}: missing mandatory wellID.")
+             continue
+         wellID = meta['wellID']
+
+         # Optional metadata with defaults
+         plateID = meta.get('plateID', '1') or '1'
+         fieldID = meta.get('fieldID', '1') or '1'
+         timeID = int(meta.get('timeID', 1) or 1)
+         chanID = int(meta.get('chanID', 1) or 1)
+         sliceID = meta.get('sliceID')
+         sliceID = int(sliceID) if sliceID is not None else None
+
+         region_key = (plateID, wellID, fieldID, timeID, chanID)
+
+         files_by_region.setdefault(region_key, []).append((file, sliceID))
+
+     # Assign wells and process files per region
+     for region, file_list in files_by_region.items():
+         if region[:3] not in region_to_well:
+             next_well = _get_next_well(used_wells)
+             region_to_well[region[:3]] = next_well
+             used_wells.add(next_well)
+
+         assigned_well = region_to_well[region[:3]]
+         plateID, wellID, fieldID, timeID, chanID = region
+
+         # Check if multiple slices exist and are meaningful
+         slice_ids = [sid for _, sid in file_list if sid is not None]
+         unique_slices = set(slice_ids)
+
+         images = []
+         for filename, _ in sorted(file_list, key=lambda x: x[1] or 1):
+             img = tifffile.imread(os.path.join(folder, filename))
+             images.append(img)
+
+         # Perform MIP only if multiple unique slices are present
+         if len(unique_slices) > 1:
+             img_to_save = np.max(np.stack(images), axis=0)
+         else:
+             img_to_save = images[0]
+
+         dtype = img_to_save.dtype
+
+         new_filename = f"{assigned_well}_T{timeID:04d}F{int(fieldID):03d}L01C{chanID:02d}.tif"
+         new_filepath = os.path.join(folder, new_filename)
+         tifffile.imwrite(new_filepath, img_to_save.astype(dtype))
+
+         # Log original filenames involved in MIP or single file rename
+         original_files = ";".join(f[0] for f in file_list)
+         rename_log.append({"Original File(s)": original_files, "Renamed TIFF": new_filename})
+
+     pd.DataFrame(rename_log).to_csv(csv_path, index=False)
+     print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
+
+ def convert_to_yokogawa(folder):
+     """
+     Detects file type in the folder and converts them
+     to Yokogawa-style naming with Maximum Intensity Projection (MIP).
+     """
+
+     def _get_next_well(used_wells):
+         """
+         Determines the next available well position across multiple 384-well plates.
+         """
+         ROWS = "ABCDEFGHIJKLMNOP"
+         COLS = [f"{i:02d}" for i in range(1, 25)]
+         WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+         plate = 1
+         while True:
+             for well in WELLS:
+                 well_name = f"plate{plate}_{well}"
+                 if well_name not in used_wells:
+                     used_wells.add(well_name)
+                     return well_name
+             plate += 1  # All wells exhausted in current plate, increment to next plate
+
+     # Define 384-well plate format
+     ROWS = "ABCDEFGHIJKLMNOP"
+     COLS = [f"{i:02d}" for i in range(1, 25)]
+     WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
+
+     filenames = []
+     rename_log = []
+     csv_path = os.path.join(folder, "rename_log.csv")
+     used_wells = set()
+
+     # **Dictionary to store well assignments per original file**
+     file_to_well = {}
+
+     for file in os.listdir(folder):
+         path = os.path.join(folder, file)
+         ext = file.lower().split('.')[-1]
+
+         # **Assign a well only once per original file**
+         if file not in file_to_well:
+             file_to_well[file] = _get_next_well(used_wells)
+             #used_wells.add(file_to_well[file]) # Mark it as used
+
+         well = file_to_well[file] # Use the same well for all channels/times
+
+         ### **Process Nikon ND2 Files**
+         if ext == 'nd2':
+             try:
+                 nd2 = ND2Reader(path)
+                 metadata = nd2.metadata
+
+                 timepoints = list(range(len(metadata.get("frames", [0])))) or [0]
+                 fields = list(range(len(metadata.get("fields_of_view", [0])))) or [0]
+                 z_levels = list(metadata.get("z_levels", range(1))) if metadata.get("z_levels") else [0]
+                 channels = metadata.get("channels", [])
+
+                 for t_idx in timepoints:
+                     for f_idx in fields:
+                         for c_idx, channel in enumerate(channels):
+                             try:
+                                 mip_image = np.max.reduce([
+                                     nd2.get_frame_2D(t=t_idx, v=f_idx, z=z_idx, c=c_idx)
+                                     for z_idx in z_levels
+                                 ], axis=0)
+
+                                 dtype = mip_image.dtype
+                                 filename = f"{well}_T{t_idx+1:04d}F{f_idx+1:03d}L01C{c_idx+1:02d}.tif"
+                                 filepath = os.path.join(folder, filename)
+
+                                 tifffile.imwrite(filepath, mip_image.astype(dtype))
+                                 rename_log.append({"Original File": file,
+                                                    "Renamed TIFF": filename,
+                                                    "ext": ext,
+                                                    "time": t_idx,
+                                                    "field": f_idx,
+                                                    "channel": channel,
+                                                    "z": z_levels})
+
+                             except IndexError:
+                                 print(f"Warning: ND2 file {file} has an incomplete data structure. Skipping.")
+
+             except Exception as e:
+                 print(f"Error processing ND2 file {file}: {e}")
+
+         elif ext == 'czi':
+             try:
+                 # Open the CZI in streaming mode
+                 with pyczi.open_czi(path) as czidoc:
+
+                     # 1) Global dimension ranges
+                     bbox = czidoc.total_bounding_box
+                     _, tlen = bbox.get('T', (0,1))
+                     _, clen = bbox.get('C', (0,1))
+                     _, zlen = bbox.get('Z', (0,1))
+
+                     # 2) Scene → list of scene indices
+                     scenes_bb = czidoc.scenes_bounding_rectangle
+                     scenes = sorted(scenes_bb.keys()) if scenes_bb else [None]
+
+                     # 3) Output folder (same as .czi)
+                     folder = os.path.dirname(path)
+
+                     # 4) Loop scene × time × channel × Z
+                     for scene in scenes:
+                         # *** assign a unique well for this scene ***
+                         scene_well = _get_next_well(used_wells)
+
+                         # Field index = scene+1 (or 1 if no scene)
+                         F_idx = scene + 1 if scene is not None else 1
+                         # Scene index for “A”
+                         A_idx = scene + 1 if scene is not None else 1
+
+                         for t in range(tlen):
+                             for c in range(clen):
+                                 for z in range(zlen):
+                                     # Read exactly one 2D plane
+                                     arr = czidoc.read(
+                                         plane={'T': t, 'C': c, 'Z': z},
+                                         scene=scene
+                                     )
+                                     plane = np.squeeze(arr)
+
+                                     # Build Yokogawa‐style filename:
+                                     fn = (
+                                         f"{scene_well}_"
+                                         f"T{t+1:04d}"
+                                         f"F{F_idx:03d}"
+                                         f"L01"
+                                         f"A{A_idx:02d}"
+                                         f"Z{z+1:02d}"
+                                         f"C{c+1:02d}.tif"
+                                     )
+                                     outpath = os.path.join(folder, fn)
+
+                                     # Write with lossless compression
+                                     tifffile.imwrite(
+                                         outpath,
+                                         plane.astype(plane.dtype),
+                                         compression='zlib'
+                                     )
+
+                                     # Log it
+                                     rename_log.append({
+                                         "Original File": file,
+                                         "Renamed TIFF": fn,
+                                         "ext": ext,
+                                         "scene": scene,
+                                         "time": t,
+                                         "slice": z,
+                                         "field": F_idx,
+                                         "channel": c,
+                                         "well": scene_well
+                                     })
+
+             except Exception as e:
+                 print(f"Error processing CZI file {file}: {e}")
+
+         ### **Process Leica LIF Files**
+         elif ext == 'lif':
+             try:
+                 lif_file = readlif.Reader(path)
+
+                 for image_idx, image in enumerate(lif_file.getIterImage()):
+                     timepoints = range(getattr(image.dims, 't', 1))
+                     z_levels = range(getattr(image.dims, 'z', 1))
+                     channels = range(getattr(image.dims, 'c', 1))
+
+                     for t_idx in timepoints:
+                         for c_idx in channels:
+                             z_stack = []
+                             for z_idx in z_levels:
+                                 try:
+                                     frame = image.getFrame(z=z_idx, t=t_idx, c=c_idx)
+                                     z_stack.append(frame)
+                                 except IndexError:
+                                     print(f"Missing frame: T{t_idx}, Z{z_idx}, C{c_idx} in {file}, skipping frame.")
+
+                             if z_stack:
+                                 mip_image = np.max(np.stack(z_stack), axis=0)
+                                 dtype = mip_image.dtype
+                                 filename = f"{well}_T{t_idx+1:04d}F{image_idx+1:03d}L01C{c_idx+1:02d}.tif"
+                                 filepath = os.path.join(folder, filename)
+
+                                 tifffile.imwrite(filepath, mip_image.astype(dtype))
+                                 rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+             except Exception as e:
+                 print(f"Error processing LIF file {file}: {e}")
+
+         ### **Process Standard Image Files (TIFF, PNG, JPEG, BMP)**
+         elif ext in ['tif', 'tiff', 'png', 'jpg', 'jpeg', 'bmp'] and not file.startswith("plate"):
+             try:
+                 with tifffile.TiffFile(path) as tif:
+                     images = tif.asarray()
+                     ndim = images.ndim
+
+                     # Defaults
+                     t_dim = z_dim = c_dim = 1
+
+                     # Determine dimensions more explicitly
+                     if ndim == 2:
+                         mip_image = images
+                         filename = f"{well}_T0001F001L01C01.tif"
+                         tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                         rename_log.append({"Original File": file, "Renamed TIFF": filename})
+                         continue
+
+                     elif ndim == 3:
+                         if images.shape[0] <= 4: # Likely channels
+                             c_dim = images.shape[0]
+                             for c in range(c_dim):
+                                 mip_image = images[c, :, :]
+                                 filename = f"{well}_T0001F001L01C{c+1:02d}.tif"
+                                 tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                                 rename_log.append({"Original File": file, "Renamed TIFF": filename})
+                         else: # Z-stack
+                             mip_image = np.max(images, axis=0)
+                             filename = f"{well}_T0001F001L01C01.tif"
+                             tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                             rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+                     elif ndim == 4:
+                         t_dim, z_dim, y_dim, x_dim = images.shape
+                         for t in range(t_dim):
+                             mip_image = np.max(images[t, :, :, :], axis=0)
+                             filename = f"{well}_T{t+1:04d}F001L01C01.tif"
+                             tifffile.imwrite(os.path.join(folder, filename), mip_image)
+                             rename_log.append({"Original File": file, "Renamed TIFF": filename})
+
+                     else:
+                         raise ValueError(f"Unsupported TIFF dimensions: {images.shape}")
+
+             except Exception as e:
+                 print(f"Error processing standard image file {file}: {e}")
+
+     # Save rename log as CSV
+     pd.DataFrame(rename_log).to_csv(csv_path, index=False)
+     print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
+
+ def apply_augmentation(image, method):
+     if method == 'rotate90':
+         return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
+     elif method == 'rotate180':
+         return cv2.rotate(image, cv2.ROTATE_180)
+     elif method == 'rotate270':
+         return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
+     elif method == 'flip_h':
+         return cv2.flip(image, 1)
+     elif method == 'flip_v':
+         return cv2.flip(image, 0)
+     return image
+
+ def process_instruction(entry):
+     img = tifffile.imread(entry["src_img"])
+     msk = tifffile.imread(entry["src_msk"])
+     if entry["augment"]:
+         img = apply_augmentation(img, entry["augment"])
+         msk = apply_augmentation(msk, entry["augment"])
+     tifffile.imwrite(entry["dst_img"], img)
+     tifffile.imwrite(entry["dst_msk"], msk)
+     return 1
+
+ def prepare_cellpose_dataset(input_root, augment_data=False, train_fraction=0.8, n_jobs=None):
+
+     from .utils import print_progress
+
+     time_ls = []
+     input_root = os.path.abspath(input_root)
+     output_root = os.path.join(input_root, "cellpose_dataset")
+
+     def get_augmentations():
+         return ['rotate90', 'rotate180', 'rotate270', 'flip_h', 'flip_v']
+
+     def find_image_mask_pairs(dataset_path):
+         mask_dir = os.path.join(dataset_path, "masks")
+         pairs = []
+         for fname in os.listdir(dataset_path):
+             if fname.lower().endswith((".tif", ".tiff")):
+                 img_path = os.path.join(dataset_path, fname)
+                 msk_path = os.path.join(mask_dir, fname)
+                 if os.path.isfile(msk_path):
+                     pairs.append((img_path, msk_path))
+         return pairs
+
+     def prepare_output_folders(base):
+         for subset in ["train", "test"]:
+             os.makedirs(os.path.join(base, subset, "images"), exist_ok=True)
+             os.makedirs(os.path.join(base, subset, "masks"), exist_ok=True)
+
+     print("Scanning datasets...")
+     datasets = []
+     for subdir in os.listdir(input_root):
+         dataset_path = os.path.join(input_root, subdir)
+         if os.path.isdir(dataset_path) and os.path.isdir(os.path.join(dataset_path, "masks")):
+             pairs = find_image_mask_pairs(dataset_path)
+             if pairs:
+                 datasets.append(pairs)
+                 print(f"  Found {len(pairs)} images in {dataset_path}")
+
+     if not datasets:
+         raise ValueError("No valid datasets with images and masks found.")
+
+     prepare_output_folders(output_root)
+
+     min_size = min(len(pairs) for pairs in datasets)
+     target_size = min_size if not augment_data else max(len(pairs) for pairs in datasets)
+
+     print("\nPreparing instruction list...")
+     instructions = []
+     global_index = 0
+
+     for pairs in datasets:
+         dataset_len = len(pairs)
+
+         # --- Step 1: Sample or augment ---
+         sampled_pairs = []
+         if dataset_len >= target_size:
+             sampled_pairs = random.sample(pairs, target_size)
+         else:
+             sampled_pairs = pairs.copy()
+             if augment_data:
+                 needed = target_size - dataset_len
+                 aug_methods = get_augmentations()
+                 full_loops = needed // len(aug_methods)
+                 extra = needed % len(aug_methods)
+
+                 for _ in range(full_loops):
+                     for (img_path, msk_path), aug in zip(pairs, aug_methods * (dataset_len // len(aug_methods))):
+                         sampled_pairs.append((img_path, msk_path, aug))
+                 if extra > 0:
+                     subset = random.sample(pairs * ((extra // len(aug_methods)) + 1), extra)
+                     for (img_path, msk_path), aug in zip(subset, aug_methods[:extra]):
+                         sampled_pairs.append((img_path, msk_path, aug))
+
+         # Add "no augmentation" tag to original files
+         augmented_sampled = [
+             (tup[0], tup[1], None) if len(tup) == 2 else tup
+             for tup in sampled_pairs
+         ]
+
+         # --- Step 2: Split into train/test ---
+         random.shuffle(augmented_sampled)
+         split_idx = int(train_fraction * len(augmented_sampled))
+         split_sets = {
+             "train": augmented_sampled[:split_idx],
+             "test": augmented_sampled[split_idx:]
+         }
+
+         for subset, items in split_sets.items():
+             for img_path, msk_path, aug in items:
+                 dst_img = os.path.join(output_root, subset, "images", f"{global_index:05d}.tif")
+                 dst_msk = os.path.join(output_root, subset, "masks", f"{global_index:05d}.tif")
+                 instructions.append({
+                     "src_img": img_path,
+                     "src_msk": msk_path,
+                     "dst_img": dst_img,
+                     "dst_msk": dst_msk,
+                     "augment": aug
+                 })
+                 global_index += 1
+
+     print(f"Total files to process: {len(instructions)}")
+
+     # --- Step 3: Process with multiprocessing ---
+     print("Processing images with multiprocessing...")
+
+     if n_jobs is None:
+         n_jobs = max(1, cpu_count() - 1)
+     else:
+         n_jobs = int(n_jobs)
+
+     with Pool(n_jobs) as pool:
+         for i, _ in enumerate(pool.imap_unordered(process_instruction, instructions), 1):
+             print_progress(i, len(instructions), n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type="cellpose dataset")
+
+     print(f"Done. Dataset saved to: {output_root}")
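
The new io.py helpers are plain module-level functions, so they can be driven directly from Python. A minimal usage sketch, untested; the folder paths and the filename regex are illustrative assumptions, while the function names, signatures and the named regex groups come from the added code above:

    from spacr.io import (convert_separate_files_to_yokogawa,
                          convert_to_yokogawa, prepare_cellpose_dataset)

    # convert_separate_files_to_yokogawa() expects a regex with named groups;
    # wellID is mandatory, plateID/fieldID/timeID/chanID/sliceID are optional.
    # Assumed filename layout: "A01_f01_z03_ch1_t05.tif"
    regex = (r"(?P<wellID>[A-P]\d{2})"
             r"_f(?P<fieldID>\d+)"
             r"_z(?P<sliceID>\d+)"
             r"_ch(?P<chanID>\d+)"
             r"_t(?P<timeID>\d+)\.tif$")
    convert_separate_files_to_yokogawa("/data/plate1_raw", regex)

    # convert_to_yokogawa() instead detects nd2/czi/lif/standard image files by
    # extension and writes MIP TIFFs plus a rename_log.csv next to the originals.
    convert_to_yokogawa("/data/mixed_microscope_exports")

    # prepare_cellpose_dataset() builds train/test image and mask folders from
    # subfolders that each contain a "masks" directory.
    prepare_cellpose_dataset("/data/annotated", augment_data=True,
                             train_fraction=0.8, n_jobs=4)
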