spacr 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/settings.py CHANGED
@@ -86,10 +86,10 @@ def set_default_settings_preprocess_generate_masks(settings={}):
86
86
  settings.setdefault('fps', 2)
87
87
  settings.setdefault('timelapse_displacement', None)
88
88
  settings.setdefault('timelapse_memory', 3)
89
- settings.setdefault('timelapse_frame_limits', None)
89
+ settings.setdefault('timelapse_frame_limits', [5,])
90
90
  settings.setdefault('timelapse_remove_transient', False)
91
91
  settings.setdefault('timelapse_mode', 'trackpy')
92
- settings.setdefault('timelapse_objects', 'cells')
92
+ settings.setdefault('timelapse_objects', None)
93
93
 
94
94
  # Misc settings
95
95
  settings.setdefault('all_to_mip', False)
@@ -256,7 +256,13 @@ def get_measure_crop_settings(settings={}):
256
256
  settings.setdefault('homogeneity', True)
257
257
  settings.setdefault('homogeneity_distances', [8,16,32])
258
258
 
259
- # Cropping settings
259
+ # Cropping settings
260
+ settings.setdefault('save_arrays', False)
261
+ settings.setdefault('save_png',True)
262
+ settings.setdefault('use_bounding_box',False)
263
+ settings.setdefault('png_size',[224,224])
264
+ settings.setdefault('png_dims',[0,1,2])
265
+ settings.setdefault('normalize',False) # Cropping settings
260
266
  settings.setdefault('save_arrays', False)
261
267
  settings.setdefault('save_png',True)
262
268
  settings.setdefault('use_bounding_box',False)
@@ -277,9 +283,9 @@ def get_measure_crop_settings(settings={}):
277
283
  settings.setdefault('n_jobs', os.cpu_count()-2)
278
284
 
279
285
  # Object settings
280
- settings.setdefault('cell_mask_dim',None)
281
- settings.setdefault('nucleus_mask_dim',None)
282
- settings.setdefault('pathogen_mask_dim',None)
286
+ settings.setdefault('cell_mask_dim',4)
287
+ settings.setdefault('nucleus_mask_dim',5)
288
+ settings.setdefault('pathogen_mask_dim',6)
283
289
  settings.setdefault('cytoplasm',False)
284
290
  settings.setdefault('uninfected',True)
285
291
  settings.setdefault('cell_min_size',0)
@@ -473,7 +479,7 @@ def get_train_test_model_settings(settings):
473
479
  return settings
474
480
 
475
481
  def get_analyze_recruitment_default_settings(settings):
476
- settings.setdefault('src','path')
482
+ settings.setdefault('src', 'path')
477
483
  settings.setdefault('target','protein')
478
484
  settings.setdefault('cell_types',['HeLa'])
479
485
  settings.setdefault('cell_plate_metadata',None)
@@ -672,6 +678,7 @@ expected_types = {
672
678
  "timelapse_displacement": int,
673
679
  "timelapse_memory": int,
674
680
  "timelapse_frame_limits": (list, type(None)), # This can be a list of lists
681
+ #"timelapse_frame_limits": (list, type(None)), # This can be a list of lists
675
682
  "timelapse_remove_transient": bool,
676
683
  "timelapse_mode": str,
677
684
  "timelapse_objects": list,
@@ -944,13 +951,13 @@ expected_types = {
944
951
  }
945
952
 
946
953
  categories = {"Paths":[ "src", "grna", "barcodes", "custom_model_path", "dataset","model_path","grna_csv","row_csv","column_csv", "metadata_files", "score_data","count_data"],
947
- "General": ["metadata_type", "custom_regex", "experiment", "channels", "magnification", "channel_dims", "apply_model_to_dataset", "generate_training_dataset", "train_DL_model", "segmentation_mode", "delete_intermediate"],
954
+ "General": ["cell_mask_dim", "cytoplasm", "cell_chann_dim", "cell_channel", "nucleus_chann_dim", "nucleus_channel", "nucleus_mask_dim", "pathogen_mask_dim", "pathogen_chann_dim", "pathogen_channel", "test_mode", "plot", "metadata_type", "custom_regex", "experiment", "channels", "magnification", "channel_dims", "apply_model_to_dataset", "generate_training_dataset", "train_DL_model", "segmentation_mode", "delete_intermediate", "uninfected", ],
948
955
  "Cellpose":["fill_in","from_scratch", "n_epochs", "width_height", "model_name", "custom_model", "resample", "rescale", "CP_prob", "flow_threshold", "percentiles", "invert", "diameter", "grayscale", "Signal_to_noise", "resize", "target_height", "target_width"],
949
- "Cell": ["cell_diamiter","cell_intensity_range", "cell_size_range", "cell_chann_dim", "cell_channel", "cell_background", "cell_Signal_to_noise", "cell_CP_prob", "cell_FT", "remove_background_cell", "cell_min_size", "cell_mask_dim", "cytoplasm", "cytoplasm_min_size", "uninfected", "merge_edge_pathogen_cells", "adjust_cells", "cells", "cell_loc"],
950
- "Nucleus": ["nucleus_diamiter","nucleus_intensity_range", "nucleus_size_range", "nucleus_chann_dim", "nucleus_channel", "nucleus_background", "nucleus_Signal_to_noise", "nucleus_CP_prob", "nucleus_FT", "remove_background_nucleus", "nucleus_min_size", "nucleus_mask_dim", "nucleus_loc"],
951
- "Pathogen": ["pathogen_diamiter","pathogen_intensity_range", "pathogen_size_range", "pathogen_chann_dim", "pathogen_channel", "pathogen_background", "pathogen_Signal_to_noise", "pathogen_CP_prob", "pathogen_FT", "pathogen_model", "remove_background_pathogen", "pathogen_min_size", "pathogen_mask_dim", "pathogens", "pathogen_loc", "pathogen_types", "pathogen_plate_metadata", ],
956
+ "Cell": ["cell_diamiter","cell_intensity_range", "cell_size_range", "cell_background", "cell_Signal_to_noise", "cell_CP_prob", "cell_FT", "remove_background_cell", "cell_min_size", "cytoplasm_min_size", "adjust_cells", "cells", "cell_loc"],
957
+ "Nucleus": ["nucleus_diamiter","nucleus_intensity_range", "nucleus_size_range", "nucleus_background", "nucleus_Signal_to_noise", "nucleus_CP_prob", "nucleus_FT", "remove_background_nucleus", "nucleus_min_size", "nucleus_loc"],
958
+ "Pathogen": ["pathogen_diamiter","pathogen_intensity_range", "pathogen_size_range", "pathogen_background", "pathogen_Signal_to_noise", "pathogen_CP_prob", "pathogen_FT", "pathogen_model", "remove_background_pathogen", "pathogen_min_size", "pathogens", "pathogen_loc", "pathogen_types", "pathogen_plate_metadata", ],
952
959
  "Measurements": ["remove_image_canvas", "remove_highly_correlated", "homogeneity", "homogeneity_distances", "radial_dist", "calculate_correlation", "manders_thresholds", "save_measurements", "tables", "image_nr", "dot_size", "filter_by", "remove_highly_correlated_features", "remove_low_variance_features", "channel_of_interest"],
953
- "Object Image": ["save_png", "dialate_pngs", "dialate_png_ratios", "png_size", "png_dims", "save_arrays", "normalize_by", "crop_mode", "normalize", "use_bounding_box"],
960
+ "Object Image": ["save_png", "dialate_pngs", "dialate_png_ratios", "png_size", "png_dims", "save_arrays", "normalize_by", "crop_mode", "use_bounding_box"],
954
961
  "Sequencing": ["outlier_detection","offset_start","chunk_size","single_direction", "signal_direction","mode","comp_level","comp_type","save_h5","expected_end","offset","target_sequence","regex", "highlight"],
955
962
  "Generate Dataset":["save_to_db","file_metadata","class_metadata", "annotation_column","annotated_classes", "dataset_mode", "metadata_type_by","custom_measurement", "sample", "size"],
956
963
  "Hyperparamiters (Training)": ["png_type", "score_threshold","file_type", "train_channels", "epochs", "loss_type", "optimizer_type","image_size","val_split","learning_rate","weight_decay","dropout_rate", "init_weights", "train", "classes", "augment", "amsgrad","use_checkpoint","gradient_accumulation","gradient_accumulation_steps","intermedeate_save","pin_memory"],
@@ -959,11 +966,10 @@ categories = {"Paths":[ "src", "grna", "barcodes", "custom_model_path", "dataset
959
966
  "Hyperparamiters (Regression)":["cross_validation","prune_features","reg_lambda","reg_alpha","cov_type", "class_1_threshold", "plate", "other", "fraction_threshold", "alpha", "random_row_column_effects", "regression_type", "min_cell_count", "agg_type", "transform", "dependent_variable"],
960
967
  "Hyperparamiters (Activation)":["cam_type", "overlay", "correlation", "target_layer", "normalize_input"],
961
968
  "Annotation": ["filter_column", "filter_value","volcano", "toxo", "controls", "nc_loc", "pc_loc", "nc", "pc", "cell_plate_metadata","treatment_plate_metadata", "metadata_types", "cell_types", "target","positive_control","negative_control", "location_column", "treatment_loc", "channel_of_interest", "measurement", "treatments", "um_per_pixel", "nr_imgs", "exclude", "exclude_conditions", "mix", "pos", "neg"],
962
- "Plot": ["plot", "split_axis_lims", "x_lim","log_x","log_y", "plot_control", "plot_nr", "examples_to_plot", "normalize_plots", "cmap", "figuresize", "plot_cluster_grids", "img_zoom", "row_limit", "color_by", "plot_images", "smooth_lines", "plot_points", "plot_outlines", "black_background", "plot_by_cluster", "heatmap_feature","grouping","min_max","cmap","save_figure"],
963
- "Test": ["test_mode", "test_images", "random_test", "test_nr", "test", "test_split"],
969
+ "Plot": ["split_axis_lims", "x_lim","log_x","log_y", "plot_control", "plot_nr", "examples_to_plot", "normalize_plots", "cmap", "figuresize", "plot_cluster_grids", "img_zoom", "row_limit", "color_by", "plot_images", "smooth_lines", "plot_points", "plot_outlines", "black_background", "plot_by_cluster", "heatmap_feature","grouping","min_max","cmap","save_figure"],
964
970
  "Timelapse": ["timelapse", "fps", "timelapse_displacement", "timelapse_memory", "timelapse_frame_limits", "timelapse_remove_transient", "timelapse_mode", "timelapse_objects", "compartments"],
965
- "Advanced": ["target_unique_count","threshold_multiplier", "threshold_method", "min_n","shuffle", "target_intensity_min", "cells_per_well", "nuclei_limit", "pathogen_limit", "background", "backgrounds", "schedule", "test_size","exclude","n_repeats","top_features", "model_type_ml", "model_type","minimum_cell_count","n_estimators","preprocess", "remove_background", "normalize", "lower_percentile", "merge_pathogens", "batch_size", "filter", "save", "masks", "verbose", "randomize", "n_jobs"],
966
- "Miscellaneous": ["all_to_mip", "pick_slice", "skip_mode", "upscale", "upscale_factor"]
971
+ "Advanced": ["merge_edge_pathogen_cells", "test_images", "random_test", "test_nr", "test", "test_split", "normalize", "target_unique_count","threshold_multiplier", "threshold_method", "min_n","shuffle", "target_intensity_min", "cells_per_well", "nuclei_limit", "pathogen_limit", "background", "backgrounds", "schedule", "test_size","exclude","n_repeats","top_features", "model_type_ml", "model_type","minimum_cell_count","n_estimators","preprocess", "remove_background", "normalize", "lower_percentile", "merge_pathogens", "batch_size", "filter", "save", "masks", "verbose", "randomize", "n_jobs"],
972
+ "Beta": ["all_to_mip", "pick_slice", "skip_mode", "upscale", "upscale_factor"]
967
973
  }
968
974
 
969
975
 
@@ -972,6 +978,127 @@ category_keys = list(categories.keys())
972
978
  def check_settings(vars_dict, expected_types, q=None):
973
979
  from .gui_utils import parse_list
974
980
 
981
+ if q is None:
982
+ from multiprocessing import Queue
983
+ q = Queue()
984
+
985
+ settings = {}
986
+ errors = [] # Collect errors instead of stopping at the first one
987
+
988
+ for key, (label, widget, var, _) in vars_dict.items():
989
+ if key not in expected_types and key not in category_keys:
990
+ errors.append(f"Warning: Key '{key}' not found in expected types.")
991
+ continue
992
+
993
+ value = var.get()
994
+ if value in ['None', '']:
995
+ value = None
996
+
997
+ expected_type = expected_types.get(key, str)
998
+
999
+ try:
1000
+ if key in ["cell_plate_metadata", "timelapse_frame_limits", "png_size", "png_dims", "pathogen_plate_metadata", "treatment_plate_metadata", "class_metadata", "crop_mode"]:
1001
+ if value is None:
1002
+ parsed_value = None
1003
+ else:
1004
+ try:
1005
+ parsed_value = ast.literal_eval(value)
1006
+ except (ValueError, SyntaxError):
1007
+ raise ValueError(f"Expected a list or list of lists but got an invalid format: {value}")
1008
+
1009
+ if isinstance(parsed_value, list):
1010
+ if all(isinstance(i, list) for i in parsed_value) or all(not isinstance(i, list) for i in parsed_value):
1011
+ settings[key] = parsed_value
1012
+ else:
1013
+ raise ValueError(f"Invalid format: '{key}' contains mixed types (single values and lists).")
1014
+
1015
+ else:
1016
+ raise ValueError(f"Expected a list for '{key}', but got {type(parsed_value).__name__}.")
1017
+
1018
+ elif expected_type == list:
1019
+ settings[key] = parse_list(value) if value else None
1020
+
1021
+ if isinstance(settings[key], list) and len(settings[key]) == 1:
1022
+ settings[key] = settings[key][0]
1023
+
1024
+ elif expected_type == bool:
1025
+ settings[key] = value.lower() in ['true', '1', 't', 'y', 'yes'] if isinstance(value, str) else bool(value)
1026
+
1027
+ elif expected_type == (int, type(None)):
1028
+ if value is None or str(value).isdigit():
1029
+ settings[key] = int(value) if value is not None else None
1030
+ else:
1031
+ raise ValueError(f"Expected an integer or None for '{key}', but got '{value}'.")
1032
+
1033
+ elif expected_type == (float, type(None)):
1034
+ if value is None or (isinstance(value, str) and value.replace(".", "", 1).isdigit()):
1035
+ settings[key] = float(value) if value is not None else None
1036
+ else:
1037
+ raise ValueError(f"Expected a float or None for '{key}', but got '{value}'.")
1038
+
1039
+ elif expected_type == (int, float):
1040
+ try:
1041
+ settings[key] = float(value) if '.' in str(value) else int(value)
1042
+ except ValueError:
1043
+ raise ValueError(f"Expected an integer or float for '{key}', but got '{value}'.")
1044
+
1045
+ elif expected_type == (str, type(None)):
1046
+ settings[key] = str(value) if value is not None else None
1047
+
1048
+ elif expected_type == (str, type(None), list):
1049
+ if isinstance(value, list):
1050
+ settings[key] = parse_list(value) if value else None
1051
+ elif isinstance(value, str):
1052
+ settings[key] = str(value)
1053
+ else:
1054
+ settings[key] = None
1055
+
1056
+ elif expected_type == dict:
1057
+ try:
1058
+ if isinstance(value, str):
1059
+ parsed_dict = ast.literal_eval(value)
1060
+ else:
1061
+ raise ValueError("Expected a string representation of a dictionary.")
1062
+
1063
+ if not isinstance(parsed_dict, dict):
1064
+ raise ValueError(f"Expected a dictionary for '{key}', but got {type(parsed_dict).__name__}.")
1065
+
1066
+ settings[key] = parsed_dict
1067
+ except (ValueError, SyntaxError) as e:
1068
+ settings[key] = {}
1069
+ errors.append(f"Error: Invalid dictionary format for '{key}'. Expected type: dict. Error: {e}")
1070
+
1071
+ elif isinstance(expected_type, tuple):
1072
+ for typ in expected_type:
1073
+ try:
1074
+ settings[key] = typ(value) if value else None
1075
+ break
1076
+ except (ValueError, TypeError):
1077
+ continue
1078
+ else:
1079
+ raise ValueError(f"Value '{value}' for '{key}' does not match any expected types: {expected_type}.")
1080
+
1081
+ else:
1082
+ try:
1083
+ settings[key] = expected_type(value) if value else None
1084
+ except (ValueError, TypeError):
1085
+ raise ValueError(f"Expected type {expected_type.__name__} for '{key}', but got '{value}'.")
1086
+
1087
+ except (ValueError, SyntaxError) as e:
1088
+ expected_type_name = ' or '.join([t.__name__ for t in expected_type]) if isinstance(expected_type, tuple) else expected_type.__name__
1089
+ errors.append(f"Error: '{key}' has invalid format. Expected type: {expected_type_name}. Got value: '{value}'. Error: {e}")
1090
+
1091
+ # Send all collected errors to the queue
1092
+ for error in errors:
1093
+ q.put(error)
1094
+
1095
+
1096
+
1097
+ return settings, errors
1098
+
1099
+ def check_settings_v1(vars_dict, expected_types, q=None):
1100
+ from .gui_utils import parse_list
1101
+
975
1102
  if q is None:
976
1103
  from multiprocessing import Queue
977
1104
  q = Queue()
@@ -984,22 +1111,26 @@ def check_settings(vars_dict, expected_types, q=None):
984
1111
  q.put(f"Key {key} not found in expected types.")
985
1112
  continue
986
1113
 
987
- value = var.get()
988
- if value == 'None':
1114
+ value = var.get()
1115
+ if value in ['None', '']:
989
1116
  value = None
990
1117
 
991
1118
  expected_type = expected_types.get(key, str)
992
1119
 
993
1120
  try:
994
- if key in ["cell_plate_metadata", "timelapse_frame_limits", "png_size", "pathogen_loc", "treatment_loc", "pathogen_plate_metadata", "treatment_plate_metadata", "barcode_coordinates", "class_metadata"]:
995
- parsed_value = ast.literal_eval(value) if value else None
1121
+ #if key in ["cell_plate_metadata", "timelapse_frame_limits", "png_size", "pathogen_loc", "treatment_loc", "pathogen_plate_metadata", "treatment_plate_metadata", "barcode_coordinates", "class_metadata"]:
1122
+ if key in ["cell_plate_metadata", "timelapse_frame_limits", "png_size", "png_dims", "pathogen_plate_metadata", "treatment_plate_metadata", "class_metadata", "crop_mode"]:
1123
+
1124
+ if value is None:
1125
+ parsed_value = None
1126
+ else:
1127
+ parsed_value = ast.literal_eval(value) if isinstance(value, str) and value.strip() else None
1128
+
996
1129
  if isinstance(parsed_value, list):
997
1130
  if all(isinstance(i, list) for i in parsed_value) or all(not isinstance(i, list) for i in parsed_value):
998
1131
  settings[key] = parsed_value
999
1132
  else:
1000
1133
  raise ValueError("Invalid format: Mixed list and list of lists")
1001
- #elif parsed_value == None:
1002
- # settings[key] = None
1003
1134
  else:
1004
1135
  raise ValueError("Invalid format for list or list of lists")
1005
1136
 
@@ -1180,30 +1311,7 @@ def generate_fields(variables, scrollable_frame):
1180
1311
  "n_epochs": "(int) - Number of epochs for training the Cellpose model.",
1181
1312
  "n_jobs": "(int) - The number of n_jobs to use for processing the images. This will determine how many images are processed in parallel. Increase to speed up processing.",
1182
1313
  "n_neighbors": "(int) - Number of neighbors for UMAP.",
1183
- "n_repeats": "(int) - Number of repeats for cross-validation.",
1184
- "normalize": "(list) - The percentiles to use for normalizing the images. This will be used to determine the range of intensities to normalize images to. If None, no normalization is done.",
1185
- "normalize_by": "(str) - Whether to normalize the images by field of view (fov) or by PNG image (png).",
1186
- "normalize_plots": "(bool) - Whether to normalize the plots.",
1187
- "nr_imgs": "(int) - The number of images to plot.",
1188
- "nucleus_CP_prob": "(float) - The cellpose probability threshold for the nucleus channel. This will be used to segment the nucleus.",
1189
- "nucleus_FT": "(float) - The flow threshold for nucleus objects. This will be used in nucleus segmentation.",
1190
- "nucleus_background": "(float) - The background intensity for the nucleus channel. This will be used to remove background noise.",
1191
- "nucleus_chann_dim": "(int) - Dimension of the channel to use for nucleus segmentation.",
1192
- "nucleus_channel": "(int) - The channel to use for the nucleus. If None, the nucleus will not be segmented.",
1193
- "nucleus_intensity_range": "(list) - Intensity range for nucleus segmentation.",
1194
- "nucleus_loc": "(str) - Location of the nucleus in the images.",
1195
- "nucleus_mask_dim": "(int) - The dimension of the array the nucleus mask is saved in.",
1196
- "nucleus_min_size": "(int) - The minimum size of nucleus objects in pixels^2.",
1197
- "nucleus_Signal_to_noise": "(float) - The signal-to-noise ratio for the nucleus channel. This will be used to determine the range of intensities to normalize images to for nucleus segmentation.",
1198
- "nucleus_size_range": "(list) - Size range for nucleus segmentation.",
1199
- "optimizer_type": "(str) - Type of optimizer to use.",
1200
- "other": "(dict) - Additional parameters for the regression analysis.",
1201
- "pathogen_CP_prob": "(float) - The cellpose probability threshold for the pathogen channel. This will be used to segment the pathogen.",
1202
- "pathogen_FT": "(float) - The flow threshold for pathogen objects. This will be used in pathogen segmentation.",
1203
- "pathogen_background": "(float) - The background intensity for the pathogen channel. This will be used to remove background noise.",
1204
- "pathogen_chann_dim": "(int) - Dimension of the channel to use for pathogen segmentation.",
1205
- "pathogen_channel": "(int) - The channel to use for the pathogen. If None, the pathogen will not be segmented.",
1206
- "pathogen_intensity_range": "(str) - Metadata for the pathogen plate.",
1314
+ "n_repeats": "(int) - Number of repeats for the pathogen plate.",
1207
1315
  "pathogen_Signal_to_noise": "(float) - The signal-to-noise ratio for the pathogen channel. This will be used to determine the range of intensities to normalize images to for pathogen segmentation.",
1208
1316
  "pathogen_size_range": "(list) - Size range for pathogen segmentation.",
1209
1317
  "pathogen_types": "(list) - Types of pathogens to include in the analysis.",
@@ -1222,7 +1330,7 @@ def generate_fields(variables, scrollable_frame):
1222
1330
  "plot_nr": "(int) - Number of plots to generate.",
1223
1331
  "plot_outlines": "(bool) - Whether to plot outlines of segmented objects.",
1224
1332
  "png_dims": "(list) - The dimensions of the PNG images to save. This will determine the dimensions of the saved images. Maximum of 3 dimensions e.g. [1,2,3].",
1225
- "png_size": "(int) - The size of the PNG images to save. This will determine the size of the saved images.",
1333
+ "png_size": "(list) - The size of the PNG images to save. This will determine the size of the saved images.",
1226
1334
  "positive_control": "(str) - Identifier for the positive control.",
1227
1335
  "preprocess": "(bool) - Whether to preprocess the images before segmentation. This includes background removal and normalization. Set to False only if this step has already been done.",
1228
1336
  "radial_dist": "(list) - Radial distances for measuring features.",
@@ -1385,8 +1493,8 @@ def set_annotate_default_settings(settings):
1385
1493
  settings.setdefault('normalize', 'False')
1386
1494
  settings.setdefault('normalize_channels', "r,g,b")
1387
1495
  settings.setdefault('percentiles', [2, 98])
1388
- settings.setdefault('measurement', '')#'cytoplasm_channel_3_mean_intensity,pathogen_channel_3_mean_intensity')
1389
- settings.setdefault('threshold', '')#'2')
1496
+ settings.setdefault('measurement', '') #'cytoplasm_channel_3_mean_intensity,pathogen_channel_3_mean_intensity')
1497
+ settings.setdefault('threshold', '') #'2')
1390
1498
  return settings
1391
1499
 
1392
1500
  def set_default_generate_barecode_mapping(settings={}):
spacr/sp_stats.py ADDED
@@ -0,0 +1,221 @@
1
+ from scipy.stats import shapiro, normaltest, levene, ttest_ind, mannwhitneyu, kruskal, f_oneway
2
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
3
+ import scikit_posthocs as sp
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy.stats import chi2_contingency, fisher_exact
7
+ import itertools
8
+ from statsmodels.stats.multitest import multipletests
9
+
10
+
11
def choose_p_adjust_method(num_groups, num_data_points, *, max_comparisons=10, min_data_points=5):
    """
    Selects a p-value adjustment method based on data characteristics.

    Parameters:
    - num_groups: Number of unique groups being compared
    - num_data_points: Number of data points per group (assuming balanced groups)
    - max_comparisons: Largest number of pairwise comparisons still treated as "few" (default 10)
    - min_data_points: Largest per-group sample size still treated as "small" (default 5)

    Returns:
    - A string representing the recommended p-adjustment method:
      'holm', 'fdr_bh', 'sidak' or 'bonferroni'
    """
    num_comparisons = (num_groups * (num_groups - 1)) // 2  # Number of pairwise comparisons
    few_comparisons = num_comparisons <= max_comparisons

    # Both comparisons are written out (rather than negating one) so that a NaN
    # sample size matches neither and falls through to sidak/bonferroni.
    if few_comparisons and num_data_points > min_data_points:
        return 'holm'  # Balanced between power and Type I error control
    if not few_comparisons and num_data_points <= min_data_points:
        return 'fdr_bh'  # FDR control for large number of comparisons and small sample size
    if few_comparisons:
        return 'sidak'  # Less conservative than Bonferroni, good for independent comparisons
    return 'bonferroni'  # Very conservative, use for strict control of Type I errors
33
+
34
def perform_normality_tests(df, grouping_column, data_columns):
    """
    Perform normality tests for each group and data column.

    Parameters:
    - df: DataFrame holding the grouping column and the data columns
    - grouping_column: Name of the column whose unique values define the groups
    - data_columns: Iterable of column names to test

    Returns:
    - is_normal: True only if every test that was run, across ALL requested
      columns, gave p > 0.05. Skipped groups (fewer than 3 values) are ignored,
      so the result is True when nothing could be tested or no column was given.
    - normality_results: List with one dict per (column, group) pair
    """
    unique_groups = df[grouping_column].unique()
    normality_results = []
    # Tracked across every column; previously only the last column counted and
    # an empty data_columns raised UnboundLocalError.
    is_normal = True

    for column in data_columns:
        for group in unique_groups:
            data = df.loc[df[grouping_column] == group, column].dropna()
            n_samples = len(data)

            if n_samples < 3:
                # Skip test if there aren't enough data points
                print(f"Skipping normality test for group '{group}' on column '{column}' - Not enough data.")
                normality_results.append({
                    'Comparison': f'Normality test for {group} on {column}',
                    'Test Statistic': None,
                    'p-value': None,
                    'Test Name': 'Skipped',
                    'Column': column,
                    'n': n_samples
                })
                continue

            # Choose the appropriate normality test based on the sample size
            if n_samples >= 8:
                stat, p_value = normaltest(data)
                test_name = "D'Agostino-Pearson test"
            else:
                stat, p_value = shapiro(data)
                test_name = "Shapiro-Wilk test"

            normality_results.append({
                'Comparison': f'Normality test for {group} on {column}',
                'Test Statistic': stat,
                'p-value': p_value,
                'Test Name': test_name,
                'Column': column,
                'n': n_samples
            })

            # "not (p > 0.05)" also treats a NaN p-value as non-normal.
            if not p_value > 0.05:
                is_normal = False

    return is_normal, normality_results
79
+
80
+
81
def perform_levene_test(df, grouping_column, data_column):
    """
    Perform Levene's test for equal variance.

    Parameters:
    - df: DataFrame holding the grouping column and the data column
    - grouping_column: Name of the column whose unique values define the groups
    - data_column: Name of the column holding the measurements

    Returns:
    - (stat, p_value) from scipy's ``levene`` over the non-empty groups, or
      (nan, nan) when fewer than two groups contain data.
    """
    grouped_data = []
    for group in df[grouping_column].unique():
        values = df.loc[df[grouping_column] == group, data_column].dropna()
        # A missing group label never compares equal to itself, so it yields an
        # empty sample; an all-NaN group does too. Either would turn the whole
        # test into NaN, so only groups with data are passed on.
        if len(values) > 0:
            grouped_data.append(values)

    if len(grouped_data) < 2:
        return float('nan'), float('nan')

    stat, p_value = levene(*grouped_data)
    return stat, p_value
87
+
88
def perform_statistical_tests(df, grouping_column, data_columns, paired=False):
    """
    Perform statistical tests for each data column.

    Two groups are compared with a t-test (normal data) or a Mann-Whitney U test;
    more than two with one-way ANOVA (normal data) or a Kruskal-Wallis test.
    Normality is decided per column by ``perform_normality_tests``.

    Parameters:
    - df: DataFrame holding the grouping column and the data columns
    - grouping_column: Name of the column whose unique values define the groups
    - data_columns: Iterable of column names to test
    - paired: Paired two-group tests are not implemented; when True and there
      are exactly two groups, a message is printed and the column is skipped
      without a result row.

    Returns:
    - List with one dict per tested column ('Column', 'Test Name',
      'Test Statistic', 'p-value', 'Groups'). With fewer than two groups a
      'Skipped' row with None statistic and p-value is returned instead.
    """
    unique_groups = df[grouping_column].unique()
    n_groups = len(unique_groups)
    test_results = []

    for column in data_columns:
        if n_groups < 2:
            # A comparison needs at least two groups; previously this fell into
            # the multi-group branch and crashed inside f_oneway/kruskal.
            print(f"Skipping statistical test on column '{column}' - fewer than two groups.")
            test_results.append({
                'Column': column,
                'Test Name': 'Skipped',
                'Test Statistic': None,
                'p-value': None,
                'Groups': n_groups
            })
            continue

        if n_groups == 2 and paired:
            print("Performing paired tests (not implemented in this template).")
            continue  # Extend as needed

        grouped_data = [df.loc[df[grouping_column] == group, column].dropna() for group in unique_groups]

        # Normality of this column decides between parametric and rank-based tests
        is_normal, _ = perform_normality_tests(df, grouping_column, [column])

        if n_groups == 2:
            if is_normal:
                stat, p = ttest_ind(grouped_data[0], grouped_data[1])
                test_name = 'T-test'
            else:
                stat, p = mannwhitneyu(grouped_data[0], grouped_data[1])
                test_name = 'Mann-Whitney U test'
        elif is_normal:
            stat, p = f_oneway(*grouped_data)
            test_name = 'One-way ANOVA'
        else:
            stat, p = kruskal(*grouped_data)
            test_name = 'Kruskal-Wallis test'

        test_results.append({
            'Column': column,
            'Test Name': test_name,
            'Test Statistic': stat,
            'p-value': p,
            'Groups': n_groups
        })

    return test_results
127
+
128
+
129
def perform_posthoc_tests(df, grouping_column, data_column, is_normal):
    """
    Perform post-hoc tests for multiple groups with both original and adjusted p-values.

    Parameters:
    - df: DataFrame holding the grouping column and the data column
    - grouping_column: Name of the column whose unique values define the groups
    - data_column: Name of the column holding the measurements
    - is_normal: If True use Tukey's HSD, otherwise Dunn's test

    Returns:
    - List with one dict per group pair ('Comparison', 'Original p-value',
      'Adjusted p-value', 'Adjusted Method', 'Test Name'). Empty when there
      are two groups or fewer.
    """
    unique_groups = df[grouping_column].unique()
    posthoc_results = []

    if len(unique_groups) <= 2:
        return posthoc_results

    # Rows without a measurement are removed once, up front, so the tests see
    # the same data the sample-size estimate is based on. Tukey HSD previously
    # received the unfiltered column, NaNs included.
    valid = df.dropna(subset=[data_column])

    num_groups = len(unique_groups)
    num_data_points = len(valid) // num_groups  # Assuming roughly equal data points per group
    p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)

    if is_normal:
        # Tukey's HSD automatically adjusts p-values
        tukey_result = pairwise_tukeyhsd(valid[data_column], valid[grouping_column], alpha=0.05)
        # NOTE(review): _results_table is a private statsmodels attribute; rows
        # after the header are read as (group1, group2, ...) - confirm on upgrade.
        for comparison, p_value in zip(tukey_result._results_table.data[1:], tukey_result.pvalues):
            posthoc_results.append({
                'Comparison': f"{comparison[0]} vs {comparison[1]}",
                'Original p-value': None,  # Tukey HSD does not provide raw p-values
                'Adjusted p-value': p_value,
                'Adjusted Method': 'Tukey HSD',
                'Test Name': 'Tukey HSD'
            })
    else:
        # Dunn's test, once without and once with p-value adjustment
        raw_dunn_result = sp.posthoc_dunn(valid, val_col=data_column, group_col=grouping_column, p_adjust=None)
        adjusted_dunn_result = sp.posthoc_dunn(valid, val_col=data_column, group_col=grouping_column, p_adjust=p_adjust_method)
        for i, group_a in enumerate(adjusted_dunn_result.index):
            for j, group_b in enumerate(adjusted_dunn_result.columns):
                if i < j:  # Only consider unique pairs
                    posthoc_results.append({
                        'Comparison': f"{group_a} vs {group_b}",
                        'Original p-value': raw_dunn_result.iloc[i, j],
                        'Adjusted p-value': adjusted_dunn_result.iloc[i, j],
                        'Adjusted Method': p_adjust_method,
                        'Test Name': "Dunn's Post-hoc"
                    })

    return posthoc_results
166
+
167
def chi_pairwise(raw_counts, verbose=False):
    """
    Perform pairwise chi-square or Fisher's exact tests between all unique group pairs
    and apply p-value correction.

    Parameters:
    - raw_counts (DataFrame): Contingency table with group-wise counts
      (one row per group, one column per category).
    - verbose (bool): Whether to print results for each pair.

    Returns:
    - pairwise_df (DataFrame): Pairwise test results with columns 'Group 1',
      'Group 2', 'Test Name', 'p-value', 'p-value_adj' and 'adj'. Empty (but
      with those columns) when there are fewer than two groups.
    """
    groups = raw_counts.index.unique()  # Use index from raw_counts for group pairs

    # Calculate the number of groups and average number of data points per group
    num_groups = len(groups)
    num_data_points = raw_counts.sum(axis=1).mean()  # Average total data points per group
    p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)

    pairwise_results = []
    for group1, group2 in itertools.combinations(groups, 2):
        contingency_table = raw_counts.loc[[group1, group2]].values
        # A category that neither group of this pair contains gives a zero
        # expected frequency, which chi2_contingency rejects; drop it for this pair.
        contingency_table = contingency_table[:, contingency_table.sum(axis=0) > 0]
        n_categories = contingency_table.shape[1]

        if n_categories == 2:  # Fisher's Exact Test for 2x2 tables
            _, p_value = fisher_exact(contingency_table)
            test_name = "Fisher's Exact Test"
        elif n_categories > 2:  # Chi-Square Test for larger tables
            _, p_value, _, _ = chi2_contingency(contingency_table)
            test_name = 'Pairwise Chi-Square Test'
        else:
            # Fewer than two occupied categories: the groups cannot differ.
            p_value = 1.0
            test_name = 'Pairwise Chi-Square Test'

        pairwise_results.append({
            'Group 1': group1,
            'Group 2': group2,
            'Test Name': test_name,
            'p-value': p_value
        })

    if pairwise_results:
        # Apply p-value correction and attach the corrected values
        raw_p_values = [result['p-value'] for result in pairwise_results]
        corrected_p_values = multipletests(raw_p_values, method=p_adjust_method)[1]
        for result, corrected in zip(pairwise_results, corrected_p_values):
            result['p-value_adj'] = corrected
        pairwise_df = pd.DataFrame(pairwise_results)
    else:
        # No pair to test: skip the correction (it has nothing to correct) and
        # keep the usual column layout.
        pairwise_df = pd.DataFrame(columns=['Group 1', 'Group 2', 'Test Name', 'p-value', 'p-value_adj'])

    pairwise_df['adj'] = p_adjust_method

    if verbose:
        # Print pairwise results
        print("\nPairwise Frequency Analysis Results:")
        print(pairwise_df.to_string(index=False))

    return pairwise_df
spacr/submodules.py CHANGED
@@ -21,7 +21,7 @@ from sklearn.metrics import mean_absolute_error
21
21
  import matplotlib.pyplot as plt
22
22
  from natsort import natsorted
23
23
 
24
- def analyze_recruitment(settings={}):
24
+ def analyze_recruitment(settings):
25
25
  """
26
26
  Analyze recruitment data by grouping the DataFrame by well coordinates and plotting controls and recruitment data.
27
27
 
@@ -1041,7 +1041,7 @@ def analyze_class_proportion(settings):
1041
1041
  from .io import _read_and_merge_data
1042
1042
  from .settings import set_analyze_class_proportion_defaults
1043
1043
  from .plot import plot_plates, plot_proportion_stacked_bars
1044
- from .stats import perform_normality_tests, perform_levene_test, perform_statistical_tests, perform_posthoc_tests
1044
+ from .sp_stats import perform_normality_tests, perform_levene_test, perform_statistical_tests, perform_posthoc_tests
1045
1045
 
1046
1046
  settings = set_analyze_class_proportion_defaults(settings)
1047
1047
  save_settings(settings, name='analyze_class_proportion', show=True)