spacr 0.3.50__py3-none-any.whl → 0.3.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/gui_elements.py +1 -1
- spacr/gui_utils.py +4 -116
- spacr/io.py +114 -140
- spacr/measure.py +14 -12
- spacr/ml.py +41 -32
- spacr/plot.py +167 -307
- spacr/sequencing.py +13 -9
- spacr/settings.py +29 -39
- spacr/submodules.py +19 -19
- spacr/timelapse.py +16 -16
- spacr/toxo.py +180 -1
- spacr/utils.py +95 -164
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/METADATA +2 -1
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/RECORD +18 -18
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/LICENSE +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/WHEEL +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/top_level.txt +0 -0
spacr/settings.py
CHANGED
```diff
@@ -198,7 +198,7 @@ def set_default_umap_image_settings(settings={}):
     settings.setdefault('smooth_lines', True)
     settings.setdefault('clustering', 'dbscan')
     settings.setdefault('exclude', None)
-    settings.setdefault('col_to_compare', '
+    settings.setdefault('col_to_compare', 'column_name')
     settings.setdefault('pos', 'c1')
     settings.setdefault('neg', 'c2')
     settings.setdefault('embedding_by_controls', False)
@@ -246,7 +246,7 @@ def get_measure_crop_settings(settings={}):
     settings.setdefault('normalize_by','png')
     settings.setdefault('crop_mode',['cell'])
     settings.setdefault('dialate_pngs', False)
-    settings.setdefault('dialate_png_ratios', [0.2
+    settings.setdefault('dialate_png_ratios', [0.2])

     # Timelapsed settings
     settings.setdefault('timelapse', False)
@@ -289,7 +289,7 @@ def set_default_analyze_screen(settings):
     settings.setdefault('minimum_cell_count',25)
     settings.setdefault('n_estimators',100)
     settings.setdefault('test_size',0.2)
-    settings.setdefault('location_column','
+    settings.setdefault('location_column','column_name')
     settings.setdefault('positive_control','c2')
     settings.setdefault('negative_control','c1')
     settings.setdefault('exclude',None)
@@ -337,8 +337,9 @@ def set_default_train_test_model(settings):
     return settings

 def set_generate_training_dataset_defaults(settings):
-
+
     settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
     settings.setdefault('dataset_mode','metadata')
     settings.setdefault('annotation_column','test')
     settings.setdefault('annotated_classes',[1,2])
@@ -346,7 +347,7 @@ def set_generate_training_dataset_defaults(settings):
     settings.setdefault('size',224)
     settings.setdefault('test_split',0.1)
     settings.setdefault('class_metadata',[['c1'],['c2']])
-    settings.setdefault('metadata_type_by','
+    settings.setdefault('metadata_type_by','column_name')
     settings.setdefault('channel_of_interest',3)
     settings.setdefault('custom_measurement',None)
     settings.setdefault('tables',None)
@@ -369,7 +370,7 @@ def deep_spacr_defaults(settings):
     settings.setdefault('size',224)
     settings.setdefault('test_split',0.1)
     settings.setdefault('class_metadata',[['c1'],['c2']])
-    settings.setdefault('metadata_type_by','
+    settings.setdefault('metadata_type_by','column_name')
     settings.setdefault('channel_of_interest',3)
     settings.setdefault('custom_measurement',None)
     settings.setdefault('tables',None)
@@ -453,7 +454,7 @@ def get_analyze_recruitment_default_settings(settings):
     settings.setdefault('pathogen_plate_metadata',[['c1', 'c2', 'c3'],['c4','c5', 'c6']])
     settings.setdefault('treatments',['cm', 'lovastatin'])
     settings.setdefault('treatment_plate_metadata',[['r1', 'r2','r3'], ['r4', 'r5','r6']])
-    settings.setdefault('metadata_types',['
+    settings.setdefault('metadata_types',['column_name', 'column_name', 'row_name'])
     settings.setdefault('channel_dims',[0,1,2,3])
     settings.setdefault('cell_chann_dim',3)
     settings.setdefault('cell_mask_dim',4)
@@ -531,18 +532,22 @@ def get_perform_regression_default_settings(settings):
     settings.setdefault('score_data','list of paths')
     settings.setdefault('positive_control','239740')
     settings.setdefault('negative_control','233460')
+    settings.setdefault('min_n',0)
     settings.setdefault('controls',['000000_1','000000_10','000000_11','000000_12','000000_13','000000_14','000000_15','000000_16','000000_17','000000_18','000000_19','000000_20','000000_21','000000_22','000000_23','000000_24','000000_25','000000_26','000000_27','000000_28','000000_29','000000_3','000000_30','000000_31','000000_32','000000_4','000000_5','000000_6','000000_8','000000_9'])
-    settings.setdefault('fraction_threshold',
+    settings.setdefault('fraction_threshold',None)
     settings.setdefault('dependent_variable','pred')
     settings.setdefault('threshold_method','std')
     settings.setdefault('threshold_multiplier',3)
+    settings.setdefault('target_unique_count',5)
     settings.setdefault('transform',None)
+    settings.setdefault('log_x',False)
+    settings.setdefault('log_y',False)
+    settings.setdefault('x_lim',None)
     settings.setdefault('agg_type','mean')
-    settings.setdefault('min_cell_count',
+    settings.setdefault('min_cell_count',None)
     settings.setdefault('regression_type','ols')
     settings.setdefault('random_row_column_effects',False)
     settings.setdefault('split_axis_lims','')
-    settings.setdefault('plate','')
     settings.setdefault('cov_type',None)
     settings.setdefault('alpha',1)
     settings.setdefault('filter_value',['c1', 'c2', 'c3'])
@@ -557,6 +562,7 @@ def get_perform_regression_default_settings(settings):
     print(f"Using alpha as quantile for quantile regression, alpha: {settings['alpha']}")
     settings['agg_type'] = None
     print(f'agg_type set to None for quantile regression')
+
     return settings

 def get_check_cellpose_models_default_settings(settings):
```
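Taken together, these hunks fill in previously truncated defaults ('column_name' for the metadata-location keys) and add new regression knobs (min_n, target_unique_count, log_x, log_y, x_lim). Since everything goes through dict.setdefault, caller-supplied values always win; a minimal sketch of that behavior (plain Python, key names taken from the hunks above):

```python
# Sketch: setdefault only fills keys the caller did not provide.
settings = {'regression_type': 'quantile', 'alpha': 0.5}

settings.setdefault('min_n', 0)                  # new in 0.3.55
settings.setdefault('target_unique_count', 5)    # new in 0.3.55
settings.setdefault('log_x', False)              # new in 0.3.55
settings.setdefault('log_y', False)              # new in 0.3.55
settings.setdefault('x_lim', None)               # new in 0.3.55
settings.setdefault('fraction_threshold', None)  # now defaults to None
settings.setdefault('alpha', 1)                  # no-op: the caller already set it

print(settings['alpha'])  # 0.5 — the caller's value survives
```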
```diff
@@ -697,16 +703,6 @@ expected_types = {
     "overlay_chans": list,
     "overlay": bool,
     "normalization_percentiles": list,
-    "print_object_number": bool,
-    "nr": int,
-    "figuresize": int,
-    "cmap": str,
-    "test_mode": bool,
-    "test_images": int,
-    "remove_background_cell": bool,
-    "remove_background_nucleus": bool,
-    "remove_background_pathogen": bool,
-    "pathogen_model": (str, type(None)),
     "filter": bool,
     "fill_in":bool,
     "upscale": bool,
@@ -825,18 +821,6 @@ expected_types = {
     "transform": (str, type(None)),
     "agg_type": str,
     "min_cell_count": int,
-    "regression_type": str,
-    "random_row_column_effects": bool,
-    "alpha": float,
-    "fraction_threshold": float,
-    "class_1_threshold": (float, type(None)),
-    "batch_size": int,
-    "CP_prob": float,
-    "flow_threshold": float,
-    "percentiles": (list, type(None)),
-    "invert": bool,
-    "diameter": int,
-    "grayscale": bool,
     "resize": bool,
     "target_height": (int, type(None)),
     "target_width": (int, type(None)),
@@ -881,9 +865,6 @@ expected_types = {
     "metadata_type_by":str,
     "custom_measurement":str,
     "custom_model":bool,
-    "size":int,
-    "test_split":float,
-    "class_metadata":list, # This is a list of lists
     "png_type":str,
     "custom_model_path":str,
     "generate_training_dataset":bool,
@@ -894,6 +875,7 @@ expected_types = {
     "correlate":bool,
     "target_layer":str,
     "save_to_db":bool,
+    "test_mode":bool,
     "normalize_input":bool,
 }

@@ -904,7 +886,7 @@ categories = {"Paths":[ "src", "grna", "barcodes", "custom_model_path", "dataset
     "Nucleus": ["nucleus_intensity_range", "nucleus_size_range", "nucleus_chann_dim", "nucleus_channel", "nucleus_background", "nucleus_Signal_to_noise", "nucleus_CP_prob", "nucleus_FT", "remove_background_nucleus", "nucleus_min_size", "nucleus_mask_dim", "nucleus_loc"],
     "Pathogen": ["pathogen_intensity_range", "pathogen_size_range", "pathogen_chann_dim", "pathogen_channel", "pathogen_background", "pathogen_Signal_to_noise", "pathogen_CP_prob", "pathogen_FT", "pathogen_model", "remove_background_pathogen", "pathogen_min_size", "pathogen_mask_dim", "pathogens", "pathogen_loc", "pathogen_types", "pathogen_plate_metadata", ],
     "Measurements": ["remove_image_canvas", "remove_highly_correlated", "homogeneity", "homogeneity_distances", "radial_dist", "calculate_correlation", "manders_thresholds", "save_measurements", "tables", "image_nr", "dot_size", "filter_by", "remove_highly_correlated_features", "remove_low_variance_features", "channel_of_interest"],
-    "Object Image": ["save_png", "dialate_pngs", "dialate_png_ratios", "png_size", "png_dims", "save_arrays", "normalize_by", "crop_mode", "
+    "Object Image": ["save_png", "dialate_pngs", "dialate_png_ratios", "png_size", "png_dims", "save_arrays", "normalize_by", "crop_mode", "normalize", "use_bounding_box"],
     "Sequencing": ["signal_direction","mode","comp_level","comp_type","save_h5","expected_end","offset","target_sequence","regex", "highlight"],
     "Generate Dataset":["save_to_db","file_metadata","class_metadata", "annotation_column","annotated_classes", "dataset_mode", "metadata_type_by","custom_measurement", "sample", "size"],
     "Hyperparamiters (Training)": ["png_type", "score_threshold","file_type", "train_channels", "epochs", "loss_type", "optimizer_type","image_size","val_split","learning_rate","weight_decay","dropout_rate", "init_weights", "train", "classes", "augment", "amsgrad","use_checkpoint","gradient_accumulation","gradient_accumulation_steps","intermedeate_save","pin_memory"],
@@ -939,6 +921,9 @@ def check_settings(vars_dict, expected_types, q=None):
             continue

         value = var.get()
+        if value == 'None':
+            value = None
+
         expected_type = expected_types.get(key, str)

         try:
@@ -953,14 +938,19 @@ def check_settings(vars_dict, expected_types, q=None):
                 # settings[key] = None
                 else:
                     raise ValueError("Invalid format for list or list of lists")
+
             elif expected_type == list:
                 settings[key] = parse_list(value) if value else None
+
+                if isinstance(settings[key], list) and len(settings[key]) == 1:
+                    settings[key] = settings[key][0]
+
             elif expected_type == bool:
                 settings[key] = value if isinstance(value, bool) else value.lower() in ['true', '1', 't', 'y', 'yes']
             elif expected_type == (int, type(None)):
-                settings[key] = int(value) if value else None
+                settings[key] = settings[key] = int(value) if isinstance(value, int) or str(value).isdigit() else None
             elif expected_type == (float, type(None)):
-                settings[key] = float(value) if value else None
+                settings[key] = float(value) if isinstance(value, float) or (isinstance(value, str) and value.replace(".", "", 1).isdigit()) else None
             elif expected_type == (int, float):
                 settings[key] = float(value) if '.' in value else int(value)
             elif expected_type == (str, type(None)):
@@ -1000,7 +990,7 @@ def check_settings(vars_dict, expected_types, q=None):
                 settings[key] = expected_type(value) if value else None
             except (ValueError, SyntaxError) as e:
                 expected_type_name = ' or '.join([t.__name__ for t in expected_type]) if isinstance(expected_type, tuple) else expected_type.__name__
-                q.put(f"Error: Invalid format for {key}. Expected type: {expected_type_name}. Error: {e}")
+                q.put(f"Error: Invalid format for {key}. Expected type: {expected_type_name}. Error: {e}, Value entered: {value}")
                 return

     return settings
```
spacr/submodules.py
CHANGED
```diff
@@ -341,17 +341,17 @@ def count_phenotypes(settings):
     unique_values_count = df[settings['annotation_column']].nunique(dropna=True)
     print(f"Unique values in {settings['annotation_column']} (excluding NaN): {unique_values_count}")

-    # Count unique values in 'value' column, grouped by 'plate', '
-    grouped_unique_count = df.groupby(['plate', '
+    # Count unique values in 'value' column, grouped by 'plate', 'row_name', 'column'
+    grouped_unique_count = df.groupby(['plate', 'row_name', 'column'])[settings['annotation_column']].nunique(dropna=True).reset_index(name='unique_count')
     display(grouped_unique_count)

     save_path = os.path.join(settings['src'], 'phenotype_counts.csv')

     # Group by plate, row, and column, then count the occurrences of each unique value
-    grouped_counts = df.groupby(['plate', '
+    grouped_counts = df.groupby(['plate', 'row_name', 'column', 'value']).size().reset_index(name='count')

     # Pivot the DataFrame so that unique values are columns and their counts are in the rows
-    pivot_df = grouped_counts.pivot_table(index=['plate', '
+    pivot_df = grouped_counts.pivot_table(index=['plate', 'row_name', 'column'], columns='value', values='count', fill_value=0)

     # Flatten the multi-level columns
     pivot_df.columns = [f"value_{int(col)}" for col in pivot_df.columns]
```
```diff
@@ -376,17 +376,17 @@ def compare_reads_to_scores(reads_csv, scores_csv, empirical_dict={'r1':(90,10),
                             column='column', value='c3', plate=None, save_paths=None):

     def calculate_well_score_fractions(df, class_columns='cv_predictions'):
-        if all(col in df.columns for col in ['plate', '
-            df['prc'] = df['plate'] + '_' + df['
+        if all(col in df.columns for col in ['plate', 'row_name', 'column']):
+            df['prc'] = df['plate'] + '_' + df['row_name'] + '_' + df['column']
         else:
-            raise ValueError("Cannot find 'plate', '
-        prc_summary = df.groupby(['plate', '
-        well_counts = (df.groupby(['plate', '
+            raise ValueError("Cannot find 'plate', 'row_name', or 'column' in df.columns")
+        prc_summary = df.groupby(['plate', 'row_name', 'column', 'prc']).size().reset_index(name='total_rows')
+        well_counts = (df.groupby(['plate', 'row_name', 'column', 'prc', class_columns])
                        .size()
                        .unstack(fill_value=0)
                        .reset_index()
                        .rename(columns={0: 'class_0', 1: 'class_1'}))
-        summary_df = pd.merge(prc_summary, well_counts, on=['plate', '
+        summary_df = pd.merge(prc_summary, well_counts, on=['plate', 'row_name', 'column', 'prc'], how='left')
         summary_df['class_0_fraction'] = summary_df['class_0'] / summary_df['total_rows']
         summary_df['class_1_fraction'] = summary_df['class_1'] / summary_df['total_rows']
         return summary_df
@@ -481,8 +481,8 @@ def compare_reads_to_scores(reads_csv, scores_csv, empirical_dict={'r1':(90,10),
         return result

     def calculate_well_read_fraction(df, count_column='count'):
-        if all(col in df.columns for col in ['plate', '
-            df['prc'] = df['plate'] + '_' + df['
+        if all(col in df.columns for col in ['plate', 'row_name', 'column']):
+            df['prc'] = df['plate'] + '_' + df['row_name'] + '_' + df['column']
         else:
             raise ValueError("Cannot find plate, row or column in df.columns")
         grouped_df = df.groupby('prc')[count_column].sum().reset_index()
@@ -501,18 +501,18 @@ def compare_reads_to_scores(reads_csv, scores_csv, empirical_dict={'r1':(90,10),
         reads_df_temp['plate'] = f"plate{i+1}"
         scores_df_temp['plate'] = f"plate{i+1}"

-        if 'col' in reads_df_temp.columns:
-            reads_df_temp = reads_df_temp.rename(columns={'col': 'column'})
         if 'column_name' in reads_df_temp.columns:
             reads_df_temp = reads_df_temp.rename(columns={'column_name': 'column'})
-        if '
-
+        if 'column_name' in reads_df_temp.columns:
+            reads_df_temp = reads_df_temp.rename(columns={'column_name': 'column'})
+        if 'column_name' in scores_df_temp.columns:
+            scores_df_temp = scores_df_temp.rename(columns={'column_name': 'column'})
         if 'column_name' in scores_df_temp.columns:
             scores_df_temp = scores_df_temp.rename(columns={'column_name': 'column'})
         if 'row_name' in reads_df_temp.columns:
-            reads_df_temp = reads_df_temp.rename(columns={'row_name': '
+            reads_df_temp = reads_df_temp.rename(columns={'row_name': 'row_name'})
         if 'row_name' in scores_df_temp.columns:
-            scores_df_temp = scores_df_temp.rename(columns={'row_name': '
+            scores_df_temp = scores_df_temp.rename(columns={'row_name': 'row_name'})

         reads_ls.append(reads_df_temp)
         scores_ls.append(scores_df_temp)
@@ -539,7 +539,7 @@ def compare_reads_to_scores(reads_csv, scores_csv, empirical_dict={'r1':(90,10),

     df_emp = pd.DataFrame([(key, val[0], val[1], val[0] / (val[0] + val[1]), val[1] / (val[0] + val[1])) for key, val in empirical_dict.items()],columns=['key', 'value1', 'value2', 'pc_fraction', 'nc_fraction'])

-    df = pd.merge(df, df_emp, left_on='
+    df = pd.merge(df, df_emp, left_on='row_name', right_on='key')

     if any in y_columns not in df.columns:
         print(f"columns in dataframe:")
```
spacr/timelapse.py
CHANGED
```diff
@@ -533,14 +533,14 @@ def exponential_decay(x, a, b, c):

 def preprocess_pathogen_data(pathogen_df):
     # Group by identifiers and count the number of parasites
-    parasite_counts = pathogen_df.groupby(['plate', '
+    parasite_counts = pathogen_df.groupby(['plate', 'row_name', 'column_name', 'field', 'timeid', 'pathogen_cell_id']).size().reset_index(name='parasite_count')

     # Aggregate numerical columns and take the first of object columns
-    agg_funcs = {col: 'mean' if np.issubdtype(pathogen_df[col].dtype, np.number) else 'first' for col in pathogen_df.columns if col not in ['plate', '
-    pathogen_agg = pathogen_df.groupby(['plate', '
+    agg_funcs = {col: 'mean' if np.issubdtype(pathogen_df[col].dtype, np.number) else 'first' for col in pathogen_df.columns if col not in ['plate', 'row_name', 'column_name', 'field', 'timeid', 'pathogen_cell_id', 'parasite_count']}
+    pathogen_agg = pathogen_df.groupby(['plate', 'row_name', 'column_name', 'field', 'timeid', 'pathogen_cell_id']).agg(agg_funcs).reset_index()

     # Merge the counts back into the aggregated data
-    pathogen_agg = pathogen_agg.merge(parasite_counts, on=['plate', '
+    pathogen_agg = pathogen_agg.merge(parasite_counts, on=['plate', 'row_name', 'column_name', 'field', 'timeid', 'pathogen_cell_id'])

     # Remove the object_label column as it corresponds to the pathogen ID not the cell ID
     if 'object_label' in pathogen_agg.columns:
```
```diff
@@ -604,10 +604,10 @@ def save_results_dataframe(df, src, results_name):
 def summarize_per_well(peak_details_df):
     # Step 1: Split the 'ID' column
     split_columns = peak_details_df['ID'].str.split('_', expand=True)
-    peak_details_df[['plate', '
+    peak_details_df[['plate', 'row_name', 'column', 'field', 'object_number']] = split_columns

-    # Step 2: Create 'well_ID' by combining '
-    peak_details_df['well_ID'] = peak_details_df['
+    # Step 2: Create 'well_ID' by combining 'row_name' and 'column'
+    peak_details_df['well_ID'] = peak_details_df['row_name'] + '_' + peak_details_df['column']

     # Filter entries where 'amplitude' is not null
     filtered_df = peak_details_df[peak_details_df['amplitude'].notna()]
@@ -635,10 +635,10 @@ def summarize_per_well(peak_details_df):
 def summarize_per_well_inf_non_inf(peak_details_df):
     # Step 1: Split the 'ID' column
     split_columns = peak_details_df['ID'].str.split('_', expand=True)
-    peak_details_df[['plate', '
+    peak_details_df[['plate', 'row_name', 'column', 'field', 'object_number']] = split_columns

-    # Step 2: Create 'well_ID' by combining '
-    peak_details_df['well_ID'] = peak_details_df['
+    # Step 2: Create 'well_ID' by combining 'row_name' and 'column'
+    peak_details_df['well_ID'] = peak_details_df['row_name'] + '_' + peak_details_df['column']

     # Assume 'pathogen_count' indicates infection if > 0
     # Add an 'infected_status' column to classify cells
@@ -669,7 +669,7 @@ def analyze_calcium_oscillations(db_loc, measurement='cell_channel_1_mean_intens
     pathogen_df = pd.read_sql("SELECT * FROM pathogen", conn)
     pathogen_df['pathogen_cell_id'] = pathogen_df['pathogen_cell_id'].astype(float).astype('Int64')
     pathogen_df = preprocess_pathogen_data(pathogen_df)
-    cell_df = cell_df.merge(pathogen_df, on=['plate', '
+    cell_df = cell_df.merge(pathogen_df, on=['plate', 'row_name', 'column_name', 'field', 'timeid', 'object_label'], how='left', suffixes=('', '_pathogen'))
     cell_df['parasite_count'] = cell_df['parasite_count'].fillna(0)
     print(f'After pathogen merge: {len(cell_df)} objects')

@@ -677,7 +677,7 @@ def analyze_calcium_oscillations(db_loc, measurement='cell_channel_1_mean_intens
     if cytoplasm:
         cytoplasm_df = pd.read_sql(f"SELECT * FROM {'cytoplasm'}", conn)
         # Merge on specified columns
-        cell_df = cell_df.merge(cytoplasm_df, on=['plate', '
+        cell_df = cell_df.merge(cytoplasm_df, on=['plate', 'row_name', 'column_name', 'field', 'timeid', 'object_label'], how='left', suffixes=('', '_cytoplasm'))

         print(f'After cytoplasm merge: {len(cell_df)} objects')

@@ -687,12 +687,12 @@ def analyze_calcium_oscillations(db_loc, measurement='cell_channel_1_mean_intens
     # Prepare DataFrame (use cell_df instead of df)
     prcf_components = cell_df['prcf'].str.split('_', expand=True)
     cell_df['plate'] = prcf_components[0]
-    cell_df['
+    cell_df['row_name'] = prcf_components[1]
     cell_df['column'] = prcf_components[2]
     cell_df['field'] = prcf_components[3]
     cell_df['time'] = prcf_components[4].str.extract('t(\d+)').astype(int)
     cell_df['object_number'] = cell_df['object_label']
-    cell_df['plate_row_column_field_object'] = cell_df['plate'].astype(str) + '_' + cell_df['
+    cell_df['plate_row_column_field_object'] = cell_df['plate'].astype(str) + '_' + cell_df['row_name'].astype(str) + '_' + cell_df['column'].astype(str) + '_' + cell_df['field'].astype(str) + '_' + cell_df['object_label'].astype(str)

     df = cell_df.copy()

@@ -753,7 +753,7 @@ def analyze_calcium_oscillations(db_loc, measurement='cell_channel_1_mean_intens
             peak_details_list.append({
                 'ID': unique_id,
                 'plate': group['plate'].iloc[0],
-                '
+                'row_name': group['row_name'].iloc[0],
                 'column': group['column'].iloc[0],
                 'field': group['field'].iloc[0],
                 'object_number': group['object_number'].iloc[0],
@@ -784,7 +784,7 @@ def analyze_calcium_oscillations(db_loc, measurement='cell_channel_1_mean_intens
             peak_details_list.append({
                 'ID': unique_id,
                 'plate': group['plate'].iloc[0],
-                '
+                'row_name': group['row_name'].iloc[0],
                 'column': group['column'].iloc[0],
                 'field': group['field'].iloc[0],
                 'object_number': group['object_number'].iloc[0],
```
spacr/toxo.py
CHANGED
```diff
@@ -10,6 +10,17 @@ from matplotlib.legend import Legend
 from matplotlib.transforms import Bbox
 from brokenaxes import brokenaxes

+import os
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from scipy.spatial.distance import cosine
+from scipy.stats import pearsonr
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import mean_absolute_error
+

 from matplotlib.gridspec import GridSpec
```
```diff
@@ -450,4 +461,172 @@ def plot_gene_heatmaps(data, gene_list, columns, x_column='Gene ID', normalize=F
     plt.savefig(save_path, format='pdf', dpi=600, bbox_inches='tight')
     print(f"Figure saved to {save_path}")

-    plt.show()
+    plt.show()
+
+def generate_score_heatmap(settings):
+
+    def group_cv_score(csv, plate=1, column='c3', data_column='pred'):
+
+        df = pd.read_csv(csv)
+        if 'column_name' in df.columns:
+            df = df[df['column_name']==column]
+        elif 'column' in df.columns:
+            df['column_name'] = df['column']
+            df = df[df['column_name']==column]
+        if not plate is None:
+            df['plate'] = f"plate{plate}"
+        grouped_df = df.groupby(['plate', 'row_name', 'column_name'])[data_column].mean().reset_index()
+        grouped_df['prc'] = grouped_df['plate'].astype(str) + '_' + grouped_df['row_name'].astype(str) + '_' + grouped_df['column_name'].astype(str)
+        return grouped_df
+
+    def calculate_fraction_mixed_condition(csv, plate=1, column='c3', control_sgrnas = ['TGGT1_220950_1', 'TGGT1_233460_4']):
+        df = pd.read_csv(csv)
+        df = df[df['column_name']==column]
+        if plate not in df.columns:
+            df['plate'] = f"plate{plate}"
+        df = df[df['grna_name'].str.match(f'^{control_sgrnas[0]}$|^{control_sgrnas[1]}$')]
+        grouped_df = df.groupby(['plate', 'row_name', 'column_name'])['count'].sum().reset_index()
+        grouped_df = grouped_df.rename(columns={'count': 'total_count'})
+        merged_df = pd.merge(df, grouped_df, on=['plate', 'row_name', 'column_name'])
+        merged_df['fraction'] = merged_df['count'] / merged_df['total_count']
+        merged_df['prc'] = merged_df['plate'].astype(str) + '_' + merged_df['row_name'].astype(str) + '_' + merged_df['column_name'].astype(str)
+        return merged_df
+
+    def plot_multi_channel_heatmap(df, column='c3'):
+        """
+        Plot a heatmap with multiple channels as columns.
+
+        Parameters:
+        - df: DataFrame with scores for different channels.
+        - column: Column to filter by (default is 'c3').
+        """
+        # Extract row number and convert to integer for sorting
+        df['row_num'] = df['row_name'].str.extract(r'(\d+)').astype(int)
+
+        # Filter and sort by plate, row, and column
+        df = df[df['column_name'] == column]
+        df = df.sort_values(by=['plate', 'row_num', 'column_name'])
+
+        # Drop temporary 'row_num' column after sorting
+        df = df.drop('row_num', axis=1)
+
+        # Create a new column combining plate, row, and column for the index
+        df['plate_row_col'] = df['plate'] + '-' + df['row_name'] + '-' + df['column_name']
+
+        # Set 'plate_row_col' as the index
+        df.set_index('plate_row_col', inplace=True)
+
+        # Extract only numeric data for the heatmap
+        heatmap_data = df.select_dtypes(include=[float, int])
+
+        # Plot heatmap with square boxes, no annotations, and 'viridis' colormap
+        plt.figure(figsize=(12, 8))
+        sns.heatmap(
+            heatmap_data,
+            cmap="viridis",
+            cbar=True,
+            square=True,
+            annot=False
+        )
+
+        plt.title("Heatmap of Prediction Scores for All Channels")
+        plt.xlabel("Channels")
+        plt.ylabel("Plate-Row-Column")
+        plt.tight_layout()
+
+        # Save the figure object and return it
+        fig = plt.gcf()
+        plt.show()
+
+        return fig
+
+
+    def combine_classification_scores(folders, csv_name, data_column, plate=1, column='c3'):
+        # Ensure `folders` is a list
+        if isinstance(folders, str):
+            folders = [folders]
+
+        ls = []  # Initialize ls to store found CSV file paths
+
+        # Iterate over the provided folders
+        for folder in folders:
+            sub_folders = os.listdir(folder)  # Get sub-folder list
+            for sub_folder in sub_folders:  # Iterate through sub-folders
+                path = os.path.join(folder, sub_folder)  # Join the full path
+
+                if os.path.isdir(path):  # Check if it's a directory
+                    csv = os.path.join(path, csv_name)  # Join path to the CSV file
+                    if os.path.exists(csv):  # If CSV exists, add to list
+                        ls.append(csv)
+                    else:
+                        print(f'No such file: {csv}')
+
+        # Initialize combined DataFrame
+        combined_df = None
+        print(f'Found {len(ls)} CSV files')
+
+        # Loop through all collected CSV files and process them
+        for csv_file in ls:
+            df = pd.read_csv(csv_file)  # Read CSV into DataFrame
+            df = df[df['column_name']==column]
+            if not plate is None:
+                df['plate'] = f"plate{plate}"
+            # Group the data by 'plate', 'row_name', and 'column_name'
+            grouped_df = df.groupby(['plate', 'row_name', 'column_name'])[data_column].mean().reset_index()
+            # Use the CSV filename to create a new column name
+            folder_name = os.path.dirname(csv_file).replace(".csv", "")
+            new_column_name = os.path.basename(f"{folder_name}_{data_column}")
+            print(new_column_name)
+            grouped_df = grouped_df.rename(columns={data_column: new_column_name})
+
+            # Merge into the combined DataFrame
+            if combined_df is None:
+                combined_df = grouped_df
+            else:
+                combined_df = pd.merge(combined_df, grouped_df, on=['plate', 'row_name', 'column_name'], how='outer')
+        combined_df['prc'] = combined_df['plate'].astype(str) + '_' + combined_df['row_name'].astype(str) + '_' + combined_df['column_name'].astype(str)
+        return combined_df
+
+    def calculate_mae(df):
+        """
+        Calculate the MAE between each channel's predictions and the fraction column for all rows.
+        """
+        # Extract numeric columns excluding 'fraction' and 'prc'
+        channels = df.drop(columns=['fraction', 'prc']).select_dtypes(include=[float, int])
+
+        mae_data = []
+
+        # Compute MAE for each channel with 'fraction' for all rows
+        for column in channels.columns:
+            for index, row in df.iterrows():
+                mae = mean_absolute_error([row['fraction']], [row[column]])
+                mae_data.append({'Channel': column, 'MAE': mae, 'Row': row['prc']})
+
+        # Convert the list of dictionaries to a DataFrame
+        mae_df = pd.DataFrame(mae_data)
+        return mae_df
+
+    result_df = combine_classification_scores(settings['folders'], settings['csv_name'], settings['data_column'], settings['plate'], settings['column'], )
+    df = calculate_fraction_mixed_condition(settings['csv'], settings['plate'], settings['column'], settings['control_sgrnas'])
+    df = df[df['grna_name']==settings['fraction_grna']]
+    fraction_df = df[['fraction', 'prc']]
+    merged_df = pd.merge(fraction_df, result_df, on=['prc'])
+    cv_df = group_cv_score(settings['cv_csv'], settings['plate'], settings['column'], settings['data_column_cv'])
+    cv_df = cv_df[[settings['data_column_cv'], 'prc']]
+    merged_df = pd.merge(merged_df, cv_df, on=['prc'])
+
+    fig = plot_multi_channel_heatmap(merged_df, settings['column'])
+    if 'row_number' in merged_df.columns:
+        merged_df = merged_df.drop('row_num', axis=1)
+    mae_df = calculate_mae(merged_df)
+    if 'row_number' in mae_df.columns:
+        mae_df = mae_df.drop('row_num', axis=1)
+
+    if not settings['dst'] is None:
+        mae_dst = os.path.join(settings['dst'], f"mae_scores_comparison_plate_{settings['plate']}.csv")
+        merged_dst = os.path.join(settings['dst'], f"scores_comparison_plate_{settings['plate']}_data.csv")
+        heatmap_save = os.path.join(settings['dst'], f"scores_comparison_plate_{settings['plate']}.pdf")
+        mae_df.to_csv(mae_dst, index=False)
+        merged_df.to_csv(merged_dst, index=False)
+        fig.savefig(heatmap_save, format='pdf', dpi=600, bbox_inches='tight')
+    return merged_df
```