spacr 0.3.38__py3-none-any.whl → 0.3.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +1 -1
- spacr/io.py +20 -13
- spacr/measure.py +4 -4
- spacr/ml.py +53 -44
- spacr/plot.py +421 -37
- spacr/settings.py +18 -13
- spacr/toxo.py +223 -16
- spacr/utils.py +7 -5
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/METADATA +1 -1
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/RECORD +14 -14
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/LICENSE +0 -0
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/WHEEL +0 -0
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.38.dist-info → spacr-0.3.42.dist-info}/top_level.txt +0 -0
spacr/core.py
CHANGED
@@ -143,7 +143,7 @@ def preprocess_generate_masks(src, settings={}):
|
|
143
143
|
start = time.time()
|
144
144
|
if i+1 <= settings['examples_to_plot']:
|
145
145
|
file_path = os.path.join(merged_src, file)
|
146
|
-
plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10,
|
146
|
+
plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10, percentiles=(1,99), thickness=3, save_pdf=True)
|
147
147
|
stop = time.time()
|
148
148
|
duration = stop-start
|
149
149
|
time_ls.append(duration)
|
spacr/io.py
CHANGED
@@ -1686,11 +1686,16 @@ def preprocess_img_data(settings):
|
|
1686
1686
|
print(f'Found {extension_counts[most_common_extension]} {most_common_extension} files')
|
1687
1687
|
else:
|
1688
1688
|
print(f'Could not find any {valid_ext} files in {src} only found {extension_counts[0]}')
|
1689
|
-
|
1689
|
+
|
1690
|
+
|
1691
|
+
|
1692
|
+
|
1693
|
+
|
1694
|
+
if os.path.exists(os.path.join(src,'stack')):
|
1690
1695
|
print('Found existing stack folder.')
|
1691
|
-
if os.path.exists(src
|
1696
|
+
if os.path.exists(os.path.join(src,'channel_stack')):
|
1692
1697
|
print('Found existing channel_stack folder.')
|
1693
|
-
if os.path.exists(src
|
1698
|
+
if os.path.exists(os.path.join(src,'norm_channel_stack')):
|
1694
1699
|
print('Found existing norm_channel_stack folder. Skipping preprocessing')
|
1695
1700
|
return settings, src
|
1696
1701
|
|
@@ -1713,12 +1718,13 @@ def preprocess_img_data(settings):
|
|
1713
1718
|
|
1714
1719
|
src = _run_test_mode(settings['src'], regex, timelapse, test_images, random_test)
|
1715
1720
|
settings['src'] = src
|
1716
|
-
|
1721
|
+
|
1722
|
+
stack_path = os.path.join(src, 'stack')
|
1717
1723
|
if img_format == None:
|
1718
|
-
if not os.path.exists(
|
1724
|
+
if not os.path.exists(stack_path):
|
1719
1725
|
_merge_channels(src, plot=False)
|
1720
|
-
|
1721
|
-
if not os.path.exists(
|
1726
|
+
|
1727
|
+
if not os.path.exists(stack_path):
|
1722
1728
|
try:
|
1723
1729
|
if not img_format == None:
|
1724
1730
|
if timelapse:
|
@@ -1727,7 +1733,7 @@ def preprocess_img_data(settings):
|
|
1727
1733
|
_rename_and_organize_image_files(src, regex, batch_size, pick_slice, skip_mode, metadata_type, img_format)
|
1728
1734
|
|
1729
1735
|
#Make sure no batches will be of only one image
|
1730
|
-
all_imgs = len(
|
1736
|
+
all_imgs = len(stack_path)
|
1731
1737
|
full_batches = all_imgs // batch_size
|
1732
1738
|
last_batch_size = all_imgs % batch_size
|
1733
1739
|
|
@@ -1738,26 +1744,27 @@ def preprocess_img_data(settings):
|
|
1738
1744
|
raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
|
1739
1745
|
# If the last batch is of size 1, merge it with the second last batch
|
1740
1746
|
elif full_batches > 0:
|
1747
|
+
print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
|
1741
1748
|
raise ValueError("Last batch of size 1 detected. Adjust the batch size.")
|
1742
1749
|
|
1743
1750
|
_merge_channels(src, plot=False)
|
1744
1751
|
|
1745
1752
|
if timelapse:
|
1746
|
-
_create_movies_from_npy_per_channel(
|
1753
|
+
_create_movies_from_npy_per_channel(stack_path, fps=2)
|
1747
1754
|
|
1748
1755
|
if plot:
|
1749
1756
|
print(f'plotting {nr} images from {src}/stack')
|
1750
|
-
plot_arrays(
|
1757
|
+
plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
|
1751
1758
|
|
1752
1759
|
if all_to_mip:
|
1753
|
-
_mip_all(
|
1760
|
+
_mip_all(stack_path)
|
1754
1761
|
if plot:
|
1755
1762
|
print(f'plotting {nr} images from {src}/stack')
|
1756
|
-
plot_arrays(
|
1763
|
+
plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
|
1757
1764
|
except Exception as e:
|
1758
1765
|
print(f"Error: {e}")
|
1759
1766
|
|
1760
|
-
concatenate_and_normalize(src=
|
1767
|
+
concatenate_and_normalize(src=stack_path,
|
1761
1768
|
channels=mask_channels,
|
1762
1769
|
save_dtype=np.float32,
|
1763
1770
|
settings=settings)
|
spacr/measure.py
CHANGED
@@ -710,7 +710,7 @@ def _measure_crop_core(index, time_ls, file, settings):
|
|
710
710
|
else:
|
711
711
|
cell_mask = np.zeros_like(data[:, :, 0])
|
712
712
|
settings['cytoplasm'] = False
|
713
|
-
settings['
|
713
|
+
settings['uninfected'] = True
|
714
714
|
|
715
715
|
if settings['nucleus_mask_dim'] is not None:
|
716
716
|
nucleus_mask = data[:, :, settings['nucleus_mask_dim']].astype(data_type)
|
@@ -762,7 +762,7 @@ def _measure_crop_core(index, time_ls, file, settings):
|
|
762
762
|
cytoplasm_mask = _filter_object(cytoplasm_mask, settings['cytoplasm_min_size'])
|
763
763
|
|
764
764
|
if settings['cell_mask_dim'] is not None:
|
765
|
-
cell_mask, nucleus_mask, pathogen_mask, cytoplasm_mask = _exclude_objects(cell_mask, nucleus_mask, pathogen_mask, cytoplasm_mask,
|
765
|
+
cell_mask, nucleus_mask, pathogen_mask, cytoplasm_mask = _exclude_objects(cell_mask, nucleus_mask, pathogen_mask, cytoplasm_mask, uninfected=settings['uninfected'])
|
766
766
|
|
767
767
|
# Update data with the new masks
|
768
768
|
if settings['cell_mask_dim'] is not None:
|
@@ -979,9 +979,9 @@ def measure_crop(settings):
|
|
979
979
|
#_create_database(source_folder+'/measurements/measurements.db')
|
980
980
|
|
981
981
|
if settings['cell_mask_dim'] is None:
|
982
|
-
settings['
|
982
|
+
settings['uninfected'] = True
|
983
983
|
if settings['pathogen_mask_dim'] is None:
|
984
|
-
settings['
|
984
|
+
settings['uninfected'] = True
|
985
985
|
if settings['cell_mask_dim'] is not None and settings['pathogen_min_size'] is not None:
|
986
986
|
settings['cytoplasm'] = True
|
987
987
|
elif settings['cell_mask_dim'] is not None and settings['nucleus_min_size'] is not None:
|
spacr/ml.py
CHANGED
@@ -134,7 +134,7 @@ def scale_variables(X, y):
|
|
134
134
|
|
135
135
|
return X_scaled, y_scaled
|
136
136
|
|
137
|
-
def process_model_coefficients(model, regression_type, X, y,
|
137
|
+
def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
|
138
138
|
"""Return DataFrame of model coefficients and p-values."""
|
139
139
|
if regression_type in ['ols', 'gls', 'wls', 'rlm', 'glm', 'mixed', 'quantile', 'logit', 'probit', 'poisson']:
|
140
140
|
coefs = model.params
|
@@ -169,8 +169,8 @@ def process_model_coefficients(model, regression_type, X, y, highlight):
|
|
169
169
|
coef_df['p_value'] = np.nan # Placeholder since sklearn doesn't provide p-values
|
170
170
|
|
171
171
|
coef_df['-log10(p_value)'] = -np.log10(coef_df['p_value'])
|
172
|
-
coef_df['
|
173
|
-
|
172
|
+
coef_df['grna'] = coef_df['feature'].str.extract(r'\[(.*?)\]')[0]
|
173
|
+
coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
|
174
174
|
return coef_df[~coef_df['feature'].str.contains('row|column')]
|
175
175
|
|
176
176
|
def prepare_formula(dependent_variable, random_row_column_effects=False):
|
@@ -284,15 +284,13 @@ def check_and_clean_data(df, dependent_variable):
|
|
284
284
|
df_cleaned['row'] = df['row']
|
285
285
|
df_cleaned['column'] = df['column']
|
286
286
|
|
287
|
-
#display(df_cleaned)
|
288
|
-
|
289
287
|
# Create a new column 'gene_fraction' that sums the fractions by gene within the same well
|
290
288
|
df_cleaned['gene_fraction'] = df_cleaned.groupby(['prc', 'gene'])['fraction'].transform('sum')
|
291
289
|
|
292
290
|
print("Data is ready for model fitting.")
|
293
291
|
return df_cleaned
|
294
292
|
|
295
|
-
def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False,
|
293
|
+
def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False, nc='233460', pc='220950', controls=[''], dst=None, cov_type=None, plot=False):
|
296
294
|
from .plot import volcano_plot, plot_histogram
|
297
295
|
|
298
296
|
# Generate the volcano filename
|
@@ -312,9 +310,7 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
|
|
312
310
|
if regression_type is None:
|
313
311
|
regression_type = 'ols' if is_normal else 'glm'
|
314
312
|
|
315
|
-
#display('before check_and_clean_data:',df)
|
316
313
|
df = check_and_clean_data(df, dependent_variable)
|
317
|
-
#display('after check_and_clean_data:',df)
|
318
314
|
|
319
315
|
# Handle mixed effects if row/column effect is treated as random
|
320
316
|
if random_row_column_effects:
|
@@ -340,10 +336,10 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
|
|
340
336
|
model = regression_model(X, y, regression_type=regression_type, groups=groups, alpha=alpha, cov_type=cov_type)
|
341
337
|
|
342
338
|
# Process the model coefficients
|
343
|
-
coef_df = process_model_coefficients(model, regression_type, X, y,
|
344
|
-
|
345
|
-
|
346
|
-
|
339
|
+
coef_df = process_model_coefficients(model, regression_type, X, y, nc, pc, controls)
|
340
|
+
|
341
|
+
if plot:
|
342
|
+
volcano_plot(coef_df, volcano_path)
|
347
343
|
|
348
344
|
return model, coef_df
|
349
345
|
|
@@ -453,26 +449,28 @@ def perform_regression(settings):
|
|
453
449
|
return df, n_gene
|
454
450
|
else:
|
455
451
|
return df
|
452
|
+
|
453
|
+
|
456
454
|
|
457
455
|
settings = get_perform_regression_default_settings(settings)
|
458
456
|
count_data_df, score_data_df = _perform_regression_read_data(settings)
|
459
457
|
results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
|
460
458
|
save_settings(settings, name='regression', show=True)
|
459
|
+
|
460
|
+
if isinstance(settings['filter_value'], list):
|
461
|
+
filter_value = settings['filter_value']
|
462
|
+
else:
|
463
|
+
filter_value = []
|
464
|
+
if isinstance(settings['filter_column'], str):
|
465
|
+
filter_column = settings['filter_column']
|
461
466
|
|
462
|
-
score_data_df = clean_controls(score_data_df, settings['
|
467
|
+
score_data_df = clean_controls(score_data_df, settings['filter_value'], settings['filter_column'])
|
463
468
|
print(f"Dependent variable after clean_controls: {len(score_data_df)}")
|
464
469
|
|
465
470
|
dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
|
466
471
|
print(f"Dependent variable after process_scores: {len(dependent_df)}")
|
467
472
|
|
468
|
-
|
469
|
-
|
470
|
-
if settings['other'] is not None:
|
471
|
-
if isinstance(settings['other'], str):
|
472
|
-
settings['other'] = [settings['other']]
|
473
|
-
filter_value.extend(settings['other'])
|
474
|
-
|
475
|
-
independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=settings['location_column'], filter_value=filter_value)
|
473
|
+
independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=filter_column, filter_value=filter_value)
|
476
474
|
independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
|
477
475
|
|
478
476
|
print(f"Independent variable after process_reads: {len(independent_df)}")
|
@@ -487,19 +485,30 @@ def perform_regression(settings):
|
|
487
485
|
if settings['transform'] is None:
|
488
486
|
_ = plot_plates(score_data_df, variable=dependent_variable, grouping='mean', min_max='allq', cmap='viridis', min_count=settings['min_cell_count'], dst = res_folder)
|
489
487
|
|
490
|
-
model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'],
|
488
|
+
model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], nc=settings['negative_control'], pc=settings['positive_control'], controls=settings['controls'], dst=res_folder, cov_type=settings['cov_type'])
|
491
489
|
|
492
490
|
coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
|
493
491
|
coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
|
494
492
|
coef_df = coef_df.merge(n_grna, how='left', on='grna')
|
495
493
|
coef_df = coef_df.merge(n_gene, how='left', on='gene')
|
496
|
-
display(coef_df)
|
497
494
|
|
498
495
|
gene_coef_df = coef_df[coef_df['n_gene'] != None]
|
499
496
|
grna_coef_df = coef_df[coef_df['n_grna'] != None]
|
500
497
|
gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
|
501
498
|
grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
|
502
499
|
|
500
|
+
if settings['controls'] is not None:
|
501
|
+
control_coef_df = grna_coef_df[grna_coef_df['grna'].isin(settings['controls'])]
|
502
|
+
mean_coef = control_coef_df['coefficient'].mean()
|
503
|
+
|
504
|
+
if settings['threshold_method'] in ['var','variance']:
|
505
|
+
coef_mes = control_coef_df['coefficient'].var()
|
506
|
+
elif settings['threshold_method'] in ['std', 'standard_deveation']:
|
507
|
+
coef_mes = control_coef_df['coefficient'].std()
|
508
|
+
else:
|
509
|
+
raise ValueError(f"Unsupported threshold method {settings['threshold_method']}. Supported methods: ['var','variance','std','standard_deveation']")
|
510
|
+
reg_threshold = mean_coef + (settings['threshold_multiplier'] * coef_mes)
|
511
|
+
|
503
512
|
coef_df.to_csv(results_path, index=False)
|
504
513
|
gene_coef_df.to_csv(results_path_gene, index=False)
|
505
514
|
grna_coef_df.to_csv(results_path_grna, index=False)
|
@@ -509,7 +518,10 @@ def perform_regression(settings):
|
|
509
518
|
|
510
519
|
else:
|
511
520
|
significant = coef_df[coef_df['p_value']<= 0.05]
|
512
|
-
|
521
|
+
if settings['controls'] is not None:
|
522
|
+
significant_high = significant[significant['coefficient'] >= reg_threshold]
|
523
|
+
significant_low = significant[significant['coefficient'] <= reg_threshold]
|
524
|
+
significant = pd.concat([significant_high, significant_low])
|
513
525
|
significant.sort_values(by='coefficient', ascending=False, inplace=True)
|
514
526
|
significant = significant[~significant['feature'].str.contains('row|column')]
|
515
527
|
|
@@ -530,22 +542,24 @@ def perform_regression(settings):
|
|
530
542
|
grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
|
531
543
|
|
532
544
|
if settings['toxo']:
|
533
|
-
|
534
545
|
data_path = merged_df
|
535
546
|
data_path_gene = gene_merged_df
|
536
547
|
data_path_grna = grna_merged_df
|
537
548
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
538
549
|
metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
|
539
|
-
|
540
|
-
custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
541
|
-
custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
542
|
-
custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
543
550
|
|
544
|
-
|
545
|
-
|
546
|
-
|
551
|
+
custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=200, figsize=20, threshold=reg_threshold, split_axis_lims=settings['split_axis_lims'])
|
552
|
+
#custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
|
553
|
+
#custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
|
554
|
+
|
555
|
+
#if len(significant) > 2:
|
556
|
+
# metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
|
557
|
+
# go_term_enrichment_by_column(significant, metadata_path)
|
547
558
|
|
548
559
|
print('Significant Genes')
|
560
|
+
grnas = significant['grna'].unique().tolist()
|
561
|
+
genes = significant['gene'].unique().tolist()
|
562
|
+
print(f"Found p<0.05 coedfficients for {len(grnas)} gRNAs and {len(genes)} genes")
|
549
563
|
display(significant)
|
550
564
|
|
551
565
|
output = {'results':coef_df,
|
@@ -586,7 +600,7 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
|
|
586
600
|
if isinstance(filter_value, str):
|
587
601
|
filter_value = [filter_value]
|
588
602
|
|
589
|
-
if isinstance(filter_column, list):
|
603
|
+
if isinstance(filter_column, list):
|
590
604
|
for filter_col in filter_column:
|
591
605
|
for value in filter_value:
|
592
606
|
csv_df = csv_df[csv_df[filter_col] != value]
|
@@ -649,16 +663,12 @@ def check_normality(data, variable_name, verbose=False):
|
|
649
663
|
print(f"Normal distribution: The data for {variable_name} is not normally distributed.")
|
650
664
|
return False
|
651
665
|
|
652
|
-
def clean_controls(df,
|
653
|
-
if
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
df = df[~df['column'].isin([pc])]
|
659
|
-
if other != None:
|
660
|
-
df = df[~df['column'].isin([other])]
|
661
|
-
print(f'Removed data from {nc, pc, other}')
|
666
|
+
def clean_controls(df,values, column):
|
667
|
+
if column in df.columns:
|
668
|
+
if isinstance(values, list):
|
669
|
+
for value in values:
|
670
|
+
df = df[~df[column].isin([value])]
|
671
|
+
print(f'Removed data from {value}')
|
662
672
|
return df
|
663
673
|
|
664
674
|
def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None, regression_type='ols'):
|
@@ -763,7 +773,6 @@ def generate_ml_scores(settings):
|
|
763
773
|
raise ValueError("The 'png_list_df' DataFrame must contain 'prcfo' and 'test' columns.")
|
764
774
|
annotated_df = png_list_df[['prcfo', settings['annotation_column']]].set_index('prcfo')
|
765
775
|
df = annotated_df.merge(df, left_index=True, right_index=True)
|
766
|
-
#display(df)
|
767
776
|
unique_values = df[settings['annotation_column']].dropna().unique()
|
768
777
|
if len(unique_values) == 1:
|
769
778
|
unannotated_rows = df[df[settings['annotation_column']].isna()].index
|