spacr-0.3.37-py3-none-any.whl → spacr-0.3.41-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +1 -1
- spacr/io.py +20 -13
- spacr/ml.py +33 -24
- spacr/plot.py +427 -37
- spacr/toxo.py +202 -16
- spacr/utils.py +14 -12
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/METADATA +1 -1
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/RECORD +12 -12
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/LICENSE +0 -0
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/WHEEL +0 -0
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/top_level.txt +0 -0
spacr/core.py
CHANGED
@@ -143,7 +143,7 @@ def preprocess_generate_masks(src, settings={}):
|
|
143
143
|
start = time.time()
|
144
144
|
if i+1 <= settings['examples_to_plot']:
|
145
145
|
file_path = os.path.join(merged_src, file)
|
146
|
-
plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10,
|
146
|
+
plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10, percentiles=(1,99), thickness=3, save_pdf=True)
|
147
147
|
stop = time.time()
|
148
148
|
duration = stop-start
|
149
149
|
time_ls.append(duration)
|
spacr/io.py
CHANGED
@@ -1686,11 +1686,16 @@ def preprocess_img_data(settings):
|
|
1686
1686
|
print(f'Found {extension_counts[most_common_extension]} {most_common_extension} files')
|
1687
1687
|
else:
|
1688
1688
|
print(f'Could not find any {valid_ext} files in {src} only found {extension_counts[0]}')
|
1689
|
-
|
1689
|
+
|
1690
|
+
|
1691
|
+
|
1692
|
+
|
1693
|
+
|
1694
|
+
if os.path.exists(os.path.join(src,'stack')):
|
1690
1695
|
print('Found existing stack folder.')
|
1691
|
-
if os.path.exists(src
|
1696
|
+
if os.path.exists(os.path.join(src,'channel_stack')):
|
1692
1697
|
print('Found existing channel_stack folder.')
|
1693
|
-
if os.path.exists(src
|
1698
|
+
if os.path.exists(os.path.join(src,'norm_channel_stack')):
|
1694
1699
|
print('Found existing norm_channel_stack folder. Skipping preprocessing')
|
1695
1700
|
return settings, src
|
1696
1701
|
|
@@ -1713,12 +1718,13 @@ def preprocess_img_data(settings):
|
|
1713
1718
|
|
1714
1719
|
src = _run_test_mode(settings['src'], regex, timelapse, test_images, random_test)
|
1715
1720
|
settings['src'] = src
|
1716
|
-
|
1721
|
+
|
1722
|
+
stack_path = os.path.join(src, 'stack')
|
1717
1723
|
if img_format == None:
|
1718
|
-
if not os.path.exists(
|
1724
|
+
if not os.path.exists(stack_path):
|
1719
1725
|
_merge_channels(src, plot=False)
|
1720
|
-
|
1721
|
-
if not os.path.exists(
|
1726
|
+
|
1727
|
+
if not os.path.exists(stack_path):
|
1722
1728
|
try:
|
1723
1729
|
if not img_format == None:
|
1724
1730
|
if timelapse:
|
@@ -1727,7 +1733,7 @@ def preprocess_img_data(settings):
|
|
1727
1733
|
_rename_and_organize_image_files(src, regex, batch_size, pick_slice, skip_mode, metadata_type, img_format)
|
1728
1734
|
|
1729
1735
|
#Make sure no batches will be of only one image
|
1730
|
-
all_imgs = len(
|
1736
|
+
all_imgs = len(stack_path)
|
1731
1737
|
full_batches = all_imgs // batch_size
|
1732
1738
|
last_batch_size = all_imgs % batch_size
|
1733
1739
|
|
@@ -1738,26 +1744,27 @@ def preprocess_img_data(settings):
|
|
1738
1744
|
raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
|
1739
1745
|
# If the last batch is of size 1, merge it with the second last batch
|
1740
1746
|
elif full_batches > 0:
|
1747
|
+
print(f"all images: {all_imgs}, full batch: {full_batches}, last batch: {last_batch_size}")
|
1741
1748
|
raise ValueError("Last batch of size 1 detected. Adjust the batch size.")
|
1742
1749
|
|
1743
1750
|
_merge_channels(src, plot=False)
|
1744
1751
|
|
1745
1752
|
if timelapse:
|
1746
|
-
_create_movies_from_npy_per_channel(
|
1753
|
+
_create_movies_from_npy_per_channel(stack_path, fps=2)
|
1747
1754
|
|
1748
1755
|
if plot:
|
1749
1756
|
print(f'plotting {nr} images from {src}/stack')
|
1750
|
-
plot_arrays(
|
1757
|
+
plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
|
1751
1758
|
|
1752
1759
|
if all_to_mip:
|
1753
|
-
_mip_all(
|
1760
|
+
_mip_all(stack_path)
|
1754
1761
|
if plot:
|
1755
1762
|
print(f'plotting {nr} images from {src}/stack')
|
1756
|
-
plot_arrays(
|
1763
|
+
plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
|
1757
1764
|
except Exception as e:
|
1758
1765
|
print(f"Error: {e}")
|
1759
1766
|
|
1760
|
-
concatenate_and_normalize(src=
|
1767
|
+
concatenate_and_normalize(src=stack_path,
|
1761
1768
|
channels=mask_channels,
|
1762
1769
|
save_dtype=np.float32,
|
1763
1770
|
settings=settings)
|
spacr/ml.py
CHANGED
@@ -134,7 +134,7 @@ def scale_variables(X, y):
|
|
134
134
|
|
135
135
|
return X_scaled, y_scaled
|
136
136
|
|
137
|
-
def process_model_coefficients(model, regression_type, X, y,
|
137
|
+
def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
|
138
138
|
"""Return DataFrame of model coefficients and p-values."""
|
139
139
|
if regression_type in ['ols', 'gls', 'wls', 'rlm', 'glm', 'mixed', 'quantile', 'logit', 'probit', 'poisson']:
|
140
140
|
coefs = model.params
|
@@ -169,8 +169,8 @@ def process_model_coefficients(model, regression_type, X, y, highlight):
|
|
169
169
|
coef_df['p_value'] = np.nan # Placeholder since sklearn doesn't provide p-values
|
170
170
|
|
171
171
|
coef_df['-log10(p_value)'] = -np.log10(coef_df['p_value'])
|
172
|
-
coef_df['
|
173
|
-
|
172
|
+
coef_df['grna'] = coef_df['feature'].str.extract(r'\[(.*?)\]')[0]
|
173
|
+
coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
|
174
174
|
return coef_df[~coef_df['feature'].str.contains('row|column')]
|
175
175
|
|
176
176
|
def prepare_formula(dependent_variable, random_row_column_effects=False):
|
@@ -284,15 +284,13 @@ def check_and_clean_data(df, dependent_variable):
|
|
284
284
|
df_cleaned['row'] = df['row']
|
285
285
|
df_cleaned['column'] = df['column']
|
286
286
|
|
287
|
-
#display(df_cleaned)
|
288
|
-
|
289
287
|
# Create a new column 'gene_fraction' that sums the fractions by gene within the same well
|
290
288
|
df_cleaned['gene_fraction'] = df_cleaned.groupby(['prc', 'gene'])['fraction'].transform('sum')
|
291
289
|
|
292
290
|
print("Data is ready for model fitting.")
|
293
291
|
return df_cleaned
|
294
292
|
|
295
|
-
def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False,
|
293
|
+
def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False, nc='233460', pc='220950', controls=[''], dst=None, cov_type=None, plot=False):
|
296
294
|
from .plot import volcano_plot, plot_histogram
|
297
295
|
|
298
296
|
# Generate the volcano filename
|
@@ -312,9 +310,7 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
|
|
312
310
|
if regression_type is None:
|
313
311
|
regression_type = 'ols' if is_normal else 'glm'
|
314
312
|
|
315
|
-
#display('before check_and_clean_data:',df)
|
316
313
|
df = check_and_clean_data(df, dependent_variable)
|
317
|
-
#display('after check_and_clean_data:',df)
|
318
314
|
|
319
315
|
# Handle mixed effects if row/column effect is treated as random
|
320
316
|
if random_row_column_effects:
|
@@ -340,10 +336,10 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
|
|
340
336
|
model = regression_model(X, y, regression_type=regression_type, groups=groups, alpha=alpha, cov_type=cov_type)
|
341
337
|
|
342
338
|
# Process the model coefficients
|
343
|
-
coef_df = process_model_coefficients(model, regression_type, X, y,
|
344
|
-
|
345
|
-
|
346
|
-
|
339
|
+
coef_df = process_model_coefficients(model, regression_type, X, y, nc, pc, controls)
|
340
|
+
|
341
|
+
if plot:
|
342
|
+
volcano_plot(coef_df, volcano_path)
|
347
343
|
|
348
344
|
return model, coef_df
|
349
345
|
|
@@ -487,19 +483,28 @@ def perform_regression(settings):
|
|
487
483
|
if settings['transform'] is None:
|
488
484
|
_ = plot_plates(score_data_df, variable=dependent_variable, grouping='mean', min_max='allq', cmap='viridis', min_count=settings['min_cell_count'], dst = res_folder)
|
489
485
|
|
490
|
-
model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'],
|
486
|
+
model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], nc=settings['negative_control'], pc=settings['positive_control'], controls=settings['controls'], dst=res_folder, cov_type=settings['cov_type'])
|
491
487
|
|
492
488
|
coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
|
493
489
|
coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
|
494
490
|
coef_df = coef_df.merge(n_grna, how='left', on='grna')
|
495
491
|
coef_df = coef_df.merge(n_gene, how='left', on='gene')
|
496
|
-
display(coef_df)
|
497
492
|
|
498
493
|
gene_coef_df = coef_df[coef_df['n_gene'] != None]
|
499
494
|
grna_coef_df = coef_df[coef_df['n_grna'] != None]
|
500
495
|
gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
|
501
496
|
grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
|
502
497
|
|
498
|
+
if settings['controls'] is not None:
|
499
|
+
control_coef_df = grna_coef_df[grna_coef_df['grna'].isin(settings['controls'])]
|
500
|
+
mean_coef = control_coef_df['coefficient'].mean()
|
501
|
+
variance_coef = control_coef_df['coefficient'].var()
|
502
|
+
std_coef = control_coef_df['coefficient'].std()
|
503
|
+
reg_threshold = mean_coef + (3 * std_coef)
|
504
|
+
|
505
|
+
print('coef_df')
|
506
|
+
display(coef_df)
|
507
|
+
|
503
508
|
coef_df.to_csv(results_path, index=False)
|
504
509
|
gene_coef_df.to_csv(results_path_gene, index=False)
|
505
510
|
grna_coef_df.to_csv(results_path_grna, index=False)
|
@@ -509,7 +514,10 @@ def perform_regression(settings):
|
|
509
514
|
|
510
515
|
else:
|
511
516
|
significant = coef_df[coef_df['p_value']<= 0.05]
|
512
|
-
|
517
|
+
if settings['controls'] is not None:
|
518
|
+
significant_high = significant[significant['coefficient'] >= reg_threshold]
|
519
|
+
significant_low = significant[significant['coefficient'] <= reg_threshold]
|
520
|
+
significant = pd.concat([significant_high, significant_low])
|
513
521
|
significant.sort_values(by='coefficient', ascending=False, inplace=True)
|
514
522
|
significant = significant[~significant['feature'].str.contains('row|column')]
|
515
523
|
|
@@ -530,22 +538,24 @@ def perform_regression(settings):
|
|
530
538
|
grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
|
531
539
|
|
532
540
|
if settings['toxo']:
|
533
|
-
|
534
541
|
data_path = merged_df
|
535
542
|
data_path_gene = gene_merged_df
|
536
543
|
data_path_grna = grna_merged_df
|
537
544
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
538
545
|
metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
|
539
|
-
|
540
|
-
custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
541
|
-
custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
542
|
-
custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
543
546
|
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
+
custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=200, figsize=20, threshold=reg_threshold, split_axis_lims=settings['split_axis_lims'])
|
548
|
+
#custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
|
549
|
+
#custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
|
550
|
+
|
551
|
+
#if len(significant) > 2:
|
552
|
+
# metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
|
553
|
+
# go_term_enrichment_by_column(significant, metadata_path)
|
547
554
|
|
548
555
|
print('Significant Genes')
|
556
|
+
grnas = significant['grna'].unique().tolist()
|
557
|
+
genes = significant['gene'].unique().tolist()
|
558
|
+
print(f"Found p<0.05 coedfficients for {len(grnas)} gRNAs and {len(genes)} genes")
|
549
559
|
display(significant)
|
550
560
|
|
551
561
|
output = {'results':coef_df,
|
@@ -763,7 +773,6 @@ def generate_ml_scores(settings):
|
|
763
773
|
raise ValueError("The 'png_list_df' DataFrame must contain 'prcfo' and 'test' columns.")
|
764
774
|
annotated_df = png_list_df[['prcfo', settings['annotation_column']]].set_index('prcfo')
|
765
775
|
df = annotated_df.merge(df, left_index=True, right_index=True)
|
766
|
-
#display(df)
|
767
776
|
unique_values = df[settings['annotation_column']].dropna().unique()
|
768
777
|
if len(unique_values) == 1:
|
769
778
|
unannotated_rows = df[df[settings['annotation_column']].isna()].index
|