PyPI - spacr - Versions diffs - 0.3.37__py3-none-any.whl → 0.3.41__py3-none-any.whl - Mend

spacr 0.3.37__py3-none-any.whl → 0.3.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

spacr/core.py +1 -1
spacr/io.py +20 -13
spacr/ml.py +33 -24
spacr/plot.py +427 -37
spacr/toxo.py +202 -16
spacr/utils.py +14 -12
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/METADATA +1 -1
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/RECORD +12 -12
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/LICENSE +0 -0
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/WHEEL +0 -0
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/entry_points.txt +0 -0
{spacr-0.3.37.dist-info → spacr-0.3.41.dist-info}/top_level.txt +0 -0

spacr/core.py CHANGED Viewed

@@ -143,7 +143,7 @@ def preprocess_generate_masks(src, settings={}):
                                 start = time.time()
                                 if i+1 <= settings['examples_to_plot']:
                                     file_path = os.path.join(merged_src, file)
-                                    plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10, normalize=True, thickness=3, save_pdf=True)
+                                    plot_image_mask_overlay(file_path, settings['channels'], settings['cell_channel'], settings['nucleus_channel'], settings['pathogen_channel'], figuresize=10, percentiles=(1,99), thickness=3, save_pdf=True)
                                     stop = time.time()
                                     duration = stop-start
                                     time_ls.append(duration)

spacr/io.py CHANGED Viewed

@@ -1686,11 +1686,16 @@ def preprocess_img_data(settings):
         print(f'Found {extension_counts[most_common_extension]} {most_common_extension} files')
     else:
         print(f'Could not find any {valid_ext} files in {src} only found {extension_counts[0]}')
-        if os.path.exists(src+'/stack'):
+        if os.path.exists(os.path.join(src,'stack')):
             print('Found existing stack folder.')
-        if os.path.exists(src+'/channel_stack'):
+        if os.path.exists(os.path.join(src,'channel_stack')):
             print('Found existing channel_stack folder.')
-        if os.path.exists(src+'/norm_channel_stack'):
+        if os.path.exists(os.path.join(src,'norm_channel_stack')):
             print('Found existing norm_channel_stack folder. Skipping preprocessing')
             return settings, src
@@ -1713,12 +1718,13 @@ def preprocess_img_data(settings):
         src = _run_test_mode(settings['src'], regex, timelapse, test_images, random_test)
         settings['src'] = src
+    stack_path = os.path.join(src, 'stack')
     if img_format == None:
-        if not os.path.exists(src+'/stack'):
+        if not os.path.exists(stack_path):
             _merge_channels(src, plot=False)
-    if not os.path.exists(src+'/stack'):
+    if not os.path.exists(stack_path):
         try:
             if not img_format == None:
                 if timelapse:
@@ -1727,7 +1733,7 @@ def preprocess_img_data(settings):
                     _rename_and_organize_image_files(src, regex, batch_size, pick_slice, skip_mode, metadata_type, img_format)
                     #Make sure no batches will be of only one image
-                    all_imgs = len(src+'/stack')
+                    all_imgs = len(stack_path)
                     full_batches = all_imgs // batch_size
                     last_batch_size = all_imgs % batch_size
@@ -1738,26 +1744,27 @@ def preprocess_img_data(settings):
                             raise ValueError("Only one batch of size 1 detected. Adjust the batch size.")
                         # If the last batch is of size 1, merge it with the second last batch
                         elif full_batches > 0:
+                            print(f"all images: {all_imgs},  full batch: {full_batches}, last batch: {last_batch_size}")
                             raise ValueError("Last batch of size 1 detected. Adjust the batch size.")
                 _merge_channels(src, plot=False)
                 if timelapse:
-                    _create_movies_from_npy_per_channel(src+'/stack', fps=2)
+                    _create_movies_from_npy_per_channel(stack_path, fps=2)
                 if plot:
                     print(f'plotting {nr} images from {src}/stack')
-                    plot_arrays(src+'/stack', figuresize, cmap, nr=nr, normalize=normalize)
+                    plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
                 if all_to_mip:
-                    _mip_all(src+'/stack')
+                    _mip_all(stack_path)
                     if plot:
                         print(f'plotting {nr} images from {src}/stack')
-                        plot_arrays(src+'/stack', figuresize, cmap, nr=nr, normalize=normalize)
+                        plot_arrays(stack_path, figuresize, cmap, nr=nr, normalize=normalize)
         except Exception as e:
             print(f"Error: {e}")
-    concatenate_and_normalize(src=src+'/stack',
+    concatenate_and_normalize(src=stack_path,
                               channels=mask_channels,
                               save_dtype=np.float32,
                               settings=settings)

spacr/ml.py CHANGED Viewed

@@ -134,7 +134,7 @@ def scale_variables(X, y):
     return X_scaled, y_scaled
-def process_model_coefficients(model, regression_type, X, y, highlight):
+def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
     """Return DataFrame of model coefficients and p-values."""
     if regression_type in ['ols', 'gls', 'wls', 'rlm', 'glm', 'mixed', 'quantile', 'logit', 'probit', 'poisson']:
         coefs = model.params
@@ -169,8 +169,8 @@ def process_model_coefficients(model, regression_type, X, y, highlight):
         coef_df['p_value'] = np.nan  # Placeholder since sklearn doesn't provide p-values
     coef_df['-log10(p_value)'] = -np.log10(coef_df['p_value'])
-    coef_df['highlight'] = coef_df['feature'].apply(lambda x: highlight in x)
+    coef_df['grna'] = coef_df['feature'].str.extract(r'\[(.*?)\]')[0]
+    coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
     return coef_df[~coef_df['feature'].str.contains('row|column')]
 def prepare_formula(dependent_variable, random_row_column_effects=False):
@@ -284,15 +284,13 @@ def check_and_clean_data(df, dependent_variable):
     df_cleaned['row'] = df['row']
     df_cleaned['column'] = df['column']
-    #display(df_cleaned)
     # Create a new column 'gene_fraction' that sums the fractions by gene within the same well
     df_cleaned['gene_fraction'] = df_cleaned.groupby(['prc', 'gene'])['fraction'].transform('sum')
     print("Data is ready for model fitting.")
     return df_cleaned
-def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False, highlight='220950', dst=None, cov_type=None):
+def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, random_row_column_effects=False, nc='233460', pc='220950', controls=[''], dst=None, cov_type=None, plot=False):
     from .plot import volcano_plot, plot_histogram
     # Generate the volcano filename
@@ -312,9 +310,7 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
     if regression_type is None:
         regression_type = 'ols' if is_normal else 'glm'
-    #display('before check_and_clean_data:',df)
     df = check_and_clean_data(df, dependent_variable)
-    #display('after check_and_clean_data:',df)
     # Handle mixed effects if row/column effect is treated as random
     if random_row_column_effects:
@@ -340,10 +336,10 @@ def regression(df, csv_path, dependent_variable='predictions', regression_type=N
         model = regression_model(X, y, regression_type=regression_type, groups=groups, alpha=alpha, cov_type=cov_type)
         # Process the model coefficients
-        coef_df = process_model_coefficients(model, regression_type, X, y, highlight)
-    # Plot the volcano plot
-    volcano_plot(coef_df, volcano_path)
+        coef_df = process_model_coefficients(model, regression_type, X, y, nc, pc, controls)
+    if plot:
+        volcano_plot(coef_df, volcano_path)
     return model, coef_df
@@ -487,19 +483,28 @@ def perform_regression(settings):
     if settings['transform'] is None:
         _ = plot_plates(score_data_df, variable=dependent_variable, grouping='mean', min_max='allq', cmap='viridis', min_count=settings['min_cell_count'], dst = res_folder)
-    model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], highlight=settings['highlight'], dst=res_folder, cov_type=settings['cov_type'])
+    model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], nc=settings['negative_control'], pc=settings['positive_control'], controls=settings['controls'], dst=res_folder, cov_type=settings['cov_type'])
     coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
     coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
     coef_df = coef_df.merge(n_grna, how='left', on='grna')
     coef_df = coef_df.merge(n_gene, how='left', on='gene')
-    display(coef_df)
     gene_coef_df = coef_df[coef_df['n_gene'] != None]
     grna_coef_df = coef_df[coef_df['n_grna'] != None]
     gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
     grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
+    if settings['controls'] is not None:
+        control_coef_df = grna_coef_df[grna_coef_df['grna'].isin(settings['controls'])]
+        mean_coef = control_coef_df['coefficient'].mean()
+        variance_coef = control_coef_df['coefficient'].var()
+        std_coef = control_coef_df['coefficient'].std()
+        reg_threshold = mean_coef + (3 * std_coef)
+    print('coef_df')
+    display(coef_df)
     coef_df.to_csv(results_path, index=False)
     gene_coef_df.to_csv(results_path_gene, index=False)
     grna_coef_df.to_csv(results_path_grna, index=False)
@@ -509,7 +514,10 @@ def perform_regression(settings):
     else:
         significant = coef_df[coef_df['p_value']<= 0.05]
-        #significant = significant[significant['coefficient'] > 0.1]
+        if settings['controls'] is not None:
+            significant_high = significant[significant['coefficient'] >= reg_threshold]
+            significant_low = significant[significant['coefficient'] <= reg_threshold]
+            significant = pd.concat([significant_high, significant_low])
         significant.sort_values(by='coefficient', ascending=False, inplace=True)
         significant = significant[~significant['feature'].str.contains('row|column')]
@@ -530,22 +538,24 @@ def perform_regression(settings):
         grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
     if settings['toxo']:
         data_path = merged_df
         data_path_gene = gene_merged_df
         data_path_grna = grna_merged_df
         base_dir = os.path.dirname(os.path.abspath(__file__))
         metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
-        custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
-        custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
-        custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
-        if len(significant) > 2:
-            metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
-            go_term_enrichment_by_column(significant, metadata_path)
+        custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=200, figsize=20, threshold=reg_threshold, split_axis_lims=settings['split_axis_lims'])
+        #custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
+        #custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', point_size=50, figsize=20, threshold=reg_threshold)
+        #if len(significant) > 2:
+        #    metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
+        #    go_term_enrichment_by_column(significant, metadata_path)
     print('Significant Genes')
+    grnas = significant['grna'].unique().tolist()
+    genes = significant['gene'].unique().tolist()
+    print(f"Found p<0.05 coedfficients for {len(grnas)} gRNAs and {len(genes)} genes")
     display(significant)
     output = {'results':coef_df,
@@ -763,7 +773,6 @@ def generate_ml_scores(settings):
             raise ValueError("The 'png_list_df' DataFrame must contain 'prcfo' and 'test' columns.")
         annotated_df = png_list_df[['prcfo', settings['annotation_column']]].set_index('prcfo')
         df = annotated_df.merge(df, left_index=True, right_index=True)
-        #display(df)
         unique_values = df[settings['annotation_column']].dropna().unique()
         if len(unique_values) == 1:
             unannotated_rows = df[df[settings['annotation_column']].isna()].index

spacr 0.3.37__py3-none-any.whl → 0.3.41__py3-none-any.whl

spacr 0.3.37__py3-none-any.whl → 0.3.41__py3-none-any.whl