spacr 0.3.50__py3-none-any.whl → 0.3.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/gui_elements.py +1 -1
- spacr/gui_utils.py +4 -116
- spacr/io.py +114 -140
- spacr/measure.py +14 -12
- spacr/ml.py +41 -32
- spacr/plot.py +167 -307
- spacr/sequencing.py +13 -9
- spacr/settings.py +29 -39
- spacr/submodules.py +19 -19
- spacr/timelapse.py +16 -16
- spacr/toxo.py +180 -1
- spacr/utils.py +95 -164
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/METADATA +2 -1
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/RECORD +18 -18
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/LICENSE +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/WHEEL +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.50.dist-info → spacr-0.3.55.dist-info}/top_level.txt +0 -0
spacr/ml.py
CHANGED
@@ -155,10 +155,6 @@ def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
|
|
155
155
|
coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
|
156
156
|
return coef_df[~coef_df['feature'].str.contains('row|column')]
|
157
157
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
158
|
def check_distribution(y):
|
163
159
|
"""Check the type of distribution to recommend a model."""
|
164
160
|
if np.all((y == 0) | (y == 1)):
|
@@ -288,7 +284,7 @@ def check_and_clean_data(df, dependent_variable):
|
|
288
284
|
df = handle_missing_values(df, ['fraction', dependent_variable])
|
289
285
|
|
290
286
|
# Step 2: Ensure grna, gene, plate, row, column, and prc are categorical types
|
291
|
-
df = ensure_valid_types(df, ['grna', 'gene', 'plate', '
|
287
|
+
df = ensure_valid_types(df, ['grna', 'gene', 'plate', 'row_name', 'column', 'prc'])
|
292
288
|
|
293
289
|
# Step 3: Check for multicollinearity in fraction and the dependent variable
|
294
290
|
df_cleaned = check_collinearity(df, ['fraction', dependent_variable])
|
@@ -298,7 +294,7 @@ def check_and_clean_data(df, dependent_variable):
|
|
298
294
|
df_cleaned['grna'] = df['grna']
|
299
295
|
df_cleaned['prc'] = df['prc']
|
300
296
|
df_cleaned['plate'] = df['plate']
|
301
|
-
df_cleaned['
|
297
|
+
df_cleaned['row_name'] = df['row_name']
|
302
298
|
df_cleaned['column'] = df['column']
|
303
299
|
|
304
300
|
# Create a new column 'gene_fraction' that sums the fractions by gene within the same well
|
@@ -337,7 +333,7 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
|
|
337
333
|
df = pd.read_csv(score_data)
|
338
334
|
df = correct_metadata_column_names(df)
|
339
335
|
df['plate'] = f'plate{i + 1}'
|
340
|
-
df['prc'] = df['plate'] + '_' + df['
|
336
|
+
df['prc'] = df['plate'] + '_' + df['row_name'].astype(str) + '_' + df['column'].astype(str)
|
341
337
|
dfs.append(df)
|
342
338
|
|
343
339
|
df = pd.concat(dfs, axis=0)
|
@@ -706,18 +702,16 @@ def perform_regression(settings):
|
|
706
702
|
def _perform_regression_read_data(settings):
|
707
703
|
|
708
704
|
if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
|
709
|
-
settings['plate'] = None
|
710
705
|
if len(settings['score_data']) == 1:
|
711
|
-
|
712
|
-
|
713
|
-
settings['count_data'] = settings['count_data'][0]
|
706
|
+
count_data_df = pd.read_csv(settings['count_data'][0])
|
707
|
+
score_data_df = pd.read_csv(settings['score_data'][0])
|
714
708
|
else:
|
715
709
|
count_data_df = pd.DataFrame()
|
716
710
|
for i, count_data in enumerate(settings['count_data']):
|
717
711
|
df = pd.read_csv(count_data)
|
718
712
|
df['plate_name'] = f'plate{i+1}'
|
719
713
|
if 'column' in df.columns:
|
720
|
-
df['
|
714
|
+
df['column_name'] = df['column']
|
721
715
|
count_data_df = pd.concat([count_data_df, df])
|
722
716
|
print('Count data:', len(count_data_df))
|
723
717
|
|
@@ -726,7 +720,7 @@ def perform_regression(settings):
|
|
726
720
|
df = pd.read_csv(score_data)
|
727
721
|
df['plate_name'] = f'plate{i+1}'
|
728
722
|
if 'column' in df.columns:
|
729
|
-
df['
|
723
|
+
df['column_name'] = df['column']
|
730
724
|
score_data_df = pd.concat([score_data_df, df])
|
731
725
|
print('Score data:', len(score_data_df))
|
732
726
|
else:
|
@@ -806,9 +800,23 @@ def perform_regression(settings):
|
|
806
800
|
return df, n_gene
|
807
801
|
else:
|
808
802
|
return df
|
809
|
-
|
803
|
+
|
810
804
|
settings = get_perform_regression_default_settings(settings)
|
811
805
|
count_data_df, score_data_df = _perform_regression_read_data(settings)
|
806
|
+
|
807
|
+
if "row_name" in count_data_df.columns:
|
808
|
+
num_parts = len(count_data_df['row_name'].iloc[0].split('_'))
|
809
|
+
if num_parts == 2:
|
810
|
+
split = count_data_df['row_name'].str.split('_', expand=True)
|
811
|
+
count_data_df['row_name'] = split[1]
|
812
|
+
|
813
|
+
if "prc" in score_data_df.columns:
|
814
|
+
num_parts = len(score_data_df['prc'].iloc[0].split('_'))
|
815
|
+
if num_parts == 3:
|
816
|
+
split = score_data_df['prc'].str.split('_', expand=True)
|
817
|
+
score_data_df['plate'] = settings['plate']
|
818
|
+
score_data_df['prc'] = score_data_df['plate'] + '_' + split[1] + '_' + split[2]
|
819
|
+
|
812
820
|
results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
|
813
821
|
save_settings(settings, name='regression', show=True)
|
814
822
|
|
@@ -849,7 +857,7 @@ def perform_regression(settings):
|
|
849
857
|
merged_df.to_csv(data_path, index=False)
|
850
858
|
print(f"Saved regression data to {data_path}")
|
851
859
|
|
852
|
-
merged_df[['plate', '
|
860
|
+
merged_df[['plate', 'row_name', 'column']] = merged_df['prc'].str.split('_', expand=True)
|
853
861
|
|
854
862
|
_ = plot_plates(merged_df, variable=orig_dv, grouping='mean', min_max='allq', cmap='viridis', min_count=None, dst=res_folder)
|
855
863
|
|
@@ -857,6 +865,7 @@ def perform_regression(settings):
|
|
857
865
|
|
858
866
|
coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
|
859
867
|
coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
|
868
|
+
|
860
869
|
coef_df = coef_df.merge(n_grna, how='left', on='grna')
|
861
870
|
coef_df = coef_df.merge(n_gene, how='left', on='gene')
|
862
871
|
|
@@ -903,7 +912,6 @@ def perform_regression(settings):
|
|
903
912
|
save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
|
904
913
|
|
905
914
|
significant.to_csv(hits_path, index=False)
|
906
|
-
|
907
915
|
significant_grna_filtered = significant[significant['n_grna'] > settings['min_n']]
|
908
916
|
significant_gene_filtered = significant[significant['n_gene'] > settings['min_n']]
|
909
917
|
significant_filtered = pd.concat([significant_grna_filtered, significant_gene_filtered])
|
@@ -928,8 +936,6 @@ def perform_regression(settings):
|
|
928
936
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
929
937
|
metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
|
930
938
|
|
931
|
-
display(data_path)
|
932
|
-
|
933
939
|
if settings['volcano'] == 'all':
|
934
940
|
print('all')
|
935
941
|
gene_list = custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
|
@@ -982,14 +988,14 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
|
|
982
988
|
csv_df = csv_df.rename(columns={'plate_name': 'plate'})
|
983
989
|
if 'column_name' in csv_df.columns:
|
984
990
|
csv_df = csv_df.rename(columns={'column_name': 'column'})
|
985
|
-
if '
|
986
|
-
csv_df = csv_df.rename(columns={'
|
991
|
+
if 'column_name' in csv_df.columns:
|
992
|
+
csv_df = csv_df.rename(columns={'column_name': 'column'})
|
987
993
|
if 'row_name' in csv_df.columns:
|
988
|
-
csv_df = csv_df.rename(columns={'row_name': '
|
994
|
+
csv_df = csv_df.rename(columns={'row_name': 'row_name'})
|
989
995
|
if 'grna_name' in csv_df.columns:
|
990
996
|
csv_df = csv_df.rename(columns={'grna_name': 'grna'})
|
991
997
|
if 'plate_row' in csv_df.columns:
|
992
|
-
csv_df[['plate', '
|
998
|
+
csv_df[['plate', 'row_name']] = csv_df['plate_row'].str.split('_', expand=True)
|
993
999
|
|
994
1000
|
if not 'plate' in csv_df.columns:
|
995
1001
|
if not plate is None:
|
@@ -1009,11 +1015,11 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
|
|
1009
1015
|
csv_df = csv_df[csv_df[filter_col] != value]
|
1010
1016
|
|
1011
1017
|
# Ensure the necessary columns are present
|
1012
|
-
if not all(col in csv_df.columns for col in ['
|
1013
|
-
raise ValueError("The CSV file must contain 'grna', 'count', '
|
1018
|
+
if not all(col in csv_df.columns for col in ['row_name','column','grna','count']):
|
1019
|
+
raise ValueError("The CSV file must contain 'grna', 'count', 'row_name', and 'column' columns.")
|
1014
1020
|
|
1015
1021
|
# Create the prc column
|
1016
|
-
csv_df['prc'] = csv_df['plate'] + '_' + csv_df['
|
1022
|
+
csv_df['prc'] = csv_df['plate'] + '_' + csv_df['row_name'] + '_' + csv_df['column']
|
1017
1023
|
|
1018
1024
|
# Group by prc and calculate the sum of counts
|
1019
1025
|
grouped_df = csv_df.groupby('prc')['count'].sum().reset_index()
|
@@ -1075,7 +1081,7 @@ def clean_controls(df,values, column):
|
|
1075
1081
|
return df
|
1076
1082
|
|
1077
1083
|
def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None, regression_type='ols'):
|
1078
|
-
|
1084
|
+
|
1079
1085
|
if 'plate_name' in df.columns:
|
1080
1086
|
df.drop(columns=['plate'], inplace=True)
|
1081
1087
|
df = df.rename(columns={'plate_name': 'plate'})
|
@@ -1083,11 +1089,14 @@ def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='m
|
|
1083
1089
|
if plate is not None:
|
1084
1090
|
df['plate'] = plate
|
1085
1091
|
|
1086
|
-
if '
|
1087
|
-
df['
|
1092
|
+
if 'column_name' not in df.columns:
|
1093
|
+
df['column_name'] = df['column']
|
1088
1094
|
|
1089
|
-
df['prc'] = df['plate'].astype(str) + '_' + df['
|
1095
|
+
df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
|
1090
1096
|
|
1097
|
+
display(df)
|
1098
|
+
|
1099
|
+
|
1091
1100
|
df = df[['prc', dependent_variable]]
|
1092
1101
|
|
1093
1102
|
# Group by prc and calculate the mean and count of the dependent_variable
|
@@ -1257,7 +1266,7 @@ def generate_ml_scores(settings):
|
|
1257
1266
|
|
1258
1267
|
return [output, plate_heatmap]
|
1259
1268
|
|
1260
|
-
def ml_analysis(df, channel_of_interest=3, location_column='
|
1269
|
+
def ml_analysis(df, channel_of_interest=3, location_column='column_name', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, n_estimators=100, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, verbose=False):
|
1261
1270
|
|
1262
1271
|
"""
|
1263
1272
|
Calculates permutation importance for numerical features in the dataframe,
|
@@ -1403,8 +1412,8 @@ def ml_analysis(df, channel_of_interest=3, location_column='col', positive_contr
|
|
1403
1412
|
df = _calculate_similarity(df, features, location_column, positive_control, negative_control)
|
1404
1413
|
|
1405
1414
|
df['prcfo'] = df.index.astype(str)
|
1406
|
-
df[['plate', '
|
1407
|
-
df['prc'] = df['plate'] + '_' + df['
|
1415
|
+
df[['plate', 'row_name', 'column_name', 'field', 'object']] = df['prcfo'].str.split('_', expand=True)
|
1416
|
+
df['prc'] = df['plate'] + '_' + df['row_name'] + '_' + df['column_name']
|
1408
1417
|
|
1409
1418
|
return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test, metrics_df], [permutation_fig, feature_importance_fig]
|
1410
1419
|
|