spacr 0.3.50__py3-none-any.whl → 0.3.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/ml.py CHANGED
@@ -155,10 +155,6 @@ def process_model_coefficients(model, regression_type, X, y, nc, pc, controls):
155
155
  coef_df['condition'] = coef_df.apply(lambda row: 'nc' if nc in row['feature'] else 'pc' if pc in row['feature'] else ('control' if row['grna'] in controls else 'other'),axis=1)
156
156
  return coef_df[~coef_df['feature'].str.contains('row|column')]
157
157
 
158
-
159
-
160
-
161
-
162
158
  def check_distribution(y):
163
159
  """Check the type of distribution to recommend a model."""
164
160
  if np.all((y == 0) | (y == 1)):
@@ -288,7 +284,7 @@ def check_and_clean_data(df, dependent_variable):
288
284
  df = handle_missing_values(df, ['fraction', dependent_variable])
289
285
 
290
286
  # Step 2: Ensure grna, gene, plate, row, column, and prc are categorical types
291
- df = ensure_valid_types(df, ['grna', 'gene', 'plate', 'row', 'column', 'prc'])
287
+ df = ensure_valid_types(df, ['grna', 'gene', 'plate', 'row_name', 'column', 'prc'])
292
288
 
293
289
  # Step 3: Check for multicollinearity in fraction and the dependent variable
294
290
  df_cleaned = check_collinearity(df, ['fraction', dependent_variable])
@@ -298,7 +294,7 @@ def check_and_clean_data(df, dependent_variable):
298
294
  df_cleaned['grna'] = df['grna']
299
295
  df_cleaned['prc'] = df['prc']
300
296
  df_cleaned['plate'] = df['plate']
301
- df_cleaned['row'] = df['row']
297
+ df_cleaned['row_name'] = df['row_name']
302
298
  df_cleaned['column'] = df['column']
303
299
 
304
300
  # Create a new column 'gene_fraction' that sums the fractions by gene within the same well
@@ -337,7 +333,7 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
337
333
  df = pd.read_csv(score_data)
338
334
  df = correct_metadata_column_names(df)
339
335
  df['plate'] = f'plate{i + 1}'
340
- df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
336
+ df['prc'] = df['plate'] + '_' + df['row_name'].astype(str) + '_' + df['column'].astype(str)
341
337
  dfs.append(df)
342
338
 
343
339
  df = pd.concat(dfs, axis=0)
@@ -706,18 +702,16 @@ def perform_regression(settings):
706
702
  def _perform_regression_read_data(settings):
707
703
 
708
704
  if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
709
- settings['plate'] = None
710
705
  if len(settings['score_data']) == 1:
711
- settings['score_data'] = settings['score_data'][0]
712
- if len(settings['count_data']) == 1:
713
- settings['count_data'] = settings['count_data'][0]
706
+ count_data_df = pd.read_csv(settings['count_data'][0])
707
+ score_data_df = pd.read_csv(settings['score_data'][0])
714
708
  else:
715
709
  count_data_df = pd.DataFrame()
716
710
  for i, count_data in enumerate(settings['count_data']):
717
711
  df = pd.read_csv(count_data)
718
712
  df['plate_name'] = f'plate{i+1}'
719
713
  if 'column' in df.columns:
720
- df['col'] = df['column']
714
+ df['column_name'] = df['column']
721
715
  count_data_df = pd.concat([count_data_df, df])
722
716
  print('Count data:', len(count_data_df))
723
717
 
@@ -726,7 +720,7 @@ def perform_regression(settings):
726
720
  df = pd.read_csv(score_data)
727
721
  df['plate_name'] = f'plate{i+1}'
728
722
  if 'column' in df.columns:
729
- df['col'] = df['column']
723
+ df['column_name'] = df['column']
730
724
  score_data_df = pd.concat([score_data_df, df])
731
725
  print('Score data:', len(score_data_df))
732
726
  else:
@@ -806,9 +800,23 @@ def perform_regression(settings):
806
800
  return df, n_gene
807
801
  else:
808
802
  return df
809
-
803
+
810
804
  settings = get_perform_regression_default_settings(settings)
811
805
  count_data_df, score_data_df = _perform_regression_read_data(settings)
806
+
807
+ if "row_name" in count_data_df.columns:
808
+ num_parts = len(count_data_df['row_name'].iloc[0].split('_'))
809
+ if num_parts == 2:
810
+ split = count_data_df['row_name'].str.split('_', expand=True)
811
+ count_data_df['row_name'] = split[1]
812
+
813
+ if "prc" in score_data_df.columns:
814
+ num_parts = len(score_data_df['prc'].iloc[0].split('_'))
815
+ if num_parts == 3:
816
+ split = score_data_df['prc'].str.split('_', expand=True)
817
+ score_data_df['plate'] = settings['plate']
818
+ score_data_df['prc'] = score_data_df['plate'] + '_' + split[1] + '_' + split[2]
819
+
812
820
  results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
813
821
  save_settings(settings, name='regression', show=True)
814
822
 
@@ -849,7 +857,7 @@ def perform_regression(settings):
849
857
  merged_df.to_csv(data_path, index=False)
850
858
  print(f"Saved regression data to {data_path}")
851
859
 
852
- merged_df[['plate', 'row', 'column']] = merged_df['prc'].str.split('_', expand=True)
860
+ merged_df[['plate', 'row_name', 'column']] = merged_df['prc'].str.split('_', expand=True)
853
861
 
854
862
  _ = plot_plates(merged_df, variable=orig_dv, grouping='mean', min_max='allq', cmap='viridis', min_count=None, dst=res_folder)
855
863
 
@@ -857,6 +865,7 @@ def perform_regression(settings):
857
865
 
858
866
  coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
859
867
  coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
868
+
860
869
  coef_df = coef_df.merge(n_grna, how='left', on='grna')
861
870
  coef_df = coef_df.merge(n_gene, how='left', on='gene')
862
871
 
@@ -903,7 +912,6 @@ def perform_regression(settings):
903
912
  save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
904
913
 
905
914
  significant.to_csv(hits_path, index=False)
906
-
907
915
  significant_grna_filtered = significant[significant['n_grna'] > settings['min_n']]
908
916
  significant_gene_filtered = significant[significant['n_gene'] > settings['min_n']]
909
917
  significant_filtered = pd.concat([significant_grna_filtered, significant_gene_filtered])
@@ -928,8 +936,6 @@ def perform_regression(settings):
928
936
  base_dir = os.path.dirname(os.path.abspath(__file__))
929
937
  metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
930
938
 
931
- display(data_path)
932
-
933
939
  if settings['volcano'] == 'all':
934
940
  print('all')
935
941
  gene_list = custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
@@ -982,14 +988,14 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
982
988
  csv_df = csv_df.rename(columns={'plate_name': 'plate'})
983
989
  if 'column_name' in csv_df.columns:
984
990
  csv_df = csv_df.rename(columns={'column_name': 'column'})
985
- if 'col' in csv_df.columns:
986
- csv_df = csv_df.rename(columns={'col': 'column'})
991
+ if 'column_name' in csv_df.columns:
992
+ csv_df = csv_df.rename(columns={'column_name': 'column'})
987
993
  if 'row_name' in csv_df.columns:
988
- csv_df = csv_df.rename(columns={'row_name': 'row'})
994
+ csv_df = csv_df.rename(columns={'row_name': 'row_name'})
989
995
  if 'grna_name' in csv_df.columns:
990
996
  csv_df = csv_df.rename(columns={'grna_name': 'grna'})
991
997
  if 'plate_row' in csv_df.columns:
992
- csv_df[['plate', 'row']] = csv_df['plate_row'].str.split('_', expand=True)
998
+ csv_df[['plate', 'row_name']] = csv_df['plate_row'].str.split('_', expand=True)
993
999
 
994
1000
  if not 'plate' in csv_df.columns:
995
1001
  if not plate is None:
@@ -1009,11 +1015,11 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
1009
1015
  csv_df = csv_df[csv_df[filter_col] != value]
1010
1016
 
1011
1017
  # Ensure the necessary columns are present
1012
- if not all(col in csv_df.columns for col in ['row','column','grna','count']):
1013
- raise ValueError("The CSV file must contain 'grna', 'count', 'row', and 'column' columns.")
1018
+ if not all(col in csv_df.columns for col in ['row_name','column','grna','count']):
1019
+ raise ValueError("The CSV file must contain 'grna', 'count', 'row_name', and 'column' columns.")
1014
1020
 
1015
1021
  # Create the prc column
1016
- csv_df['prc'] = csv_df['plate'] + '_' + csv_df['row'] + '_' + csv_df['column']
1022
+ csv_df['prc'] = csv_df['plate'] + '_' + csv_df['row_name'] + '_' + csv_df['column']
1017
1023
 
1018
1024
  # Group by prc and calculate the sum of counts
1019
1025
  grouped_df = csv_df.groupby('prc')['count'].sum().reset_index()
@@ -1075,7 +1081,7 @@ def clean_controls(df,values, column):
1075
1081
  return df
1076
1082
 
1077
1083
  def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None, regression_type='ols'):
1078
-
1084
+
1079
1085
  if 'plate_name' in df.columns:
1080
1086
  df.drop(columns=['plate'], inplace=True)
1081
1087
  df = df.rename(columns={'plate_name': 'plate'})
@@ -1083,11 +1089,14 @@ def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='m
1083
1089
  if plate is not None:
1084
1090
  df['plate'] = plate
1085
1091
 
1086
- if 'col' not in df.columns:
1087
- df['col'] = df['column']
1092
+ if 'column_name' not in df.columns:
1093
+ df['column_name'] = df['column']
1088
1094
 
1089
- df['prc'] = df['plate'].astype(str) + '_' + df['row'].astype(str) + '_' + df['col'].astype(str)
1095
+ df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
1090
1096
 
1097
+ display(df)
1098
+
1099
+
1091
1100
  df = df[['prc', dependent_variable]]
1092
1101
 
1093
1102
  # Group by prc and calculate the mean and count of the dependent_variable
@@ -1257,7 +1266,7 @@ def generate_ml_scores(settings):
1257
1266
 
1258
1267
  return [output, plate_heatmap]
1259
1268
 
1260
- def ml_analysis(df, channel_of_interest=3, location_column='col', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, n_estimators=100, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, verbose=False):
1269
+ def ml_analysis(df, channel_of_interest=3, location_column='column_name', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, n_estimators=100, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, verbose=False):
1261
1270
 
1262
1271
  """
1263
1272
  Calculates permutation importance for numerical features in the dataframe,
@@ -1403,8 +1412,8 @@ def ml_analysis(df, channel_of_interest=3, location_column='col', positive_contr
1403
1412
  df = _calculate_similarity(df, features, location_column, positive_control, negative_control)
1404
1413
 
1405
1414
  df['prcfo'] = df.index.astype(str)
1406
- df[['plate', 'row', 'col', 'field', 'object']] = df['prcfo'].str.split('_', expand=True)
1407
- df['prc'] = df['plate'] + '_' + df['row'] + '_' + df['col']
1415
+ df[['plate', 'row_name', 'column_name', 'field', 'object']] = df['prcfo'].str.split('_', expand=True)
1416
+ df['prc'] = df['plate'] + '_' + df['row_name'] + '_' + df['column_name']
1408
1417
 
1409
1418
  return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test, metrics_df], [permutation_fig, feature_importance_fig]
1410
1419