spacr 0.4.15__py3-none-any.whl → 0.4.60__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
spacr/ml.py CHANGED
@@ -195,18 +195,18 @@ def prepare_formula(dependent_variable, random_row_column_effects=False):
     if random_row_column_effects:
         # Random effects for row and column + gene weighted by gene_fraction + grna weighted by fraction
         return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene'
-    return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene + row_name + column_name'
+    return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene + rowID + columnID'
 
 def fit_mixed_model(df, formula, dst):
     from .plot import plot_histogram
 
-    """Fit the mixed model with plate, row_name, and column_name as random effects and return results."""
+    """Fit the mixed model with plate, row_name, and columnID as random effects and return results."""
     # Specify random effects for plate, row, and column
     model = smf.mixedlm(formula,
                         data=df,
-                        groups=df['plate'],
-                        re_formula="1 + row_name + column_name",
-                        vc_formula={"row_name": "0 + row_name", "column_name": "0 + column_name"})
+                        groups=df['plateID'],
+                        re_formula="1 + rowID + columnID",
+                        vc_formula={"rowID": "0 + rowID", "columnID": "0 + columnID"})
 
     mixed_model = model.fit()
 
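
Note: the call above combines a grouping factor (plateID) with row and column variance components. A minimal, self-contained sketch of the same statsmodels pattern on synthetic data, using the renamed plateID/rowID/columnID columns (the re_formula is simplified here to a plate-level random intercept so the toy model stays well-posed; expect convergence warnings on pure-noise data):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 400
df = pd.DataFrame({
    'plateID': rng.choice(['plate1', 'plate2', 'plate3', 'plate4'], n),
    'rowID': rng.choice([f'r{i}' for i in range(1, 9)], n),
    'columnID': rng.choice([f'c{i}' for i in range(1, 13)], n),
    'score': rng.normal(size=n),
})

model = smf.mixedlm(
    'score ~ 1',
    data=df,
    groups=df['plateID'],                   # plates as the grouping factor
    re_formula='1',                         # random intercept per plate
    vc_formula={'rowID': '0 + rowID',       # variance component across rows
                'columnID': '0 + columnID'},  # and across columns
)
print(model.fit().summary())
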
@@ -288,7 +288,7 @@ def check_and_clean_data(df, dependent_variable):
     df = handle_missing_values(df, ['fraction', dependent_variable])
 
     # Step 2: Ensure grna, gene, plate, row, column, and prc are categorical types
-    df = ensure_valid_types(df, ['grna', 'gene', 'plate', 'row_name', 'column_name', 'prc'])
+    df = ensure_valid_types(df, ['grna', 'gene', 'plateID', 'rowID', 'columnID', 'prc'])
 
     # Step 3: Check for multicollinearity in fraction and the dependent variable
     df_cleaned = check_collinearity(df, ['fraction', dependent_variable])
@@ -297,9 +297,9 @@ def check_and_clean_data(df, dependent_variable):
     df_cleaned['gene'] = df['gene']
     df_cleaned['grna'] = df['grna']
     df_cleaned['prc'] = df['prc']
-    df_cleaned['plate'] = df['plate']
-    df_cleaned['row_name'] = df['row_name']
-    df_cleaned['column_name'] = df['column']
+    df_cleaned['plateID'] = df['plateID']
+    df_cleaned['rowID'] = df['rowID']
+    df_cleaned['columnID'] = df['columnID']
 
     # Create a new column 'gene_fraction' that sums the fractions by gene within the same well
     df_cleaned['gene_fraction'] = df_cleaned.groupby(['prc', 'gene'])['fraction'].transform('sum')
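
Note: the gene_fraction assignment above is a grouped transform; every row keeps its position but receives the per-(well, gene) sum of 'fraction'. A toy illustration with invented wells:

import pandas as pd

df = pd.DataFrame({
    'prc': ['p1_r1_c1', 'p1_r1_c1', 'p1_r1_c1', 'p1_r1_c2'],
    'gene': ['geneA', 'geneA', 'geneB', 'geneA'],
    'fraction': [0.2, 0.3, 0.5, 1.0],
})
# Both geneA rows in well p1_r1_c1 receive gene_fraction 0.5
df['gene_fraction'] = df.groupby(['prc', 'gene'])['fraction'].transform('sum')
print(df)
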
@@ -336,10 +336,10 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
     for i, score_data in enumerate(settings['score_data']):
         df = pd.read_csv(score_data)
         df = correct_metadata_column_names(df)
-        df['plate'] = f'plate{i + 1}'
+        df['plateID'] = f'plate{i + 1}'
 
         if 'prc' not in df.columns:
-            df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
+            df['prc'] = df['plateID'] + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
 
         dfs.append(df)
 
@@ -429,120 +429,11 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
         color='teal', alpha=0.3, label='±1 Std. Dev.'
     )
 
-    # Mark the elbow point (inflection) on the plot
-    ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
-
-    # Formatting the plot
-    ax.set_xlabel('Sample Size')
-    ax.set_ylabel('Mean Absolute Difference')
-    ax.set_title('Mean Absolute Difference vs. Sample Size with Standard Deviation')
-    ax.legend().remove()
-
-    # Save the plot if a destination is provided
-    dst = os.path.dirname(settings['count_data'][0])
-    if dst is not None:
-        fig_path = os.path.join(dst, 'results')
-        os.makedirs(fig_path, exist_ok=True)
-        fig_file_path = os.path.join(fig_path, 'cell_min_threshold.pdf')
-        fig.savefig(fig_file_path, format='pdf', dpi=600, bbox_inches='tight')
-        print(f"Saved {fig_file_path}")
-
-    plt.show()
-    return elbow_point['sample_size']
-
-def minimum_cell_simulation_v1(settings, num_repeats=10, sample_size=100, tolerance=0.02, smoothing=10, increment=10):
-    """
-    Plot the mean absolute difference with standard deviation as shaded area vs. sample size.
-    Detect and mark the elbow point (inflection) with smoothing and tolerance control.
-    """
-
-    from spacr.utils import correct_metadata_column_names
-
-    # Load and process data
-    if isinstance(settings['score_data'], str):
-        settings['score_data'] = [settings['score_data']]
-
-    dfs = []
-    for i, score_data in enumerate(settings['score_data']):
-        df = pd.read_csv(score_data)
-        df = correct_metadata_column_names(df)
-        df['plate'] = f'plate{i + 1}'
-
-        if 'prc' not in df.columns:
-            df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
-
-        dfs.append(df)
-
-    df = pd.concat(dfs, axis=0)
-
-    # Compute the number of cells per well and select the top 100 wells by cell count
-    cell_counts = df.groupby('prc').size().reset_index(name='cell_count')
-    top_wells = cell_counts.nlargest(sample_size, 'cell_count')['prc']
-
-    # Filter the data to include only the top 100 wells
-    df = df[df['prc'].isin(top_wells)]
-
-    # Initialize storage for absolute difference data
-    diff_data = []
-
-    # Group by wells and iterate over them
-    for i, (prc, group) in enumerate(df.groupby('prc')):
-        original_mean = group[settings['score_column']].mean() # Original full-well mean
-        max_cells = len(group)
-        sample_sizes = np.arange(2, max_cells + 1, increment) # Sample sizes from 2 to max cells
-
-        # Iterate over sample sizes and compute absolute difference
-        for sample_size in sample_sizes:
-            abs_diffs = []
-
-            # Perform multiple random samples to reduce noise
-            for _ in range(num_repeats):
-                sample = group.sample(n=sample_size, replace=False)
-                sampled_mean = sample[settings['score_column']].mean()
-                abs_diff = abs(sampled_mean - original_mean) # Absolute difference
-                abs_diffs.append(abs_diff)
-
-            # Compute the average absolute difference across all repeats
-            avg_abs_diff = np.mean(abs_diffs)
-
-            # Store the result for plotting
-            diff_data.append((sample_size, avg_abs_diff))
-
-    # Convert absolute difference data to DataFrame for plotting
-    diff_df = pd.DataFrame(diff_data, columns=['sample_size', 'avg_abs_diff'])
-
-    # Group by sample size to calculate mean and standard deviation
-    summary_df = diff_df.groupby('sample_size').agg(
-        mean_abs_diff=('avg_abs_diff', 'mean'),
-        std_abs_diff=('avg_abs_diff', 'std')
-    ).reset_index()
-
-    # Apply smoothing using a rolling window
-    summary_df['smoothed_mean_abs_diff'] = summary_df['mean_abs_diff'].rolling(window=smoothing, min_periods=1).mean()
-
-    # Detect the elbow point (where mean_abs_diff < tolerance)
-    elbow_df = summary_df[summary_df['smoothed_mean_abs_diff'] <= tolerance]
-
-    # Select the first occurrence if it exists; otherwise, use the last point
-    if not elbow_df.empty:
-        elbow_point = elbow_df.iloc[0] # First point where the condition is met
+    if settings['min_cell_count'] is None:
+        # Mark the elbow point (inflection) on the plot
+        ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
     else:
-        elbow_point = summary_df.iloc[-1] # Fallback to the last point
-
-    # Plot the mean absolute difference with standard deviation as shaded area
-    fig, ax = plt.subplots(figsize=(10, 10))
-    ax.plot(
-        summary_df['sample_size'], summary_df['smoothed_mean_abs_diff'], color='teal', label='Smoothed Mean Absolute Difference'
-    )
-    ax.fill_between(
-        summary_df['sample_size'],
-        summary_df['smoothed_mean_abs_diff'] - summary_df['std_abs_diff'],
-        summary_df['smoothed_mean_abs_diff'] + summary_df['std_abs_diff'],
-        color='teal', alpha=0.3, label='±1 Std. Dev.'
-    )
-
-    # Mark the elbow point (inflection) on the plot
-    ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
+        ax.axvline(settings['min_cell_count'], color='black', linestyle='--', label='Elbow Point')
 
     # Formatting the plot
     ax.set_xlabel('Sample Size')
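
Note: the elbow logic that the deleted minimum_cell_simulation_v1 spelled out, and that still feeds elbow_point above, reduces to smoothing the mean-absolute-difference curve with a rolling window and taking the first sample size at which the smoothed curve drops below the tolerance, falling back to the last point. A compact sketch on synthetic data:

import numpy as np
import pandas as pd

# Sampling noise around a well mean shrinks roughly as 1/sqrt(n)
sizes = np.arange(2, 202, 10)
summary_df = pd.DataFrame({'sample_size': sizes,
                           'mean_abs_diff': 1.0 / np.sqrt(sizes)})

smoothing, tolerance = 10, 0.15
summary_df['smoothed'] = (summary_df['mean_abs_diff']
                          .rolling(window=smoothing, min_periods=1).mean())
elbow_df = summary_df[summary_df['smoothed'] <= tolerance]
# First point below tolerance, else the last point as a fallback
elbow_point = elbow_df.iloc[0] if not elbow_df.empty else summary_df.iloc[-1]
print(int(elbow_point['sample_size']))
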
@@ -831,46 +722,40 @@ def save_summary_to_file(model, file_path='summary.csv'):
 def perform_regression(settings):
 
     from .plot import plot_plates, plot_data_from_csv
-    from .utils import merge_regression_res_with_metadata, save_settings, calculate_shortest_distance
+    from .utils import merge_regression_res_with_metadata, save_settings, calculate_shortest_distance, correct_metadata
     from .settings import get_perform_regression_default_settings
     from .toxo import go_term_enrichment_by_column, custom_volcano_plot, plot_gene_phenotypes, plot_gene_heatmaps
     from .sequencing import graph_sequencing_stats
 
     def _perform_regression_read_data(settings):
-
-        if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
-            if len(settings['score_data']) == 1:
-                count_data_df = pd.read_csv(settings['count_data'][0])
-                score_data_df = pd.read_csv(settings['score_data'][0])
-            else:
-                count_data_df = pd.DataFrame()
-                for i, count_data in enumerate(settings['count_data']):
-                    df = pd.read_csv(count_data)
-                    df['plate_name'] = f'plate{i+1}'
-                    if 'column' in df.columns:
-                        df.rename(columns={'column': 'column_name'}, inplace=True)
-                    if 'col' in df.columns:
-                        df.rename(columns={'col': 'column_name'}, inplace=True)
-                    count_data_df = pd.concat([count_data_df, df])
-                print('Count data:', len(count_data_df))
-
-                score_data_df = pd.DataFrame()
-                for i, score_data in enumerate(settings['score_data']):
-                    df = pd.read_csv(score_data)
-                    df['plate_name'] = f'plate{i+1}'
-                    if 'column' in df.columns:
-                        df.rename(columns={'column': 'column_name'}, inplace=True)
-                    if 'col' in df.columns:
-                        df.rename(columns={'col': 'column_name'}, inplace=True)
-
-                    score_data_df = pd.concat([score_data_df, df])
-                display(score_data_df)
-                print('Score data:', len(score_data_df))
-        else:
-            count_data_df = pd.read_csv(settings['count_data'])
-            score_data_df = pd.read_csv(settings['score_data'])
-            print(f"Dependent variable: {len(score_data_df)}")
-            print(f"Independent variable: {len(count_data_df)}")
+
+        if not isinstance(settings['score_data'], list):
+            settings['score_data'] = [settings['score_data']]
+        if not isinstance(settings['count_data'], list):
+            settings['count_data'] = [settings['count_data']]
+
+        score_data_df = pd.DataFrame()
+        for i, score_data in enumerate(settings['score_data']):
+            df = pd.read_csv(score_data)
+            df = correct_metadata(df)
+            if not 'plateID' in df.columns:
+                df['plateID'] = f'plate{i+1}'
+
+            score_data_df = pd.concat([score_data_df, df])
+        print('Score data:', len(score_data_df))
+
+        count_data_df = pd.DataFrame()
+        for i, count_data in enumerate(settings['count_data']):
+            df = pd.read_csv(count_data)
+            df = correct_metadata(df)
+            if not 'plateID' in df.columns:
+                df['plateID'] = f'plate{i+1}'
+
+            count_data_df = pd.concat([count_data_df, df])
+        print('Count data:', len(count_data_df))
+
+        print(f"Dependent variable: {len(score_data_df)}")
+        print(f"Independent variable: {len(count_data_df)}")
 
         if settings['dependent_variable'] not in score_data_df.columns:
             print(f'Columns in DataFrame:')
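
Note: the rewritten reader replaces the old branching with one pattern: coerce scalar-or-list inputs to lists, then read, normalize, and concatenate per file. A generic, standalone sketch of that pattern (read_concat is illustrative, not spacr API):

import pandas as pd

def read_concat(paths, plate_col='plateID'):
    paths = paths if isinstance(paths, list) else [paths]
    frames = []
    for i, path in enumerate(paths):
        df = pd.read_csv(path)
        if plate_col not in df.columns:
            df[plate_col] = f'plate{i + 1}'  # default label per input file
        frames.append(df)
    return pd.concat(frames, ignore_index=True)

# score_df = read_concat(['plate1_scores.csv', 'plate2_scores.csv'])
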
@@ -879,10 +764,6 @@ def perform_regression(settings):
             if not settings['dependent_variable'] == 'pathogen_nucleus_shortest_distance':
                 raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
 
-        if 'prediction_probability_class_1' in score_data_df.columns:
-            if not settings['class_1_threshold'] is None:
-                score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
-
         reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge', None]
         if settings['regression_type'] not in reg_types:
             print(f'Possible regression types: {reg_types}')
@@ -945,34 +826,34 @@ def perform_regression(settings):
         return df
 
     def grna_metricks(df):
-        df[['plate', 'row', 'column']] = df['prc'].str.split('_', expand=True)
+        df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
 
         # --- 2) Compute GRNA-level Well Counts ---
         # For each (grna, plate), count the number of unique prc (wells)
-        grna_well_counts = (df.groupby(['grna', 'plate'])['prc'].nunique().reset_index(name='grna_well_count'))
+        grna_well_counts = (df.groupby(['grna', 'plateID'])['prc'].nunique().reset_index(name='grna_well_count'))
 
         # --- 3) Compute Gene-level Well Counts ---
         # For each (gene, plate), count the number of unique prc
-        gene_well_counts = (df.groupby(['gene', 'plate'])['prc'].nunique().reset_index(name='gene_well_count'))
+        gene_well_counts = (df.groupby(['gene', 'plateID'])['prc'].nunique().reset_index(name='gene_well_count'))
 
         # --- 4) Merge These Counts into a Single DataFrame ---
         # Because each grna is typically associated with one gene, we bring them together.
         # First, create a unique (grna, gene, plate) reference from the original df
-        unique_triplets = df[['grna', 'gene', 'plate']].drop_duplicates()
+        unique_triplets = df[['grna', 'gene', 'plateID']].drop_duplicates()
 
         # Merge the grna_well_count
-        merged_df = pd.merge(unique_triplets, grna_well_counts, on=['grna', 'plate'], how='left')
+        merged_df = pd.merge(unique_triplets, grna_well_counts, on=['grna', 'plateID'], how='left')
 
         # Merge the gene_well_count
-        merged_df = pd.merge(merged_df, gene_well_counts, on=['gene', 'plate'], how='left')
+        merged_df = pd.merge(merged_df, gene_well_counts, on=['gene', 'plateID'], how='left')
 
         # Keep only the columns needed (if you want to keep 'gene', remove the drop below)
-        final_grna_df = merged_df[['grna', 'plate', 'grna_well_count', 'gene_well_count']]
+        final_grna_df = merged_df[['grna', 'plateID', 'grna_well_count', 'gene_well_count']]
 
         # --- 5) Compute gene_count per prc ---
         # For each prc (well), how many distinct genes are there?
         prc_gene_count_df = (df.groupby('prc')['gene'].nunique().reset_index(name='gene_count'))
-        prc_gene_count_df[['plate', 'row', 'column']] = prc_gene_count_df['prc'].str.split('_', expand=True)
+        prc_gene_count_df[['plateID', 'rowID', 'columnID']] = prc_gene_count_df['prc'].str.split('_', expand=True)
 
         return final_grna_df, prc_gene_count_df
 
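
Note: grna_metricks leans on two pandas idioms, splitting the underscore-delimited prc key with str.split(expand=True) and counting distinct wells with groupby(...).nunique(). Both in miniature, with invented wells:

import pandas as pd

df = pd.DataFrame({'prc': ['p1_r1_c1', 'p1_r1_c2', 'p1_r1_c1'],
                   'grna': ['g1', 'g1', 'g2'],
                   'gene': ['A', 'A', 'B']})
df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
grna_well_counts = (df.groupby(['grna', 'plateID'])['prc']
                      .nunique().reset_index(name='grna_well_count'))
print(grna_well_counts)  # g1 spans 2 wells on p1, g2 spans 1
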
@@ -1016,18 +897,18 @@ def perform_regression(settings):
     settings = get_perform_regression_default_settings(settings)
     count_data_df, score_data_df = _perform_regression_read_data(settings)
 
-    if "row_name" in count_data_df.columns:
-        num_parts = len(count_data_df['row_name'].iloc[0].split('_'))
+    if "rowID" in count_data_df.columns:
+        num_parts = len(count_data_df['rowID'].iloc[0].split('_'))
         if num_parts == 2:
-            split = count_data_df['row_name'].str.split('_', expand=True)
-            count_data_df['row_name'] = split[1]
+            split = count_data_df['rowID'].str.split('_', expand=True)
+            count_data_df['rowID'] = split[1]
 
     if "prc" in score_data_df.columns:
         num_parts = len(score_data_df['prc'].iloc[0].split('_'))
         if num_parts == 3:
             split = score_data_df['prc'].str.split('_', expand=True)
-            score_data_df['plate'] = settings['plate']
-            score_data_df['prc'] = score_data_df['plate'] + '_' + split[1] + '_' + split[2]
+            score_data_df['plateID'] = settings['plateID']
+            score_data_df['prc'] = score_data_df['plateID'] + '_' + split[1] + '_' + split[2]
 
     results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
     save_settings(settings, name='regression', show=True)
@@ -1043,38 +924,47 @@ def perform_regression(settings):
     filter_column = settings['filter_column']
 
     score_data_df = clean_controls(score_data_df, settings['filter_value'], settings['filter_column'])
-    print(f"Dependent variable after clean_controls: {len(score_data_df)}")
+
+    if settings['verbose']:
+        print(f"Dependent variable after clean_controls: {len(score_data_df)}")
 
-    if settings['min_cell_count'] is None:
-        settings['min_cell_count'] = minimum_cell_simulation(settings, tolerance=settings['tolerance'])
+    sim_min_count = minimum_cell_simulation(settings, tolerance=settings['tolerance'])
 
-    print(f"Minimum cell count: {settings['min_cell_count']}")
-    print(f"Dependent variable after minimum cell count filter: {len(score_data_df)}")
-    display(score_data_df)
+    if settings['min_cell_count'] is None:
+        settings['min_cell_count'] = sim_min_count
+
+    if settings['verbose']:
+        print(f"Minimum cell count: {settings['min_cell_count']}")
+        print(f"Dependent variable after minimum cell count filter: {len(score_data_df)}")
+        display(score_data_df)
 
     orig_dv = settings['dependent_variable']
 
-    dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
-    print(f"Dependent variable after process_scores: {len(dependent_df)}")
-    display(dependent_df)
+    dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plateID'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
+
+    if settings['verbose']:
+        print(f"Dependent variable after process_scores: {len(dependent_df)}")
+        display(dependent_df)
 
     if settings['fraction_threshold'] is None:
         settings['fraction_threshold'] = graph_sequencing_stats(settings)
 
-    independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=filter_column, filter_value=filter_value)
+    independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plateID'], filter_column=filter_column, filter_value=filter_value)
+
     independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
 
-    print(f"Independent variable after process_reads: {len(independent_df)}")
+    if settings['verbose']:
+        print(f"Independent variable after process_reads: {len(independent_df)}")
 
     merged_df = pd.merge(independent_df, dependent_df, on='prc')
 
-    display(independent_df)
-    display(dependent_df)
-
-    display(merged_df)
+    if settings['verbose']:
+        display(independent_df)
+        display(dependent_df)
+        display(merged_df)
 
 
-    merged_df[['plate', 'row_name', 'column']] = merged_df['prc'].str.split('_', expand=True)
+    merged_df[['plateID', 'rowID', 'columnID']] = merged_df['prc'].str.split('_', expand=True)
 
     try:
         os.makedirs(res_folder, exist_ok=True)
@@ -1085,7 +975,7 @@ def perform_regression(settings):
         cell_settings = {'src':data_path,
                          'graph_name':'cell_count',
                          'data_column':['cell_count'],
-                         'grouping_column':'plate',
+                         'grouping_column':'plateID',
                          'graph_type':'jitter_bar',
                          'theme':'bright',
                          'save':True,
@@ -1114,7 +1004,7 @@ def perform_regression(settings):
         wells_per_gene_settings = {'src':grna_data_path,
                                    'graph_name':'wells_per_gene',
                                    'data_column':['grna_well_count'],
-                                   'grouping_column':'plate',
+                                   'grouping_column':'plateID',
                                    'graph_type':'jitter_bar',
                                    'theme':'bright',
                                    'save':True,
@@ -1133,7 +1023,7 @@ def perform_regression(settings):
         grna_per_well_settings = {'src':grna_well_data_path,
                                   'graph_name':'gene_per_well',
                                   'data_column':['gene_count'],
-                                  'grouping_column':'plate',
+                                  'grouping_column':'plateID',
                                   'graph_type':'jitter_bar',
                                   'theme':'bright',
                                   'save':True,
@@ -1169,7 +1059,9 @@ def perform_regression(settings):
     mean_coef = control_coef_df['coefficient'].mean()
     significant_c = control_coef_df[control_coef_df['p_value']<= 0.05]
    mean_coef_c = significant_c['coefficient'].mean()
-    print(mean_coef, mean_coef_c)
+
+    if settings['verbose']:
+        print(mean_coef, mean_coef_c)
 
     if settings['threshold_method'] in ['var','variance']:
         coef_mes = control_coef_df['coefficient'].var()
@@ -1197,8 +1089,9 @@ def perform_regression(settings):
     significant = significant[~significant['feature'].str.contains('row|column')]
 
     if regression_type in ['ols', 'beta']:
-        print(model.summary())
-        save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
+        if settings['verbose']:
+            print(model.summary())
+            save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
 
     significant.to_csv(hits_path, index=False)
     significant_grna_filtered = significant[significant['n_grna'] > settings['min_n']]
@@ -1225,8 +1118,6 @@ def perform_regression(settings):
     base_dir = os.path.dirname(os.path.abspath(__file__))
     metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
 
-
-
     if settings['volcano'] == 'all':
         print('all')
         gene_list = custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
@@ -1246,7 +1137,7 @@ def perform_regression(settings):
         data_ME49 = pd.read_csv(settings['metadata_files'][0], low_memory=False)
 
         columns = ['sense - Tachyzoites', 'sense - Tissue cysts', 'sense - EES1', 'sense - EES2', 'sense - EES3', 'sense - EES4', 'sense - EES5']
-
+
         print('Plotting gene phenotypes and heatmaps')
         print(gene_list)
 
@@ -1269,6 +1160,8 @@ def perform_regression(settings):
     return output
 
 def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filter_value=None):
+
+    from .utils import correct_metadata
 
     if isinstance(csv_path, pd.DataFrame):
         csv_df = csv_path
@@ -1276,47 +1169,41 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
         # Read the CSV file into a DataFrame
         csv_df = pd.read_csv(csv_path)
 
-    if 'plate_name' in csv_df.columns:
-        csv_df = csv_df.rename(columns={'plate_name': 'plate'})
-    if 'column_name' in csv_df.columns:
-        csv_df = csv_df.rename(columns={'column_name': 'column'})
-    if 'column_name' in csv_df.columns:
-        csv_df = csv_df.rename(columns={'column_name': 'column'})
-    if 'row_name' in csv_df.columns:
-        csv_df = csv_df.rename(columns={'row_name': 'row_name'})
+    csv_df = correct_metadata(csv_df)
+
 
     if 'grna_name' in csv_df.columns:
         csv_df = csv_df.rename(columns={'grna_name': 'grna'})
     if 'plate_row' in csv_df.columns:
-        csv_df[['plate', 'row_name']] = csv_df['plate_row'].str.split('_', expand=True)
+        csv_df[['plateID', 'rowID']] = csv_df['plate_row'].str.split('_', expand=True)
 
-    if not 'plate' in csv_df.columns:
+    if not 'plateID' in csv_df.columns:
         if not plate is None:
-            csv_df['plate'] = plate
+            csv_df['plateID'] = plate
         else:
-            csv_df['plate'] = 'plate1'
+            csv_df['plateID'] = 'plate1'
 
     if 'prcfo' in csv_df.columns:
         #csv_df = csv_df.loc[:, ~csv_df.columns.duplicated()].copy()
-        csv_df[['plate_name', 'row_name', 'column_name', 'field_name', 'object_name']] = csv_df['prcfo'].str.split('_', expand=True)
-        csv_df['prc'] = csv_df['plate_name'].astype(str) + '_' + csv_df['row_name'].astype(str) + '_' + csv_df['column_name'].astype(str)
+        csv_df[['plateID', 'rowID', 'columnID', 'fieldID', 'objectID']] = csv_df['prcfo'].str.split('_', expand=True)
+        csv_df['prc'] = csv_df['plateID'].astype(str) + '_' + csv_df['rowID'].astype(str) + '_' + csv_df['columnID'].astype(str)
 
     if isinstance(filter_column, str):
         filter_column = [filter_column]
 
     if isinstance(filter_value, str):
         filter_value = [filter_value]
-
+
     if isinstance(filter_column, list):
         for filter_col in filter_column:
             for value in filter_value:
                 csv_df = csv_df[csv_df[filter_col] != value]
-
+
     # Ensure the necessary columns are present
-    if not all(col in csv_df.columns for col in ['row_name','column','grna','count']):
-        raise ValueError("The CSV file must contain 'grna', 'count', 'row_name', and 'column' columns.")
+    if not all(col in csv_df.columns for col in ['rowID','columnID','grna','count']):
+        raise ValueError("The CSV file must contain 'grna', 'count', 'rowID', and 'columnID' columns.")
 
     # Create the prc column
-    csv_df['prc'] = csv_df['plate'] + '_' + csv_df['row_name'] + '_' + csv_df['column']
+    csv_df['prc'] = csv_df['plateID'] + '_' + csv_df['rowID'] + '_' + csv_df['columnID']
 
     # Group by prc and calculate the sum of counts
     grouped_df = csv_df.groupby('prc')['count'].sum().reset_index()
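
Note: process_reads aggregates read counts per well keyed on prc, and the fraction_threshold argument implies a per-well read fraction (a gRNA's count over its well total). A hedged sketch of how such a threshold is typically applied; the column names follow the diff, the arithmetic is illustrative:

import pandas as pd

csv_df = pd.DataFrame({'prc': ['p1_r1_c1'] * 3,
                       'grna': ['g1', 'g2', 'g3'],
                       'count': [90, 9, 1]})
well_totals = csv_df.groupby('prc')['count'].transform('sum')
csv_df['fraction'] = csv_df['count'] / well_totals
fraction_threshold = 0.05
filtered = csv_df[csv_df['fraction'] >= fraction_threshold]
print(filtered)  # drops g3, whose fraction is 0.01
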
@@ -1378,44 +1265,34 @@ def clean_controls(df,values, column):
     return df
 
 def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None, regression_type='ols'):
-    from .utils import calculate_shortest_distance
+    from .utils import calculate_shortest_distance, correct_metadata
     df = df.reset_index(drop=True)
-
     if 'prcfo' in df.columns:
         df = df.loc[:, ~df.columns.duplicated()].copy()
-        if not all(col in df.columns for col in ['plate_name', 'row_name', 'column_name']):
-            df[['plate_name', 'row_name', 'column_name', 'field_name', 'object_name']] = df['prcfo'].str.split('_', expand=True)
-        if all(col in df.columns for col in ['plate_name', 'row_name', 'column_name']):
-            df['prc'] = df['plate_name'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
+        if not all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
+            df[['plateID', 'rowID', 'columnID', 'fieldID', 'objectID']] = df['prcfo'].str.split('_', expand=True)
+        if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
+            df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
     else:
-        if 'plate_name' in df.columns:
-            df.drop(columns=['plate'], inplace=True)
-            #df = df.rename(columns={'plate_name': 'plate'})
-
-        if 'plate' in df.columns:
-            df['plate_name'] = df['plate']
-
-        if plate is not None:
-            df['plate_name'] = plate
 
-        if 'row' in df.columns:
-            df = df.rename(columns={'row': 'row_name'})
-
-        if 'col' in df.columns:
-            df = df.rename(columns={'col': 'column_name'})
+
+        df = correct_metadata(df)
+
 
-        if 'column' in df.columns:
-            df = df.rename(columns={'column': 'column_name'})
+        if not plate is None:
+            df['plateID'] = plate
 
     df = df.loc[:, ~df.columns.duplicated()].copy()
-    df['prc'] = df['plate_name'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
+
+    if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
+        df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
+    else:
+        raise ValueError("The DataFrame must contain 'plateID', 'rowID', and 'columnID' columns.")
 
     df = df[['prc', dependent_variable]]
     # Group by prc and calculate the mean and count of the dependent_variable
     grouped = df.groupby('prc')[dependent_variable]
-
-    display(grouped)
-
+
 
     if regression_type != 'poisson':
 
         print(f'Using agg_type: {agg_type}')
@@ -1600,7 +1477,7 @@ def generate_ml_scores(settings):
 
     return [output, plate_heatmap]
 
-def ml_analysis(df, channel_of_interest=3, location_column='column_name', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, reg_alpha=0.1, reg_lambda=1.0, learning_rate=0.00001, n_estimators=1000, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, prune_features=False, cross_validation=False, verbose=False):
+def ml_analysis(df, channel_of_interest=3, location_column='columnID', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, reg_alpha=0.1, reg_lambda=1.0, learning_rate=0.00001, n_estimators=1000, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, prune_features=False, cross_validation=False, verbose=False):
 
     """
     Calculates permutation importance for numerical features in the dataframe,
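
Note: the docstring above refers to permutation importance. A minimal sketch of the technique with scikit-learn, substituting a random forest for the XGBoost default in the signature:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + 0.1 * rng.normal(size=200) > 0).astype(int)  # feature 0 drives y

model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
print(result.importances_mean)  # feature 0 should dominate
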
@@ -1829,8 +1706,8 @@ def ml_analysis(df, channel_of_interest=3, location_column='column_name', positi
     df = _calculate_similarity(df, features, location_column, positive_control, negative_control)
 
     df['prcfo'] = df.index.astype(str)
-    df[['plate', 'row_name', 'column_name', 'field', 'object']] = df['prcfo'].str.split('_', expand=True)
-    df['prc'] = df['plate'] + '_' + df['row_name'] + '_' + df['column_name']
+    df[['plateID', 'rowID', 'columnID', 'fieldID', 'object']] = df['prcfo'].str.split('_', expand=True)
+    df['prc'] = df['plateID'] + '_' + df['rowID'] + '_' + df['columnID']
 
     return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test, metrics_df, features], [permutation_fig, feature_importance_fig]
 
@@ -2004,11 +1881,17 @@ def interperate_vision_model(settings={}):
     # Clean and align columns for merging
     df['object_label'] = df['object_label'].str.replace('o', '')
 
-    if 'row_name' not in scores_df.columns:
-        scores_df['row_name'] = scores_df['row']
+    if 'rowID' not in scores_df.columns:
+        if 'row' in scores_df.columns:
+            scores_df['rowID'] = scores_df['row']
+        if 'row_name' in scores_df.columns:
+            scores_df['rowID'] = scores_df['row_name']
 
-    if 'column_name' not in scores_df.columns:
-        scores_df['column_name'] = scores_df['col']
+    if 'columnID' not in scores_df.columns:
+        if 'col' in scores_df.columns:
+            scores_df['columnID'] = scores_df['col']
+        if 'column' in scores_df.columns:
+            scores_df['columnID'] = scores_df['column']
 
     if 'object_label' not in scores_df.columns:
         scores_df['object_label'] = scores_df['object']
@@ -2020,14 +1903,14 @@ def interperate_vision_model(settings={}):
         scores_df['object_label'] = scores_df['object'].astype(str)
 
     # Ensure all join columns have the same data type in both DataFrames
-    df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
-    scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+    df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']] = df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']].astype(str)
+    scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']] = scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']].astype(str)
 
     # Select only the necessary columns from scores_df for merging
-    scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+    scores_df = scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label', settings['score_column']]]
 
     # Now merge DataFrames
-    merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+    merged_df = pd.merge(df, scores_df, on=['plateID', 'rowID', 'columnID', 'fieldID', 'object_label'], how='inner')
 
     # Separate numerical features and the score column
     X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
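
Note: the merge in this hunk works because every join key is first coerced to a common dtype; with mismatched str/int keys pandas either raises or fails to match. A small demonstration of the coerce-then-merge pattern:

import pandas as pd

left = pd.DataFrame({'plateID': ['p1'], 'rowID': [1], 'value': [0.7]})
right = pd.DataFrame({'plateID': ['p1'], 'rowID': ['1'], 'score': [0.9]})

keys = ['plateID', 'rowID']
left[keys] = left[keys].astype(str)    # align dtypes before joining
right[keys] = right[keys].astype(str)
merged = pd.merge(left, right, on=keys, how='inner')
print(merged)  # one matched row; without the astype calls the keys conflict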