spacr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +2 -2
- spacr/core.py +52 -10
- spacr/deep_spacr.py +2 -3
- spacr/gui.py +0 -1
- spacr/gui_core.py +247 -41
- spacr/gui_elements.py +133 -2
- spacr/gui_utils.py +22 -17
- spacr/io.py +624 -149
- spacr/ml.py +141 -258
- spacr/plot.py +76 -34
- spacr/resources/MEDIAR/__pycache__/SetupDict.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/__pycache__/evaluate.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/__pycache__/generate_mapping.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/__pycache__/main.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/Baseline/__pycache__/Predictor.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/Baseline/__pycache__/Trainer.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/Baseline/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/Baseline/__pycache__/utils.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/MEDIAR/__pycache__/EnsemblePredictor.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Predictor.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/MEDIAR/__pycache__/Trainer.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/MEDIAR/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/MEDIAR/__pycache__/utils.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/__pycache__/BasePredictor.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/__pycache__/BaseTrainer.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/core/__pycache__/utils.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/__pycache__/measures.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/__pycache__/utils.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/datasetter.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/transforms.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/__pycache__/utils.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/CellAware.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/LoadImage.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/NormalizeImage.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/data_utils/custom/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/models/__pycache__/MEDIARFormer.cpython-39.pyc +0 -0
- spacr/resources/MEDIAR/train_tools/models/__pycache__/__init__.cpython-39.pyc +0 -0
- spacr/sequencing.py +73 -38
- spacr/settings.py +161 -135
- spacr/submodules.py +618 -215
- spacr/timelapse.py +197 -29
- spacr/toxo.py +23 -23
- spacr/utils.py +186 -128
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/METADATA +5 -2
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/RECORD +53 -24
- spacr/stats.py +0 -221
- /spacr/{cellpose.py → spacr_cellpose.py} +0 -0
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/LICENSE +0 -0
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/WHEEL +0 -0
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/entry_points.txt +0 -0
- {spacr-0.4.15.dist-info → spacr-0.5.0.dist-info}/top_level.txt +0 -0
spacr/ml.py
CHANGED
@@ -195,18 +195,18 @@ def prepare_formula(dependent_variable, random_row_column_effects=False):
|
|
195
195
|
if random_row_column_effects:
|
196
196
|
# Random effects for row and column + gene weighted by gene_fraction + grna weighted by fraction
|
197
197
|
return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene'
|
198
|
-
return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene +
|
198
|
+
return f'{dependent_variable} ~ fraction:grna + gene_fraction:gene + rowID + columnID'
|
199
199
|
|
200
200
|
def fit_mixed_model(df, formula, dst):
|
201
201
|
from .plot import plot_histogram
|
202
202
|
|
203
|
-
"""Fit the mixed model with plate, row_name, and
|
203
|
+
"""Fit the mixed model with plate, row_name, and columnID as random effects and return results."""
|
204
204
|
# Specify random effects for plate, row, and column
|
205
205
|
model = smf.mixedlm(formula,
|
206
206
|
data=df,
|
207
|
-
groups=df['
|
208
|
-
re_formula="1 +
|
209
|
-
vc_formula={"
|
207
|
+
groups=df['plateID'],
|
208
|
+
re_formula="1 + rowID + columnID",
|
209
|
+
vc_formula={"rowID": "0 + rowID", "columnID": "0 + columnID"})
|
210
210
|
|
211
211
|
mixed_model = model.fit()
|
212
212
|
|
@@ -288,7 +288,7 @@ def check_and_clean_data(df, dependent_variable):
|
|
288
288
|
df = handle_missing_values(df, ['fraction', dependent_variable])
|
289
289
|
|
290
290
|
# Step 2: Ensure grna, gene, plate, row, column, and prc are categorical types
|
291
|
-
df = ensure_valid_types(df, ['grna', 'gene', '
|
291
|
+
df = ensure_valid_types(df, ['grna', 'gene', 'plateID', 'rowID', 'columnID', 'prc'])
|
292
292
|
|
293
293
|
# Step 3: Check for multicollinearity in fraction and the dependent variable
|
294
294
|
df_cleaned = check_collinearity(df, ['fraction', dependent_variable])
|
@@ -297,9 +297,9 @@ def check_and_clean_data(df, dependent_variable):
|
|
297
297
|
df_cleaned['gene'] = df['gene']
|
298
298
|
df_cleaned['grna'] = df['grna']
|
299
299
|
df_cleaned['prc'] = df['prc']
|
300
|
-
df_cleaned['
|
301
|
-
df_cleaned['
|
302
|
-
df_cleaned['
|
300
|
+
df_cleaned['plateID'] = df['plateID']
|
301
|
+
df_cleaned['rowID'] = df['rowID']
|
302
|
+
df_cleaned['columnID'] = df['columnID']
|
303
303
|
|
304
304
|
# Create a new column 'gene_fraction' that sums the fractions by gene within the same well
|
305
305
|
df_cleaned['gene_fraction'] = df_cleaned.groupby(['prc', 'gene'])['fraction'].transform('sum')
|
@@ -336,10 +336,10 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
|
|
336
336
|
for i, score_data in enumerate(settings['score_data']):
|
337
337
|
df = pd.read_csv(score_data)
|
338
338
|
df = correct_metadata_column_names(df)
|
339
|
-
df['
|
339
|
+
df['plateID'] = f'plate{i + 1}'
|
340
340
|
|
341
341
|
if 'prc' not in df.columns:
|
342
|
-
df['prc'] = df['
|
342
|
+
df['prc'] = df['plateID'] + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
|
343
343
|
|
344
344
|
dfs.append(df)
|
345
345
|
|
@@ -429,120 +429,11 @@ def minimum_cell_simulation(settings, num_repeats=10, sample_size=100, tolerance
|
|
429
429
|
color='teal', alpha=0.3, label='±1 Std. Dev.'
|
430
430
|
)
|
431
431
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
# Formatting the plot
|
436
|
-
ax.set_xlabel('Sample Size')
|
437
|
-
ax.set_ylabel('Mean Absolute Difference')
|
438
|
-
ax.set_title('Mean Absolute Difference vs. Sample Size with Standard Deviation')
|
439
|
-
ax.legend().remove()
|
440
|
-
|
441
|
-
# Save the plot if a destination is provided
|
442
|
-
dst = os.path.dirname(settings['count_data'][0])
|
443
|
-
if dst is not None:
|
444
|
-
fig_path = os.path.join(dst, 'results')
|
445
|
-
os.makedirs(fig_path, exist_ok=True)
|
446
|
-
fig_file_path = os.path.join(fig_path, 'cell_min_threshold.pdf')
|
447
|
-
fig.savefig(fig_file_path, format='pdf', dpi=600, bbox_inches='tight')
|
448
|
-
print(f"Saved {fig_file_path}")
|
449
|
-
|
450
|
-
plt.show()
|
451
|
-
return elbow_point['sample_size']
|
452
|
-
|
453
|
-
def minimum_cell_simulation_v1(settings, num_repeats=10, sample_size=100, tolerance=0.02, smoothing=10, increment=10):
|
454
|
-
"""
|
455
|
-
Plot the mean absolute difference with standard deviation as shaded area vs. sample size.
|
456
|
-
Detect and mark the elbow point (inflection) with smoothing and tolerance control.
|
457
|
-
"""
|
458
|
-
|
459
|
-
from spacr.utils import correct_metadata_column_names
|
460
|
-
|
461
|
-
# Load and process data
|
462
|
-
if isinstance(settings['score_data'], str):
|
463
|
-
settings['score_data'] = [settings['score_data']]
|
464
|
-
|
465
|
-
dfs = []
|
466
|
-
for i, score_data in enumerate(settings['score_data']):
|
467
|
-
df = pd.read_csv(score_data)
|
468
|
-
df = correct_metadata_column_names(df)
|
469
|
-
df['plate'] = f'plate{i + 1}'
|
470
|
-
|
471
|
-
if 'prc' not in df.columns:
|
472
|
-
df['prc'] = df['plate'] + '_' + df['row'].astype(str) + '_' + df['column'].astype(str)
|
473
|
-
|
474
|
-
dfs.append(df)
|
475
|
-
|
476
|
-
df = pd.concat(dfs, axis=0)
|
477
|
-
|
478
|
-
# Compute the number of cells per well and select the top 100 wells by cell count
|
479
|
-
cell_counts = df.groupby('prc').size().reset_index(name='cell_count')
|
480
|
-
top_wells = cell_counts.nlargest(sample_size, 'cell_count')['prc']
|
481
|
-
|
482
|
-
# Filter the data to include only the top 100 wells
|
483
|
-
df = df[df['prc'].isin(top_wells)]
|
484
|
-
|
485
|
-
# Initialize storage for absolute difference data
|
486
|
-
diff_data = []
|
487
|
-
|
488
|
-
# Group by wells and iterate over them
|
489
|
-
for i, (prc, group) in enumerate(df.groupby('prc')):
|
490
|
-
original_mean = group[settings['score_column']].mean() # Original full-well mean
|
491
|
-
max_cells = len(group)
|
492
|
-
sample_sizes = np.arange(2, max_cells + 1, increment) # Sample sizes from 2 to max cells
|
493
|
-
|
494
|
-
# Iterate over sample sizes and compute absolute difference
|
495
|
-
for sample_size in sample_sizes:
|
496
|
-
abs_diffs = []
|
497
|
-
|
498
|
-
# Perform multiple random samples to reduce noise
|
499
|
-
for _ in range(num_repeats):
|
500
|
-
sample = group.sample(n=sample_size, replace=False)
|
501
|
-
sampled_mean = sample[settings['score_column']].mean()
|
502
|
-
abs_diff = abs(sampled_mean - original_mean) # Absolute difference
|
503
|
-
abs_diffs.append(abs_diff)
|
504
|
-
|
505
|
-
# Compute the average absolute difference across all repeats
|
506
|
-
avg_abs_diff = np.mean(abs_diffs)
|
507
|
-
|
508
|
-
# Store the result for plotting
|
509
|
-
diff_data.append((sample_size, avg_abs_diff))
|
510
|
-
|
511
|
-
# Convert absolute difference data to DataFrame for plotting
|
512
|
-
diff_df = pd.DataFrame(diff_data, columns=['sample_size', 'avg_abs_diff'])
|
513
|
-
|
514
|
-
# Group by sample size to calculate mean and standard deviation
|
515
|
-
summary_df = diff_df.groupby('sample_size').agg(
|
516
|
-
mean_abs_diff=('avg_abs_diff', 'mean'),
|
517
|
-
std_abs_diff=('avg_abs_diff', 'std')
|
518
|
-
).reset_index()
|
519
|
-
|
520
|
-
# Apply smoothing using a rolling window
|
521
|
-
summary_df['smoothed_mean_abs_diff'] = summary_df['mean_abs_diff'].rolling(window=smoothing, min_periods=1).mean()
|
522
|
-
|
523
|
-
# Detect the elbow point (where mean_abs_diff < tolerance)
|
524
|
-
elbow_df = summary_df[summary_df['smoothed_mean_abs_diff'] <= tolerance]
|
525
|
-
|
526
|
-
# Select the first occurrence if it exists; otherwise, use the last point
|
527
|
-
if not elbow_df.empty:
|
528
|
-
elbow_point = elbow_df.iloc[0] # First point where the condition is met
|
432
|
+
if settings['min_cell_count'] is None:
|
433
|
+
# Mark the elbow point (inflection) on the plot
|
434
|
+
ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
|
529
435
|
else:
|
530
|
-
|
531
|
-
|
532
|
-
# Plot the mean absolute difference with standard deviation as shaded area
|
533
|
-
fig, ax = plt.subplots(figsize=(10, 10))
|
534
|
-
ax.plot(
|
535
|
-
summary_df['sample_size'], summary_df['smoothed_mean_abs_diff'], color='teal', label='Smoothed Mean Absolute Difference'
|
536
|
-
)
|
537
|
-
ax.fill_between(
|
538
|
-
summary_df['sample_size'],
|
539
|
-
summary_df['smoothed_mean_abs_diff'] - summary_df['std_abs_diff'],
|
540
|
-
summary_df['smoothed_mean_abs_diff'] + summary_df['std_abs_diff'],
|
541
|
-
color='teal', alpha=0.3, label='±1 Std. Dev.'
|
542
|
-
)
|
543
|
-
|
544
|
-
# Mark the elbow point (inflection) on the plot
|
545
|
-
ax.axvline(elbow_point['sample_size'], color='black', linestyle='--', label='Elbow Point')
|
436
|
+
ax.axvline(settings['min_cell_count'], color='black', linestyle='--', label='Elbow Point')
|
546
437
|
|
547
438
|
# Formatting the plot
|
548
439
|
ax.set_xlabel('Sample Size')
|
@@ -831,46 +722,40 @@ def save_summary_to_file(model, file_path='summary.csv'):
|
|
831
722
|
def perform_regression(settings):
|
832
723
|
|
833
724
|
from .plot import plot_plates, plot_data_from_csv
|
834
|
-
from .utils import merge_regression_res_with_metadata, save_settings, calculate_shortest_distance
|
725
|
+
from .utils import merge_regression_res_with_metadata, save_settings, calculate_shortest_distance, correct_metadata
|
835
726
|
from .settings import get_perform_regression_default_settings
|
836
727
|
from .toxo import go_term_enrichment_by_column, custom_volcano_plot, plot_gene_phenotypes, plot_gene_heatmaps
|
837
728
|
from .sequencing import graph_sequencing_stats
|
838
729
|
|
839
730
|
def _perform_regression_read_data(settings):
|
840
|
-
|
841
|
-
if isinstance(settings['score_data'], list)
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
print('Score data:', len(score_data_df))
|
869
|
-
else:
|
870
|
-
count_data_df = pd.read_csv(settings['count_data'])
|
871
|
-
score_data_df = pd.read_csv(settings['score_data'])
|
872
|
-
print(f"Dependent variable: {len(score_data_df)}")
|
873
|
-
print(f"Independent variable: {len(count_data_df)}")
|
731
|
+
|
732
|
+
if not isinstance(settings['score_data'], list):
|
733
|
+
settings['score_data'] = [settings['score_data']]
|
734
|
+
if not isinstance(settings['count_data'], list):
|
735
|
+
settings['count_data'] = [settings['count_data']]
|
736
|
+
|
737
|
+
score_data_df = pd.DataFrame()
|
738
|
+
for i, score_data in enumerate(settings['score_data']):
|
739
|
+
df = pd.read_csv(score_data)
|
740
|
+
df = correct_metadata(df)
|
741
|
+
if not 'plateID' in df.columns:
|
742
|
+
df['plateID'] = f'plate{i+1}'
|
743
|
+
|
744
|
+
score_data_df = pd.concat([score_data_df, df])
|
745
|
+
print('Score data:', len(score_data_df))
|
746
|
+
|
747
|
+
count_data_df = pd.DataFrame()
|
748
|
+
for i, count_data in enumerate(settings['count_data']):
|
749
|
+
df = pd.read_csv(count_data)
|
750
|
+
df = correct_metadata(df)
|
751
|
+
if not 'plateID' in df.columns:
|
752
|
+
df['plateID'] = f'plate{i+1}'
|
753
|
+
|
754
|
+
count_data_df = pd.concat([count_data_df, df])
|
755
|
+
print('Count data:', len(count_data_df))
|
756
|
+
|
757
|
+
print(f"Dependent variable: {len(score_data_df)}")
|
758
|
+
print(f"Independent variable: {len(count_data_df)}")
|
874
759
|
|
875
760
|
if settings['dependent_variable'] not in score_data_df.columns:
|
876
761
|
print(f'Columns in DataFrame:')
|
@@ -879,10 +764,6 @@ def perform_regression(settings):
|
|
879
764
|
if not settings['dependent_variable'] == 'pathogen_nucleus_shortest_distance':
|
880
765
|
raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
|
881
766
|
|
882
|
-
if 'prediction_probability_class_1' in score_data_df.columns:
|
883
|
-
if not settings['class_1_threshold'] is None:
|
884
|
-
score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
|
885
|
-
|
886
767
|
reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge', None]
|
887
768
|
if settings['regression_type'] not in reg_types:
|
888
769
|
print(f'Possible regression types: {reg_types}')
|
@@ -945,34 +826,34 @@ def perform_regression(settings):
|
|
945
826
|
return df
|
946
827
|
|
947
828
|
def grna_metricks(df):
|
948
|
-
df[['
|
829
|
+
df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
|
949
830
|
|
950
831
|
# --- 2) Compute GRNA-level Well Counts ---
|
951
832
|
# For each (grna, plate), count the number of unique prc (wells)
|
952
|
-
grna_well_counts = (df.groupby(['grna', '
|
833
|
+
grna_well_counts = (df.groupby(['grna', 'plateID'])['prc'].nunique().reset_index(name='grna_well_count'))
|
953
834
|
|
954
835
|
# --- 3) Compute Gene-level Well Counts ---
|
955
836
|
# For each (gene, plate), count the number of unique prc
|
956
|
-
gene_well_counts = (df.groupby(['gene', '
|
837
|
+
gene_well_counts = (df.groupby(['gene', 'plateID'])['prc'].nunique().reset_index(name='gene_well_count'))
|
957
838
|
|
958
839
|
# --- 4) Merge These Counts into a Single DataFrame ---
|
959
840
|
# Because each grna is typically associated with one gene, we bring them together.
|
960
841
|
# First, create a unique (grna, gene, plate) reference from the original df
|
961
|
-
unique_triplets = df[['grna', 'gene', '
|
842
|
+
unique_triplets = df[['grna', 'gene', 'plateID']].drop_duplicates()
|
962
843
|
|
963
844
|
# Merge the grna_well_count
|
964
|
-
merged_df = pd.merge(unique_triplets, grna_well_counts, on=['grna', '
|
845
|
+
merged_df = pd.merge(unique_triplets, grna_well_counts, on=['grna', 'plateID'], how='left')
|
965
846
|
|
966
847
|
# Merge the gene_well_count
|
967
|
-
merged_df = pd.merge(merged_df, gene_well_counts, on=['gene', '
|
848
|
+
merged_df = pd.merge(merged_df, gene_well_counts, on=['gene', 'plateID'], how='left')
|
968
849
|
|
969
850
|
# Keep only the columns needed (if you want to keep 'gene', remove the drop below)
|
970
|
-
final_grna_df = merged_df[['grna', '
|
851
|
+
final_grna_df = merged_df[['grna', 'plateID', 'grna_well_count', 'gene_well_count']]
|
971
852
|
|
972
853
|
# --- 5) Compute gene_count per prc ---
|
973
854
|
# For each prc (well), how many distinct genes are there?
|
974
855
|
prc_gene_count_df = (df.groupby('prc')['gene'].nunique().reset_index(name='gene_count'))
|
975
|
-
prc_gene_count_df[['
|
856
|
+
prc_gene_count_df[['plateID', 'rowID', 'columnID']] = prc_gene_count_df['prc'].str.split('_', expand=True)
|
976
857
|
|
977
858
|
return final_grna_df, prc_gene_count_df
|
978
859
|
|
@@ -1016,18 +897,18 @@ def perform_regression(settings):
|
|
1016
897
|
settings = get_perform_regression_default_settings(settings)
|
1017
898
|
count_data_df, score_data_df = _perform_regression_read_data(settings)
|
1018
899
|
|
1019
|
-
if "
|
1020
|
-
num_parts = len(count_data_df['
|
900
|
+
if "rowID" in count_data_df.columns:
|
901
|
+
num_parts = len(count_data_df['rowID'].iloc[0].split('_'))
|
1021
902
|
if num_parts == 2:
|
1022
|
-
split = count_data_df['
|
1023
|
-
count_data_df['
|
903
|
+
split = count_data_df['rowID'].str.split('_', expand=True)
|
904
|
+
count_data_df['rowID'] = split[1]
|
1024
905
|
|
1025
906
|
if "prc" in score_data_df.columns:
|
1026
907
|
num_parts = len(score_data_df['prc'].iloc[0].split('_'))
|
1027
908
|
if num_parts == 3:
|
1028
909
|
split = score_data_df['prc'].str.split('_', expand=True)
|
1029
|
-
score_data_df['
|
1030
|
-
score_data_df['prc'] = score_data_df['
|
910
|
+
score_data_df['plateID'] = settings['plateID']
|
911
|
+
score_data_df['prc'] = score_data_df['plateID'] + '_' + split[1] + '_' + split[2]
|
1031
912
|
|
1032
913
|
results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
|
1033
914
|
save_settings(settings, name='regression', show=True)
|
@@ -1043,38 +924,47 @@ def perform_regression(settings):
|
|
1043
924
|
filter_column = settings['filter_column']
|
1044
925
|
|
1045
926
|
score_data_df = clean_controls(score_data_df, settings['filter_value'], settings['filter_column'])
|
1046
|
-
|
927
|
+
|
928
|
+
if settings['verbose']:
|
929
|
+
print(f"Dependent variable after clean_controls: {len(score_data_df)}")
|
1047
930
|
|
1048
|
-
|
1049
|
-
settings['min_cell_count'] = minimum_cell_simulation(settings, tolerance=settings['tolerance'])
|
931
|
+
sim_min_count = minimum_cell_simulation(settings, tolerance=settings['tolerance'])
|
1050
932
|
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
933
|
+
if settings['min_cell_count'] is None:
|
934
|
+
settings['min_cell_count'] = sim_min_count
|
935
|
+
|
936
|
+
if settings['verbose']:
|
937
|
+
print(f"Minimum cell count: {settings['min_cell_count']}")
|
938
|
+
print(f"Dependent variable after minimum cell count filter: {len(score_data_df)}")
|
939
|
+
display(score_data_df)
|
1054
940
|
|
1055
941
|
orig_dv = settings['dependent_variable']
|
1056
942
|
|
1057
|
-
dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['
|
1058
|
-
|
1059
|
-
|
943
|
+
dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plateID'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
|
944
|
+
|
945
|
+
if settings['verbose']:
|
946
|
+
print(f"Dependent variable after process_scores: {len(dependent_df)}")
|
947
|
+
display(dependent_df)
|
1060
948
|
|
1061
949
|
if settings['fraction_threshold'] is None:
|
1062
950
|
settings['fraction_threshold'] = graph_sequencing_stats(settings)
|
1063
951
|
|
1064
|
-
independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['
|
952
|
+
independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plateID'], filter_column=filter_column, filter_value=filter_value)
|
953
|
+
|
1065
954
|
independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
|
1066
955
|
|
1067
|
-
|
956
|
+
if settings['verbose']:
|
957
|
+
print(f"Independent variable after process_reads: {len(independent_df)}")
|
1068
958
|
|
1069
959
|
merged_df = pd.merge(independent_df, dependent_df, on='prc')
|
1070
960
|
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
961
|
+
if settings['verbose']:
|
962
|
+
display(independent_df)
|
963
|
+
display(dependent_df)
|
964
|
+
display(merged_df)
|
1075
965
|
|
1076
966
|
|
1077
|
-
merged_df[['
|
967
|
+
merged_df[['plateID', 'rowID', 'columnID']] = merged_df['prc'].str.split('_', expand=True)
|
1078
968
|
|
1079
969
|
try:
|
1080
970
|
os.makedirs(res_folder, exist_ok=True)
|
@@ -1085,7 +975,7 @@ def perform_regression(settings):
|
|
1085
975
|
cell_settings = {'src':data_path,
|
1086
976
|
'graph_name':'cell_count',
|
1087
977
|
'data_column':['cell_count'],
|
1088
|
-
'grouping_column':'
|
978
|
+
'grouping_column':'plateID',
|
1089
979
|
'graph_type':'jitter_bar',
|
1090
980
|
'theme':'bright',
|
1091
981
|
'save':True,
|
@@ -1114,7 +1004,7 @@ def perform_regression(settings):
|
|
1114
1004
|
wells_per_gene_settings = {'src':grna_data_path,
|
1115
1005
|
'graph_name':'wells_per_gene',
|
1116
1006
|
'data_column':['grna_well_count'],
|
1117
|
-
'grouping_column':'
|
1007
|
+
'grouping_column':'plateID',
|
1118
1008
|
'graph_type':'jitter_bar',
|
1119
1009
|
'theme':'bright',
|
1120
1010
|
'save':True,
|
@@ -1133,7 +1023,7 @@ def perform_regression(settings):
|
|
1133
1023
|
grna_per_well_settings = {'src':grna_well_data_path,
|
1134
1024
|
'graph_name':'gene_per_well',
|
1135
1025
|
'data_column':['gene_count'],
|
1136
|
-
'grouping_column':'
|
1026
|
+
'grouping_column':'plateID',
|
1137
1027
|
'graph_type':'jitter_bar',
|
1138
1028
|
'theme':'bright',
|
1139
1029
|
'save':True,
|
@@ -1169,7 +1059,9 @@ def perform_regression(settings):
|
|
1169
1059
|
mean_coef = control_coef_df['coefficient'].mean()
|
1170
1060
|
significant_c = control_coef_df[control_coef_df['p_value']<= 0.05]
|
1171
1061
|
mean_coef_c = significant_c['coefficient'].mean()
|
1172
|
-
|
1062
|
+
|
1063
|
+
if settings['verbose']:
|
1064
|
+
print(mean_coef, mean_coef_c)
|
1173
1065
|
|
1174
1066
|
if settings['threshold_method'] in ['var','variance']:
|
1175
1067
|
coef_mes = control_coef_df['coefficient'].var()
|
@@ -1197,8 +1089,9 @@ def perform_regression(settings):
|
|
1197
1089
|
significant = significant[~significant['feature'].str.contains('row|column')]
|
1198
1090
|
|
1199
1091
|
if regression_type in ['ols', 'beta']:
|
1200
|
-
|
1201
|
-
|
1092
|
+
if settings['verbose']:
|
1093
|
+
print(model.summary())
|
1094
|
+
save_summary_to_file(model, file_path=f'{res_folder}/mode_summary.csv')
|
1202
1095
|
|
1203
1096
|
significant.to_csv(hits_path, index=False)
|
1204
1097
|
significant_grna_filtered = significant[significant['n_grna'] > settings['min_n']]
|
@@ -1225,8 +1118,6 @@ def perform_regression(settings):
|
|
1225
1118
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
1226
1119
|
metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
|
1227
1120
|
|
1228
|
-
|
1229
|
-
|
1230
1121
|
if settings['volcano'] == 'all':
|
1231
1122
|
print('all')
|
1232
1123
|
gene_list = custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', point_size=600, figsize=20, threshold=reg_threshold, save_path=volcano_path, x_lim=settings['x_lim'],y_lims=settings['y_lims'])
|
@@ -1246,7 +1137,7 @@ def perform_regression(settings):
|
|
1246
1137
|
data_ME49 = pd.read_csv(settings['metadata_files'][0], low_memory=False)
|
1247
1138
|
|
1248
1139
|
columns = ['sense - Tachyzoites', 'sense - Tissue cysts', 'sense - EES1', 'sense - EES2', 'sense - EES3', 'sense - EES4', 'sense - EES5']
|
1249
|
-
|
1140
|
+
|
1250
1141
|
print('Plotting gene phenotypes and heatmaps')
|
1251
1142
|
print(gene_list)
|
1252
1143
|
|
@@ -1269,6 +1160,8 @@ def perform_regression(settings):
|
|
1269
1160
|
return output
|
1270
1161
|
|
1271
1162
|
def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filter_value=None):
|
1163
|
+
|
1164
|
+
from .utils import correct_metadata
|
1272
1165
|
|
1273
1166
|
if isinstance(csv_path, pd.DataFrame):
|
1274
1167
|
csv_df = csv_path
|
@@ -1276,47 +1169,41 @@ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filte
|
|
1276
1169
|
# Read the CSV file into a DataFrame
|
1277
1170
|
csv_df = pd.read_csv(csv_path)
|
1278
1171
|
|
1279
|
-
|
1280
|
-
|
1281
|
-
if 'column_name' in csv_df.columns:
|
1282
|
-
csv_df = csv_df.rename(columns={'column_name': 'column'})
|
1283
|
-
if 'column_name' in csv_df.columns:
|
1284
|
-
csv_df = csv_df.rename(columns={'column_name': 'column'})
|
1285
|
-
if 'row_name' in csv_df.columns:
|
1286
|
-
csv_df = csv_df.rename(columns={'row_name': 'row_name'})
|
1172
|
+
csv_df = correct_metadata(csv_df)
|
1173
|
+
|
1287
1174
|
if 'grna_name' in csv_df.columns:
|
1288
1175
|
csv_df = csv_df.rename(columns={'grna_name': 'grna'})
|
1289
1176
|
if 'plate_row' in csv_df.columns:
|
1290
|
-
csv_df[['
|
1177
|
+
csv_df[['plateID', 'rowID']] = csv_df['plate_row'].str.split('_', expand=True)
|
1291
1178
|
|
1292
|
-
if not '
|
1179
|
+
if not 'plateID' in csv_df.columns:
|
1293
1180
|
if not plate is None:
|
1294
|
-
csv_df['
|
1181
|
+
csv_df['plateID'] = plate
|
1295
1182
|
else:
|
1296
|
-
csv_df['
|
1183
|
+
csv_df['plateID'] = 'plate1'
|
1297
1184
|
|
1298
1185
|
if 'prcfo' in csv_df.columns:
|
1299
1186
|
#csv_df = csv_df.loc[:, ~csv_df.columns.duplicated()].copy()
|
1300
|
-
csv_df[['
|
1301
|
-
csv_df['prc'] = csv_df['
|
1187
|
+
csv_df[['plateID', 'rowID', 'columnID', 'fieldID', 'objectID']] = csv_df['prcfo'].str.split('_', expand=True)
|
1188
|
+
csv_df['prc'] = csv_df['plateID'].astype(str) + '_' + csv_df['rowID'].astype(str) + '_' + csv_df['columnID'].astype(str)
|
1302
1189
|
|
1303
1190
|
if isinstance(filter_column, str):
|
1304
1191
|
filter_column = [filter_column]
|
1305
1192
|
|
1306
1193
|
if isinstance(filter_value, str):
|
1307
1194
|
filter_value = [filter_value]
|
1308
|
-
|
1195
|
+
|
1309
1196
|
if isinstance(filter_column, list):
|
1310
1197
|
for filter_col in filter_column:
|
1311
1198
|
for value in filter_value:
|
1312
1199
|
csv_df = csv_df[csv_df[filter_col] != value]
|
1313
|
-
|
1200
|
+
|
1314
1201
|
# Ensure the necessary columns are present
|
1315
|
-
if not all(col in csv_df.columns for col in ['
|
1316
|
-
raise ValueError("The CSV file must contain 'grna', 'count', '
|
1202
|
+
if not all(col in csv_df.columns for col in ['rowID','columnID','grna','count']):
|
1203
|
+
raise ValueError("The CSV file must contain 'grna', 'count', 'rowID', and 'columnID' columns.")
|
1317
1204
|
|
1318
1205
|
# Create the prc column
|
1319
|
-
csv_df['prc'] = csv_df['
|
1206
|
+
csv_df['prc'] = csv_df['plateID'] + '_' + csv_df['rowID'] + '_' + csv_df['columnID']
|
1320
1207
|
|
1321
1208
|
# Group by prc and calculate the sum of counts
|
1322
1209
|
grouped_df = csv_df.groupby('prc')['count'].sum().reset_index()
|
@@ -1378,44 +1265,34 @@ def clean_controls(df,values, column):
|
|
1378
1265
|
return df
|
1379
1266
|
|
1380
1267
|
def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None, regression_type='ols'):
|
1381
|
-
from .utils import calculate_shortest_distance
|
1268
|
+
from .utils import calculate_shortest_distance, correct_metadata
|
1382
1269
|
df = df.reset_index(drop=True)
|
1383
|
-
|
1384
1270
|
if 'prcfo' in df.columns:
|
1385
1271
|
df = df.loc[:, ~df.columns.duplicated()].copy()
|
1386
|
-
if not all(col in df.columns for col in ['
|
1387
|
-
df[['
|
1388
|
-
if all(col in df.columns for col in ['
|
1389
|
-
df['prc'] = df['
|
1272
|
+
if not all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
|
1273
|
+
df[['plateID', 'rowID', 'columnID', 'fieldID', 'objectID']] = df['prcfo'].str.split('_', expand=True)
|
1274
|
+
if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
|
1275
|
+
df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
|
1390
1276
|
else:
|
1391
|
-
if 'plate_name' in df.columns:
|
1392
|
-
df.drop(columns=['plate'], inplace=True)
|
1393
|
-
#df = df.rename(columns={'plate_name': 'plate'})
|
1394
|
-
|
1395
|
-
if 'plate' in df.columns:
|
1396
|
-
df['plate_name'] = df['plate']
|
1397
|
-
|
1398
|
-
if plate is not None:
|
1399
|
-
df['plate_name'] = plate
|
1400
1277
|
|
1401
|
-
|
1402
|
-
|
1403
|
-
|
1404
|
-
if 'col' in df.columns:
|
1405
|
-
df = df.rename(columns={'col': 'column_name'})
|
1278
|
+
|
1279
|
+
df = correct_metadata(df)
|
1280
|
+
|
1406
1281
|
|
1407
|
-
if
|
1408
|
-
df =
|
1282
|
+
if not plate is None:
|
1283
|
+
df['plateID'] = plate
|
1409
1284
|
|
1410
1285
|
df = df.loc[:, ~df.columns.duplicated()].copy()
|
1411
|
-
|
1286
|
+
|
1287
|
+
if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
|
1288
|
+
df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
|
1289
|
+
else:
|
1290
|
+
raise ValueError("The DataFrame must contain 'plateID', 'rowID', and 'columnID' columns.")
|
1412
1291
|
|
1413
1292
|
df = df[['prc', dependent_variable]]
|
1414
1293
|
# Group by prc and calculate the mean and count of the dependent_variable
|
1415
1294
|
grouped = df.groupby('prc')[dependent_variable]
|
1416
|
-
|
1417
|
-
display(grouped)
|
1418
|
-
|
1295
|
+
|
1419
1296
|
if regression_type != 'poisson':
|
1420
1297
|
|
1421
1298
|
print(f'Using agg_type: {agg_type}')
|
@@ -1600,7 +1477,7 @@ def generate_ml_scores(settings):
|
|
1600
1477
|
|
1601
1478
|
return [output, plate_heatmap]
|
1602
1479
|
|
1603
|
-
def ml_analysis(df, channel_of_interest=3, location_column='
|
1480
|
+
def ml_analysis(df, channel_of_interest=3, location_column='columnID', positive_control='c2', negative_control='c1', exclude=None, n_repeats=10, top_features=30, reg_alpha=0.1, reg_lambda=1.0, learning_rate=0.00001, n_estimators=1000, test_size=0.2, model_type='xgboost', n_jobs=-1, remove_low_variance_features=True, remove_highly_correlated_features=True, prune_features=False, cross_validation=False, verbose=False):
|
1604
1481
|
|
1605
1482
|
"""
|
1606
1483
|
Calculates permutation importance for numerical features in the dataframe,
|
@@ -1829,8 +1706,8 @@ def ml_analysis(df, channel_of_interest=3, location_column='column_name', positi
|
|
1829
1706
|
df = _calculate_similarity(df, features, location_column, positive_control, negative_control)
|
1830
1707
|
|
1831
1708
|
df['prcfo'] = df.index.astype(str)
|
1832
|
-
df[['
|
1833
|
-
df['prc'] = df['
|
1709
|
+
df[['plateID', 'rowID', 'columnID', 'fieldID', 'object']] = df['prcfo'].str.split('_', expand=True)
|
1710
|
+
df['prc'] = df['plateID'] + '_' + df['rowID'] + '_' + df['columnID']
|
1834
1711
|
|
1835
1712
|
return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test, metrics_df, features], [permutation_fig, feature_importance_fig]
|
1836
1713
|
|
@@ -2004,11 +1881,17 @@ def interperate_vision_model(settings={}):
|
|
2004
1881
|
# Clean and align columns for merging
|
2005
1882
|
df['object_label'] = df['object_label'].str.replace('o', '')
|
2006
1883
|
|
2007
|
-
if '
|
2008
|
-
|
1884
|
+
if 'rowID' not in scores_df.columns:
|
1885
|
+
if 'row' in scores_df.columns:
|
1886
|
+
scores_df['rowID'] = scores_df['row']
|
1887
|
+
if 'row_name' in scores_df.columns:
|
1888
|
+
scores_df['rowID'] = scores_df['row_name']
|
2009
1889
|
|
2010
|
-
if '
|
2011
|
-
|
1890
|
+
if 'columnID' not in scores_df.columns:
|
1891
|
+
if 'col' in scores_df.columns:
|
1892
|
+
scores_df['columnID'] = scores_df['col']
|
1893
|
+
if 'column' in scores_df.columns:
|
1894
|
+
scores_df['columnID'] = scores_df['column']
|
2012
1895
|
|
2013
1896
|
if 'object_label' not in scores_df.columns:
|
2014
1897
|
scores_df['object_label'] = scores_df['object']
|
@@ -2020,14 +1903,14 @@ def interperate_vision_model(settings={}):
|
|
2020
1903
|
scores_df['object_label'] = scores_df['object'].astype(str)
|
2021
1904
|
|
2022
1905
|
# Ensure all join columns have the same data type in both DataFrames
|
2023
|
-
df[['
|
2024
|
-
scores_df[['
|
1906
|
+
df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']] = df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']].astype(str)
|
1907
|
+
scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']] = scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label']].astype(str)
|
2025
1908
|
|
2026
1909
|
# Select only the necessary columns from scores_df for merging
|
2027
|
-
scores_df = scores_df[['
|
1910
|
+
scores_df = scores_df[['plateID', 'rowID', 'columnID', 'fieldID', 'object_label', settings['score_column']]]
|
2028
1911
|
|
2029
1912
|
# Now merge DataFrames
|
2030
|
-
merged_df = pd.merge(df, scores_df, on=['
|
1913
|
+
merged_df = pd.merge(df, scores_df, on=['plateID', 'rowID', 'columnID', 'fieldID', 'object_label'], how='inner')
|
2031
1914
|
|
2032
1915
|
# Separate numerical features and the score column
|
2033
1916
|
X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
|