spacr 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +105 -1
- spacr/deep_spacr.py +191 -141
- spacr/gui.py +1 -0
- spacr/gui_core.py +13 -4
- spacr/gui_utils.py +29 -1
- spacr/io.py +84 -125
- spacr/measure.py +1 -38
- spacr/ml.py +153 -66
- spacr/plot.py +429 -7
- spacr/settings.py +55 -10
- spacr/submodules.py +7 -6
- spacr/toxo.py +9 -4
- spacr/utils.py +510 -16
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/METADATA +28 -25
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/RECORD +19 -19
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/LICENSE +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/WHEEL +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.3.dist-info}/top_level.txt +0 -0
spacr/ml.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
import os, shap
|
1
|
+
import os, shap, re
|
2
2
|
import pandas as pd
|
3
3
|
import numpy as np
|
4
4
|
from scipy import stats
|
@@ -354,75 +354,128 @@ def perform_regression(settings):
|
|
354
354
|
from .settings import get_perform_regression_default_settings
|
355
355
|
from .toxo import go_term_enrichment_by_column, custom_volcano_plot
|
356
356
|
|
357
|
-
|
358
|
-
|
359
|
-
if
|
360
|
-
settings['
|
361
|
-
|
362
|
-
|
357
|
+
def _perform_regression_read_data(settings):
|
358
|
+
|
359
|
+
if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
|
360
|
+
settings['plate'] = None
|
361
|
+
if len(settings['score_data']) == 1:
|
362
|
+
settings['score_data'] = settings['score_data'][0]
|
363
|
+
if len(settings['count_data']) == 1:
|
364
|
+
settings['count_data'] = settings['count_data'][0]
|
365
|
+
else:
|
366
|
+
count_data_df = pd.DataFrame()
|
367
|
+
for i, count_data in enumerate(settings['count_data']):
|
368
|
+
df = pd.read_csv(count_data)
|
369
|
+
df['plate_name'] = f'plate{i+1}'
|
370
|
+
if 'column' in df.columns:
|
371
|
+
df['col'] = df['column']
|
372
|
+
count_data_df = pd.concat([count_data_df, df])
|
373
|
+
print('Count data:', len(count_data_df))
|
374
|
+
|
375
|
+
score_data_df = pd.DataFrame()
|
376
|
+
for i, score_data in enumerate(settings['score_data']):
|
377
|
+
df = pd.read_csv(score_data)
|
378
|
+
df['plate_name'] = f'plate{i+1}'
|
379
|
+
if 'column' in df.columns:
|
380
|
+
df['col'] = df['column']
|
381
|
+
score_data_df = pd.concat([score_data_df, df])
|
382
|
+
print('Score data:', len(score_data_df))
|
363
383
|
else:
|
364
|
-
count_data_df = pd.
|
365
|
-
|
366
|
-
df = pd.read_csv(count_data)
|
367
|
-
df['plate_name'] = f'plate{i+1}'
|
368
|
-
count_data_df = pd.concat([count_data_df, df])
|
369
|
-
print('Count data:', len(count_data_df))
|
370
|
-
|
371
|
-
score_data_df = pd.DataFrame()
|
372
|
-
for i, score_data in enumerate(settings['score_data']):
|
373
|
-
df = pd.read_csv(score_data)
|
374
|
-
df['plate_name'] = f'plate{i+1}'
|
375
|
-
score_data_df = pd.concat([score_data_df, df])
|
376
|
-
print('Score data:', len(score_data_df))
|
377
|
-
else:
|
378
|
-
count_data_df = pd.read_csv(settings['count_data'])
|
379
|
-
score_data_df = pd.read_csv(settings['score_data'])
|
380
|
-
|
381
|
-
reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
|
382
|
-
if settings['regression_type'] not in reg_types:
|
383
|
-
print(f'Possible regression types: {reg_types}')
|
384
|
-
raise ValueError(f"Unsupported regression type {settings['regression_type']}")
|
385
|
-
|
386
|
-
if settings['dependent_variable'] not in score_data_df.columns:
|
387
|
-
print(f'Columns in DataFrame:')
|
388
|
-
for col in score_data_df.columns:
|
389
|
-
print(col)
|
390
|
-
raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
|
391
|
-
|
392
|
-
if isinstance(settings['count_data'], list):
|
393
|
-
src = os.path.dirname(settings['count_data'][0])
|
394
|
-
csv_path = settings['count_data'][0]
|
395
|
-
else:
|
396
|
-
src = os.path.dirname(settings['count_data'])
|
397
|
-
csv_path = settings['count_data']
|
384
|
+
count_data_df = pd.read_csv(settings['count_data'])
|
385
|
+
score_data_df = pd.read_csv(settings['score_data'])
|
398
386
|
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
387
|
+
print(f"Dependent variable: {len(score_data_df)}")
|
388
|
+
print(f"Independent variable: {len(count_data_df)}")
|
389
|
+
|
390
|
+
if settings['dependent_variable'] not in score_data_df.columns:
|
391
|
+
print(f'Columns in DataFrame:')
|
392
|
+
for col in score_data_df.columns:
|
393
|
+
print(col)
|
394
|
+
raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
|
395
|
+
|
396
|
+
if 'prediction_probability_class_1' in score_data_df.columns:
|
397
|
+
if not settings['class_1_threshold'] is None:
|
398
|
+
score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
|
399
|
+
|
400
|
+
reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
|
401
|
+
if settings['regression_type'] not in reg_types:
|
402
|
+
print(f'Possible regression types: {reg_types}')
|
403
|
+
raise ValueError(f"Unsupported regression type {settings['regression_type']}")
|
403
404
|
|
404
|
-
|
405
|
-
|
405
|
+
return count_data_df, score_data_df
|
406
|
+
|
407
|
+
def _perform_regression_set_paths(settings):
|
408
|
+
|
409
|
+
if isinstance(settings['score_data'], list):
|
410
|
+
score_data = settings['score_data'][0]
|
411
|
+
else:
|
412
|
+
score_data = settings['score_data']
|
413
|
+
|
414
|
+
score_source = os.path.splitext(os.path.basename(score_data))[0]
|
415
|
+
|
416
|
+
if isinstance(settings['count_data'], list):
|
417
|
+
src = os.path.dirname(settings['count_data'][0])
|
418
|
+
csv_path = settings['count_data'][0]
|
419
|
+
else:
|
420
|
+
src = os.path.dirname(settings['count_data'])
|
421
|
+
csv_path = settings['count_data']
|
406
422
|
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
423
|
+
settings['src'] = src
|
424
|
+
res_folder = os.path.join(src, 'results', score_source, settings['regression_type'])
|
425
|
+
|
426
|
+
if isinstance(settings['count_data'], list):
|
427
|
+
res_folder = os.path.join(res_folder, 'list')
|
428
|
+
|
429
|
+
os.makedirs(res_folder, exist_ok=True)
|
430
|
+
results_filename = 'results.csv'
|
431
|
+
results_filename_gene = 'results_gene.csv'
|
432
|
+
results_filename_grna = 'results_grna.csv'
|
433
|
+
hits_filename = 'results_significant.csv'
|
434
|
+
results_path=os.path.join(res_folder, results_filename)
|
435
|
+
results_path_gene=os.path.join(res_folder, results_filename_gene)
|
436
|
+
results_path_grna=os.path.join(res_folder, results_filename_grna)
|
437
|
+
hits_path=os.path.join(res_folder, hits_filename)
|
438
|
+
|
439
|
+
return results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path
|
440
|
+
|
441
|
+
def _count_variable_instances(df, column_1, column_2):
|
442
|
+
if column_1 is not None:
|
443
|
+
n_grna = df[column_1].value_counts().reset_index()
|
444
|
+
n_grna.columns = [column_1, f'n_{column_1}']
|
445
|
+
if column_2 is not None:
|
446
|
+
n_gene = df[column_2].value_counts().reset_index()
|
447
|
+
n_gene.columns = [column_2, f'n_{column_2}']
|
448
|
+
if column_1 is not None and column_2 is not None:
|
449
|
+
return df, n_grna, n_gene
|
450
|
+
elif column_1 is not None:
|
451
|
+
return df, n_grna
|
452
|
+
elif column_2 is not None:
|
453
|
+
return df, n_gene
|
454
|
+
else:
|
455
|
+
return df
|
413
456
|
|
414
457
|
settings = get_perform_regression_default_settings(settings)
|
458
|
+
count_data_df, score_data_df = _perform_regression_read_data(settings)
|
459
|
+
results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
|
415
460
|
save_settings(settings, name='regression', show=True)
|
416
461
|
|
417
462
|
score_data_df = clean_controls(score_data_df, settings['pc'], settings['nc'], settings['other'])
|
418
|
-
|
419
|
-
if 'prediction_probability_class_1' in score_data_df.columns:
|
420
|
-
if not settings['class_1_threshold'] is None:
|
421
|
-
score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
|
463
|
+
print(f"Dependent variable after clean_controls: {len(score_data_df)}")
|
422
464
|
|
423
465
|
dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
|
424
|
-
|
425
|
-
|
466
|
+
print(f"Dependent variable after process_scores: {len(dependent_df)}")
|
467
|
+
|
468
|
+
filter_value = [settings['nc'], settings['pc']]
|
469
|
+
|
470
|
+
if settings['other'] is not None:
|
471
|
+
if isinstance(settings['other'], str):
|
472
|
+
settings['other'] = [settings['other']]
|
473
|
+
filter_value.extend(settings['other'])
|
474
|
+
|
475
|
+
independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=settings['location_column'], filter_value=filter_value)
|
476
|
+
independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
|
477
|
+
|
478
|
+
print(f"Independent variable after process_reads: {len(independent_df)}")
|
426
479
|
|
427
480
|
merged_df = pd.merge(independent_df, dependent_df, on='prc')
|
428
481
|
|
@@ -436,7 +489,20 @@ def perform_regression(settings):
|
|
436
489
|
|
437
490
|
model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], highlight=settings['highlight'], dst=res_folder, cov_type=settings['cov_type'])
|
438
491
|
|
492
|
+
coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
|
493
|
+
coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
|
494
|
+
coef_df = coef_df.merge(n_grna, how='left', on='grna')
|
495
|
+
coef_df = coef_df.merge(n_gene, how='left', on='gene')
|
496
|
+
display(coef_df)
|
497
|
+
|
498
|
+
gene_coef_df = coef_df[coef_df['n_gene'] != None]
|
499
|
+
grna_coef_df = coef_df[coef_df['n_grna'] != None]
|
500
|
+
gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
|
501
|
+
grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
|
502
|
+
|
439
503
|
coef_df.to_csv(results_path, index=False)
|
504
|
+
gene_coef_df.to_csv(results_path_gene, index=False)
|
505
|
+
grna_coef_df.to_csv(results_path_grna, index=False)
|
440
506
|
|
441
507
|
if settings['regression_type'] == 'lasso':
|
442
508
|
significant = coef_df[coef_df['coefficient'] > 0]
|
@@ -460,18 +526,24 @@ def perform_regression(settings):
|
|
460
526
|
filename, _ = os.path.splitext(file)
|
461
527
|
_ = merge_regression_res_with_metadata(hits_path, metadata_file, name=filename)
|
462
528
|
merged_df = merge_regression_res_with_metadata(results_path, metadata_file, name=filename)
|
529
|
+
gene_merged_df = merge_regression_res_with_metadata(results_path_gene, metadata_file, name=filename)
|
530
|
+
grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
|
463
531
|
|
464
532
|
if settings['toxo']:
|
465
533
|
|
466
534
|
data_path = merged_df
|
535
|
+
data_path_gene = gene_merged_df
|
536
|
+
data_path_grna = grna_merged_df
|
467
537
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
468
538
|
metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
|
469
539
|
|
470
540
|
custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
471
|
-
|
472
|
-
metadata_path =
|
473
|
-
|
474
|
-
|
541
|
+
custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
542
|
+
custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
|
543
|
+
|
544
|
+
if len(significant) > 2:
|
545
|
+
metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
|
546
|
+
go_term_enrichment_by_column(significant, metadata_path)
|
475
547
|
|
476
548
|
print('Significant Genes')
|
477
549
|
display(significant)
|
@@ -481,29 +553,43 @@ def perform_regression(settings):
|
|
481
553
|
|
482
554
|
return output
|
483
555
|
|
484
|
-
def process_reads(csv_path, fraction_threshold, plate):
|
556
|
+
def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filter_value=None):
|
485
557
|
|
486
558
|
if isinstance(csv_path, pd.DataFrame):
|
487
559
|
csv_df = csv_path
|
488
560
|
else:
|
489
561
|
# Read the CSV file into a DataFrame
|
490
562
|
csv_df = pd.read_csv(csv_path)
|
491
|
-
|
563
|
+
|
492
564
|
if 'plate_name' in csv_df.columns:
|
493
565
|
csv_df = csv_df.rename(columns={'plate_name': 'plate'})
|
494
566
|
if 'column_name' in csv_df.columns:
|
495
567
|
csv_df = csv_df.rename(columns={'column_name': 'column'})
|
568
|
+
if 'col' in csv_df.columns:
|
569
|
+
csv_df = csv_df.rename(columns={'col': 'column'})
|
496
570
|
if 'row_name' in csv_df.columns:
|
497
571
|
csv_df = csv_df.rename(columns={'row_name': 'row'})
|
498
572
|
if 'grna_name' in csv_df.columns:
|
499
573
|
csv_df = csv_df.rename(columns={'grna_name': 'grna'})
|
500
574
|
if 'plate_row' in csv_df.columns:
|
501
575
|
csv_df[['plate', 'row']] = csv_df['plate_row'].str.split('_', expand=True)
|
576
|
+
|
502
577
|
if not 'plate' in csv_df.columns:
|
503
578
|
if not plate is None:
|
504
579
|
csv_df['plate'] = plate
|
505
580
|
else:
|
506
581
|
csv_df['plate'] = 'plate1'
|
582
|
+
|
583
|
+
if isinstance(filter_column, str):
|
584
|
+
filter_column = [filter_column]
|
585
|
+
|
586
|
+
if isinstance(filter_value, str):
|
587
|
+
filter_value = [filter_value]
|
588
|
+
|
589
|
+
if isinstance(filter_column, list):
|
590
|
+
for filter_col in filter_column:
|
591
|
+
for value in filter_value:
|
592
|
+
csv_df = csv_df[csv_df[filter_col] != value]
|
507
593
|
|
508
594
|
# Ensure the necessary columns are present
|
509
595
|
if not all(col in csv_df.columns for col in ['row','column','grna','count']):
|
@@ -587,7 +673,8 @@ def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='m
|
|
587
673
|
if 'col' not in df.columns:
|
588
674
|
df['col'] = df['column']
|
589
675
|
|
590
|
-
df['prc'] = df['plate'] + '_' + df['row'] + '_' + df['col']
|
676
|
+
df['prc'] = df['plate'].astype(str) + '_' + df['row'].astype(str) + '_' + df['col'].astype(str)
|
677
|
+
|
591
678
|
df = df[['prc', dependent_variable]]
|
592
679
|
|
593
680
|
# Group by prc and calculate the mean and count of the dependent_variable
|