spacr 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/ml.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, shap
1
+ import os, shap, re
2
2
  import pandas as pd
3
3
  import numpy as np
4
4
  from scipy import stats
@@ -354,75 +354,128 @@ def perform_regression(settings):
354
354
  from .settings import get_perform_regression_default_settings
355
355
  from .toxo import go_term_enrichment_by_column, custom_volcano_plot
356
356
 
357
- if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
358
- settings['plate'] = None
359
- if len(settings['score_data']) == 1:
360
- settings['score_data'] = settings['score_data'][0]
361
- if len(settings['count_data']) == 1:
362
- settings['count_data'] = settings['count_data'][0]
357
+ def _perform_regression_read_data(settings):
358
+
359
+ if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
360
+ settings['plate'] = None
361
+ if len(settings['score_data']) == 1:
362
+ settings['score_data'] = settings['score_data'][0]
363
+ if len(settings['count_data']) == 1:
364
+ settings['count_data'] = settings['count_data'][0]
365
+ else:
366
+ count_data_df = pd.DataFrame()
367
+ for i, count_data in enumerate(settings['count_data']):
368
+ df = pd.read_csv(count_data)
369
+ df['plate_name'] = f'plate{i+1}'
370
+ if 'column' in df.columns:
371
+ df['col'] = df['column']
372
+ count_data_df = pd.concat([count_data_df, df])
373
+ print('Count data:', len(count_data_df))
374
+
375
+ score_data_df = pd.DataFrame()
376
+ for i, score_data in enumerate(settings['score_data']):
377
+ df = pd.read_csv(score_data)
378
+ df['plate_name'] = f'plate{i+1}'
379
+ if 'column' in df.columns:
380
+ df['col'] = df['column']
381
+ score_data_df = pd.concat([score_data_df, df])
382
+ print('Score data:', len(score_data_df))
363
383
  else:
364
- count_data_df = pd.DataFrame()
365
- for i, count_data in enumerate(settings['count_data']):
366
- df = pd.read_csv(count_data)
367
- df['plate_name'] = f'plate{i+1}'
368
- count_data_df = pd.concat([count_data_df, df])
369
- print('Count data:', len(count_data_df))
370
-
371
- score_data_df = pd.DataFrame()
372
- for i, score_data in enumerate(settings['score_data']):
373
- df = pd.read_csv(score_data)
374
- df['plate_name'] = f'plate{i+1}'
375
- score_data_df = pd.concat([score_data_df, df])
376
- print('Score data:', len(score_data_df))
377
- else:
378
- count_data_df = pd.read_csv(settings['count_data'])
379
- score_data_df = pd.read_csv(settings['score_data'])
380
-
381
- reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
382
- if settings['regression_type'] not in reg_types:
383
- print(f'Possible regression types: {reg_types}')
384
- raise ValueError(f"Unsupported regression type {settings['regression_type']}")
385
-
386
- if settings['dependent_variable'] not in score_data_df.columns:
387
- print(f'Columns in DataFrame:')
388
- for col in score_data_df.columns:
389
- print(col)
390
- raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
391
-
392
- if isinstance(settings['count_data'], list):
393
- src = os.path.dirname(settings['count_data'][0])
394
- csv_path = settings['count_data'][0]
395
- else:
396
- src = os.path.dirname(settings['count_data'])
397
- csv_path = settings['count_data']
384
+ count_data_df = pd.read_csv(settings['count_data'])
385
+ score_data_df = pd.read_csv(settings['score_data'])
398
386
 
399
- settings['src'] = src
400
- fldr = 'results_' + settings['regression_type']
401
- if isinstance(settings['count_data'], list):
402
- fldr = fldr + '_list'
387
+ print(f"Dependent variable: {len(score_data_df)}")
388
+ print(f"Independent variable: {len(count_data_df)}")
389
+
390
+ if settings['dependent_variable'] not in score_data_df.columns:
391
+ print(f'Columns in DataFrame:')
392
+ for col in score_data_df.columns:
393
+ print(col)
394
+ raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
395
+
396
+ if 'prediction_probability_class_1' in score_data_df.columns:
397
+ if not settings['class_1_threshold'] is None:
398
+ score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
399
+
400
+ reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
401
+ if settings['regression_type'] not in reg_types:
402
+ print(f'Possible regression types: {reg_types}')
403
+ raise ValueError(f"Unsupported regression type {settings['regression_type']}")
403
404
 
404
- if settings['regression_type'] == 'quantile':
405
- fldr = fldr + '_' + str(settings['alpha'])
405
+ return count_data_df, score_data_df
406
+
407
+ def _perform_regression_set_paths(settings):
408
+
409
+ if isinstance(settings['score_data'], list):
410
+ score_data = settings['score_data'][0]
411
+ else:
412
+ score_data = settings['score_data']
413
+
414
+ score_source = os.path.splitext(os.path.basename(score_data))[0]
415
+
416
+ if isinstance(settings['count_data'], list):
417
+ src = os.path.dirname(settings['count_data'][0])
418
+ csv_path = settings['count_data'][0]
419
+ else:
420
+ src = os.path.dirname(settings['count_data'])
421
+ csv_path = settings['count_data']
406
422
 
407
- res_folder = os.path.join(src, fldr)
408
- os.makedirs(res_folder, exist_ok=True)
409
- results_filename = 'results.csv'
410
- hits_filename = 'results_significant.csv'
411
- results_path=os.path.join(res_folder, results_filename)
412
- hits_path=os.path.join(res_folder, hits_filename)
423
+ settings['src'] = src
424
+ res_folder = os.path.join(src, 'results', score_source, settings['regression_type'])
425
+
426
+ if isinstance(settings['count_data'], list):
427
+ res_folder = os.path.join(res_folder, 'list')
428
+
429
+ os.makedirs(res_folder, exist_ok=True)
430
+ results_filename = 'results.csv'
431
+ results_filename_gene = 'results_gene.csv'
432
+ results_filename_grna = 'results_grna.csv'
433
+ hits_filename = 'results_significant.csv'
434
+ results_path=os.path.join(res_folder, results_filename)
435
+ results_path_gene=os.path.join(res_folder, results_filename_gene)
436
+ results_path_grna=os.path.join(res_folder, results_filename_grna)
437
+ hits_path=os.path.join(res_folder, hits_filename)
438
+
439
+ return results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path
440
+
441
+ def _count_variable_instances(df, column_1, column_2):
442
+ if column_1 is not None:
443
+ n_grna = df[column_1].value_counts().reset_index()
444
+ n_grna.columns = [column_1, f'n_{column_1}']
445
+ if column_2 is not None:
446
+ n_gene = df[column_2].value_counts().reset_index()
447
+ n_gene.columns = [column_2, f'n_{column_2}']
448
+ if column_1 is not None and column_2 is not None:
449
+ return df, n_grna, n_gene
450
+ elif column_1 is not None:
451
+ return df, n_grna
452
+ elif column_2 is not None:
453
+ return df, n_gene
454
+ else:
455
+ return df
413
456
 
414
457
  settings = get_perform_regression_default_settings(settings)
458
+ count_data_df, score_data_df = _perform_regression_read_data(settings)
459
+ results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
415
460
  save_settings(settings, name='regression', show=True)
416
461
 
417
462
  score_data_df = clean_controls(score_data_df, settings['pc'], settings['nc'], settings['other'])
418
-
419
- if 'prediction_probability_class_1' in score_data_df.columns:
420
- if not settings['class_1_threshold'] is None:
421
- score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
463
+ print(f"Dependent variable after clean_controls: {len(score_data_df)}")
422
464
 
423
465
  dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
424
-
425
- independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'])
466
+ print(f"Dependent variable after process_scores: {len(dependent_df)}")
467
+
468
+ filter_value = [settings['nc'], settings['pc']]
469
+
470
+ if settings['other'] is not None:
471
+ if isinstance(settings['other'], str):
472
+ settings['other'] = [settings['other']]
473
+ filter_value.extend(settings['other'])
474
+
475
+ independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=settings['location_column'], filter_value=filter_value)
476
+ independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
477
+
478
+ print(f"Independent variable after process_reads: {len(independent_df)}")
426
479
 
427
480
  merged_df = pd.merge(independent_df, dependent_df, on='prc')
428
481
 
@@ -436,7 +489,20 @@ def perform_regression(settings):
436
489
 
437
490
  model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], highlight=settings['highlight'], dst=res_folder, cov_type=settings['cov_type'])
438
491
 
492
+ coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
493
+ coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
494
+ coef_df = coef_df.merge(n_grna, how='left', on='grna')
495
+ coef_df = coef_df.merge(n_gene, how='left', on='gene')
496
+ display(coef_df)
497
+
498
+ gene_coef_df = coef_df[coef_df['n_gene'] != None]
499
+ grna_coef_df = coef_df[coef_df['n_grna'] != None]
500
+ gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
501
+ grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
502
+
439
503
  coef_df.to_csv(results_path, index=False)
504
+ gene_coef_df.to_csv(results_path_gene, index=False)
505
+ grna_coef_df.to_csv(results_path_grna, index=False)
440
506
 
441
507
  if settings['regression_type'] == 'lasso':
442
508
  significant = coef_df[coef_df['coefficient'] > 0]
@@ -460,18 +526,24 @@ def perform_regression(settings):
460
526
  filename, _ = os.path.splitext(file)
461
527
  _ = merge_regression_res_with_metadata(hits_path, metadata_file, name=filename)
462
528
  merged_df = merge_regression_res_with_metadata(results_path, metadata_file, name=filename)
529
+ gene_merged_df = merge_regression_res_with_metadata(results_path_gene, metadata_file, name=filename)
530
+ grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
463
531
 
464
532
  if settings['toxo']:
465
533
 
466
534
  data_path = merged_df
535
+ data_path_gene = gene_merged_df
536
+ data_path_grna = grna_merged_df
467
537
  base_dir = os.path.dirname(os.path.abspath(__file__))
468
538
  metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
469
539
 
470
540
  custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
471
-
472
- metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
473
-
474
- go_term_enrichment_by_column(significant, metadata_path)
541
+ custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
542
+ custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
543
+
544
+ if len(significant) > 2:
545
+ metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
546
+ go_term_enrichment_by_column(significant, metadata_path)
475
547
 
476
548
  print('Significant Genes')
477
549
  display(significant)
@@ -481,29 +553,43 @@ def perform_regression(settings):
481
553
 
482
554
  return output
483
555
 
484
- def process_reads(csv_path, fraction_threshold, plate):
556
+ def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filter_value=None):
485
557
 
486
558
  if isinstance(csv_path, pd.DataFrame):
487
559
  csv_df = csv_path
488
560
  else:
489
561
  # Read the CSV file into a DataFrame
490
562
  csv_df = pd.read_csv(csv_path)
491
-
563
+
492
564
  if 'plate_name' in csv_df.columns:
493
565
  csv_df = csv_df.rename(columns={'plate_name': 'plate'})
494
566
  if 'column_name' in csv_df.columns:
495
567
  csv_df = csv_df.rename(columns={'column_name': 'column'})
568
+ if 'col' in csv_df.columns:
569
+ csv_df = csv_df.rename(columns={'col': 'column'})
496
570
  if 'row_name' in csv_df.columns:
497
571
  csv_df = csv_df.rename(columns={'row_name': 'row'})
498
572
  if 'grna_name' in csv_df.columns:
499
573
  csv_df = csv_df.rename(columns={'grna_name': 'grna'})
500
574
  if 'plate_row' in csv_df.columns:
501
575
  csv_df[['plate', 'row']] = csv_df['plate_row'].str.split('_', expand=True)
576
+
502
577
  if not 'plate' in csv_df.columns:
503
578
  if not plate is None:
504
579
  csv_df['plate'] = plate
505
580
  else:
506
581
  csv_df['plate'] = 'plate1'
582
+
583
+ if isinstance(filter_column, str):
584
+ filter_column = [filter_column]
585
+
586
+ if isinstance(filter_value, str):
587
+ filter_value = [filter_value]
588
+
589
+ if isinstance(filter_column, list):
590
+ for filter_col in filter_column:
591
+ for value in filter_value:
592
+ csv_df = csv_df[csv_df[filter_col] != value]
507
593
 
508
594
  # Ensure the necessary columns are present
509
595
  if not all(col in csv_df.columns for col in ['row','column','grna','count']):
@@ -587,7 +673,8 @@ def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='m
587
673
  if 'col' not in df.columns:
588
674
  df['col'] = df['column']
589
675
 
590
- df['prc'] = df['plate'] + '_' + df['row'] + '_' + df['col']
676
+ df['prc'] = df['plate'].astype(str) + '_' + df['row'].astype(str) + '_' + df['col'].astype(str)
677
+
591
678
  df = df[['prc', dependent_variable]]
592
679
 
593
680
  # Group by prc and calculate the mean and count of the dependent_variable