msreport 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/__init__.py CHANGED
@@ -1,11 +1,20 @@
1
- import msreport.analyze
2
- import msreport.export
3
- import msreport.impute
4
- import msreport.normalize
5
- import msreport.plot
6
- import msreport.reader
1
+ from msreport import analyze, export, impute, normalize, plot, reader
7
2
  from msreport.fasta import import_protein_database
8
3
  from msreport.qtable import Qtable
9
4
  from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
10
5
 
11
- __version__ = "0.0.32"
6
+ __version__ = "0.0.33"
7
+
8
+ __all__ = [
9
+ "analyze",
10
+ "export",
11
+ "impute",
12
+ "normalize",
13
+ "plot",
14
+ "reader",
15
+ "import_protein_database",
16
+ "Qtable",
17
+ "FragPipeReader",
18
+ "MaxQuantReader",
19
+ "SpectronautReader",
20
+ ]
msreport/analyze.py CHANGED
@@ -10,6 +10,8 @@ from typing import Iterable, Optional, Protocol, Sequence
10
10
 
11
11
  import numpy as np
12
12
  import pandas as pd
13
+ import scipy.stats
14
+ import statsmodels.stats.multitest
13
15
  from typing_extensions import Self
14
16
 
15
17
  import msreport.normalize
@@ -533,7 +535,7 @@ def calculate_multi_group_comparison(
533
535
  qtable.add_expression_features(comparison_table)
534
536
 
535
537
 
536
- def two_group_comparison(
538
+ def calculate_two_group_comparison(
537
539
  qtable: Qtable, experiment_pair: Iterable[str], exclude_invalid: bool = True
538
540
  ) -> None:
539
541
  """Calculates comparison values for two experiments.
@@ -560,19 +562,23 @@ def calculate_multi_group_limma(
560
562
  exclude_invalid: bool = True,
561
563
  batch: bool = False,
562
564
  limma_trend: bool = True,
565
+ min_replicates: int = 2,
563
566
  ) -> None:
564
567
  """Uses limma to perform a differential expression analysis of multiple experiments.
565
568
 
566
569
  For each experiment pair specified in 'experiment_pairs' the following new columns
567
570
  are added to the qtable:
568
- - "P-value Experiment_1 vs Experiment_2"
569
- - "Adjusted p-value Experiment_1 vs Experiment_2"
570
- - "Average expression Experiment_1 vs Experiment_2"
571
- - "Ratio [log2] Experiment_1 vs Experiment_2"
572
-
573
- Requires that expression columns are set, and expression values are log2 transformed
574
- All rows with missing values are ignored, impute missing values to allow
575
- differential expression analysis of all rows.
571
+ - "P-value 'Experiment_1' vs 'Experiment_2'"
572
+ - "Adjusted p-value 'Experiment_1' vs 'Experiment_2'"
573
+ - "Average expression 'Experiment_1' vs 'Experiment_2'"
574
+ - "Ratio [log2] 'Experiment_1' vs 'Experiment_2'"
575
+
576
+ Requires that the Qtable has defined expression columns, and expression values are
577
+ log2 transformed. For each experiment and row, if the number of non-missing values
578
+ is below 'min_replicates', all values for that experiment and row are set to NaN for
579
+ the LIMMA calculation. As a result, p-values are only calculated for rows where both
580
+ experiments have at least 'min_replicates' non-missing values. Adjusted p-values are
581
+ calculated using the Benjamini-Hochberg (BH) method.
576
582
 
577
583
  Args:
578
584
  qtable: Qtable instance that contains expression values for differential
@@ -588,8 +594,12 @@ def calculate_multi_group_limma(
588
594
  limma_trend: If true, an intensity-dependent trend is fitted to the prior
589
595
  variance during calculation of the moderated t-statistics, refer to
590
596
  limma.eBayes for details; default True.
597
+ min_replicates: Minimum number of non-missing values required in the row of any
598
+ experiment to be included in the analysis; default 2.
591
599
 
592
600
  Raises:
601
+ OptionalDependencyError: If the R interface is not available, which is required
602
+ for using LIMMA.
593
603
  ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
594
604
  must have exactly two entries and the two entries must not be the same. All
595
605
  experiments must be present in qtable.design. No duplicate experiment pairs
@@ -620,12 +630,27 @@ def calculate_multi_group_limma(
620
630
  table.index = table.index.astype(str) # It appears that a string is required for R
621
631
  comparison_tag = " vs "
622
632
 
633
+ # Apply min_replicates filter
634
+ for experiment in qtable.get_experiments():
635
+ samples = qtable.get_samples(experiment)
636
+ counts = table[samples].notna().sum(axis=1)
637
+ table.loc[counts < min_replicates, samples] = np.nan
638
+
623
639
  if exclude_invalid:
624
- valid = qtable["Valid"]
640
+ mask = qtable["Valid"].copy().to_numpy() & (table.notna().sum(axis=1) > 0)
625
641
  else:
626
- valid = np.full(table.shape[0], True)
627
- not_nan = table.isna().sum(axis=1) == 0
628
- mask = np.all([valid, not_nan], axis=0)
642
+ mask = table.notna().sum(axis=1) > 0
643
+
644
+ # At least one row with one condition with two values are required for LIMMA
645
+ valid_exp_rows = []
646
+ for exp in qtable.get_experiments():
647
+ samples = qtable.get_samples(exp)
648
+ valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
649
+ if np.array(valid_exp_rows).any(axis=0).sum() == 0:
650
+ raise ValueError(
651
+ "No rows with sufficient data for differential expression analysis remain"
652
+ " after applying 'min_replicates' and 'exclude_invalid' filters."
653
+ )
629
654
 
630
655
  # Exchange experiment names with names that are guaranteed to be valid in R
631
656
  experiment_to_r = {}
@@ -637,7 +662,7 @@ def calculate_multi_group_limma(
637
662
  for exp1, exp2 in experiment_pairs:
638
663
  r_experiment_pairs.append(f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}")
639
664
 
640
- design.replace({"Experiment": experiment_to_r}, inplace=True)
665
+ design = design.replace({"Experiment": experiment_to_r})
641
666
 
642
667
  # Run limma and join results for all comparison groups
643
668
  limma_results = msreport.rinterface.multi_group_limma(
@@ -647,35 +672,43 @@ def calculate_multi_group_limma(
647
672
  experiment_pair = [r_to_experiment[s] for s in r_comparison_group.split("-")]
648
673
  comparison_group = comparison_tag.join(experiment_pair)
649
674
  mapping = {col: f"{col} {comparison_group}" for col in limma_result.columns}
650
- limma_result.rename(columns=mapping, inplace=True)
675
+ limma_result = limma_result.rename(columns=mapping)
651
676
 
652
677
  limma_table = pd.DataFrame(index=table.index)
653
678
  limma_table = limma_table.join(list(limma_results.values()))
654
- limma_table.fillna(np.nan, inplace=True)
679
+ limma_table = limma_table.astype("float64")
655
680
  qtable.add_expression_features(limma_table)
656
681
 
657
682
  # Average expression from limma is the whole row mean, overwrite with the average
658
683
  # expression of the experiment group
659
684
  for experiment_pair in experiment_pairs:
660
- two_group_comparison(qtable, experiment_pair, exclude_invalid=exclude_invalid)
685
+ calculate_two_group_comparison(
686
+ qtable, experiment_pair, exclude_invalid=exclude_invalid
687
+ )
661
688
 
662
689
 
663
690
  def calculate_two_group_limma(
664
691
  qtable: Qtable,
665
692
  experiment_pair: Sequence[str],
666
693
  exclude_invalid: bool = True,
694
+ batch: bool = False,
667
695
  limma_trend: bool = True,
696
+ min_replicates: int = 2,
668
697
  ) -> None:
669
698
  """Uses limma to perform a differential expression analysis of two experiments.
670
699
 
671
- Adds new columns "P-value Experiment_1 vs Experiment_2",
672
- "Adjusted p-value Experiment_1 vs Experiment_2",
673
- "Average expression Experiment_1 vs Experiment_2", and
674
- "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable.
700
+ New columns that are added to the qtable:
701
+ - "P-value 'Experiment_1' vs 'Experiment_2'"
702
+ - "Adjusted p-value 'Experiment_1' vs 'Experiment_2'"
703
+ - "Average expression 'Experiment_1' vs 'Experiment_2'"
704
+ - "Ratio [log2] 'Experiment_1' vs 'Experiment_2'"
675
705
 
676
- Requires that expression columns are set, and expression values are log2
677
- transformed. All rows with missing values are ignored, impute missing values to
678
- allow differential expression analysis of all rows.
706
+ Requires that the Qtable has defined expression columns, and expression values are
707
+ log2 transformed. For each experiment and row, if the number of non-missing values
708
+ is below 'min_replicates', all values for that experiment and row are set to NaN for
709
+ the LIMMA calculation. As a result, p-values are only calculated for rows where both
710
+ experiments have at least 'min_replicates' non-missing values. Adjusted p-values are
711
+ calculated using the Benjamini-Hochberg (BH) method.
679
712
 
680
713
  Args:
681
714
  qtable: Qtable instance that contains expression values for differential
@@ -684,56 +717,398 @@ def calculate_two_group_limma(
684
717
  experiments must be present in qtable.design
685
718
  exclude_invalid: If true, the column "Valid" is used to determine which rows are
686
719
  used for the differential expression analysis; default True.
720
+ batch: If true batch effects are considered for the differential expression
721
+ analysis. Batches must be specified in the design in a "Batch" column.
687
722
  limma_trend: If true, an intensity-dependent trend is fitted to the prior
688
723
  variances; default True.
724
+ min_replicates: Minimum number of non-missing values required in both
725
+ experiments to include a row in the analysis; default 2.
726
+
689
727
  Raises:
728
+ OptionalDependencyError: If the R interface is not available, which is required
729
+ for using LIMMA.
730
+ KeyError: If the "Batch" column is not present in the qtable.design when
731
+ 'batch' is set to True.
690
732
  ValueError: If 'experiment_pair' contains invalid entries. The experiment pair
691
733
  must have exactly two entries and the two entries must not be the same. Both
692
734
  experiments must be present in qtable.design.
735
+ ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
736
+ is set to True.
737
+ ValueError: If no rows with sufficient data for differential expression analysis
738
+ remain after applying 'min_replicates' and 'exclude_invalid' filters.
693
739
  """
694
740
  if not _rinterface_available:
695
741
  raise OptionalDependencyError(_rinterface_error)
696
742
 
697
743
  _validate_experiment_pair(qtable, experiment_pair)
744
+ if batch and "Batch" not in qtable.get_design():
745
+ raise KeyError(
746
+ "When using calculate_two_group_limma(batch=True) a"
747
+ ' "Batch" column must be present in qtable.design'
748
+ )
749
+ if batch and qtable.get_design()["Batch"].nunique() == 1:
750
+ raise ValueError(
751
+ "When using calculate_two_group_limma(batch=True), not all values from"
752
+ ' qtable.design["Batch"] are allowed to be identical.'
753
+ )
698
754
  # TODO: LIMMA function not tested #
699
755
  table = qtable.make_expression_table(samples_as_columns=True)
700
756
  comparison_tag = " vs "
701
757
 
702
- if exclude_invalid:
703
- valid = qtable["Valid"]
704
- else:
705
- valid = np.full(table.shape[0], True)
706
-
707
758
  samples_to_experiment = {}
708
- for experiment in experiment_pair:
709
- mapping = dict.fromkeys(qtable.get_samples(experiment), experiment)
710
- samples_to_experiment.update(mapping)
759
+ for exp in experiment_pair:
760
+ samples_to_experiment.update(dict.fromkeys(qtable.get_samples(exp), exp))
711
761
 
712
762
  # Keep only samples that are present in the 'experiment_pair'
713
763
  table = table[samples_to_experiment.keys()]
714
764
  table.index = table.index.astype(str) # It appears that a string is required for R
715
- not_nan = table.isna().sum(axis=1) == 0
765
+ groups = [samples_to_experiment[s] for s in table.columns]
716
766
 
717
- mask = np.all([valid, not_nan], axis=0)
718
- experiments = list(samples_to_experiment.values())
767
+ # Apply min_replicates filter
768
+ for experiment in experiment_pair:
769
+ samples = qtable.get_samples(experiment)
770
+ counts = table[samples].notna().sum(axis=1)
771
+ table.loc[counts < min_replicates, samples] = np.nan
719
772
 
773
+ if exclude_invalid:
774
+ mask = qtable["Valid"].copy().to_numpy() & (table.notna().sum(axis=1) > 0)
775
+ else:
776
+ mask = table.notna().sum(axis=1) > 0
777
+
778
+ # At least one row with one condition with two values are required for LIMMA
779
+ valid_exp_rows = []
780
+ for exp in experiment_pair:
781
+ samples = qtable.get_samples(exp)
782
+ valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
783
+ if np.array(valid_exp_rows).any(axis=0).sum() == 0:
784
+ raise ValueError(
785
+ "No rows with sufficient data for differential expression analysis remain"
786
+ " after applying 'min_replicates' and 'exclude_invalid' filters."
787
+ )
788
+
789
+ batch_groups = None
790
+ if batch:
791
+ design_df = qtable.get_design().set_index("Sample")
792
+ batch_groups = [str(design_df.loc[s, "Batch"]) for s in table.columns]
793
+ print(table[mask])
720
794
  # Note that the order of experiments for calling limma is reversed
721
795
  limma_result = msreport.rinterface.two_group_limma(
722
- table[mask], experiments, experiment_pair[1], experiment_pair[0], limma_trend
796
+ table[mask],
797
+ groups,
798
+ experiment_pair[1],
799
+ experiment_pair[0],
800
+ limma_trend,
801
+ batch_groups,
723
802
  )
724
803
 
725
804
  # For adding expression features to the qtable it is necessary that the
726
805
  # the limma_results have the same number of rows.
727
- limma_table = pd.DataFrame(index=table.index, columns=limma_result.columns)
806
+ limma_table = pd.DataFrame(
807
+ index=table.index, columns=limma_result.columns, dtype="float64"
808
+ )
728
809
  limma_table[mask] = limma_result
729
- limma_table.fillna(np.nan, inplace=True)
730
810
 
731
811
  comparison_group = comparison_tag.join(experiment_pair)
732
812
  mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
733
- limma_table.rename(columns=mapping, inplace=True)
813
+ limma_table = limma_table.rename(columns=mapping)
734
814
  qtable.add_expression_features(limma_table)
735
815
 
736
816
 
817
+ def calculate_anova_limma(
818
+ qtable: Qtable,
819
+ experiments: Iterable[str] | None = None,
820
+ exclude_invalid: bool = True,
821
+ batch: bool = False,
822
+ limma_trend: bool = True,
823
+ min_replicates: int = 2,
824
+ ) -> None:
825
+ """Calculates one-way moderated ANOVA using LIMMA across multiple experiment groups.
826
+
827
+ New columns that are added to the qtable:
828
+ - "ANOVA p-value"
829
+ - "ANOVA adjusted p-value"
830
+
831
+ Requires that the Qtable has defined expression columns, and expression values are
832
+ log2 transformed. ANOVA is calculated for rows where at least two experiments meet
833
+ the 'min_replicates' threshold of non-missing values. For a given row, an experiment
834
+ group failing this threshold is treated as missing data for the calculation. At
835
+ least two valid experiments must remain for a p-value to be generated. Adjusted
836
+ p-values are calculated using the Benjamini-Hochberg (BH) method.
837
+
838
+ Args:
839
+ qtable: Qtable instance that contains expression values for the analysis.
840
+ experiments: A list of experiment names from qtable.design["Experiment"] to
841
+ include in the ANOVA. If None, all experiments in the design are used;
842
+ default None.
843
+ exclude_invalid: If true, the column "Valid" is used to determine which rows are
844
+ used for the ANOVA; default True.
845
+ batch: If true batch effects are considered for the differential expression
846
+ analysis. Batches must be specified in the design in a "Batch" column.
847
+ limma_trend: If true, an intensity-dependent trend is fitted to the prior
848
+ variances; default True.
849
+ min_replicates: Minimum number of non-missing values required per experiment to
850
+ include that experiment's data for a given row; default 2.
851
+
852
+ Raises:
853
+ OptionalDependencyError: If the R interface is not available, which is required
854
+ for using LIMMA.
855
+ KeyError: If the "Batch" column is not present in the qtable.design when
856
+ 'batch' is set to True.
857
+ ValueError: If 'experiments' contains entries not present in qtable.design.
858
+ ValueError: If less than two experiments with at least 'min_replicates' are
859
+ present in the qtable.design when performing the ANOVA.
860
+ """
861
+ if not _rinterface_available:
862
+ raise OptionalDependencyError(_rinterface_error)
863
+
864
+ # TODO: not tested #
865
+ if batch and "Batch" not in qtable.get_design():
866
+ raise KeyError(
867
+ 'When using calculate_anova_limma(batch=True) a "Batch" column must be '
868
+ "present in qtable.design"
869
+ )
870
+ if batch and qtable.get_design()["Batch"].nunique() == 1:
871
+ raise ValueError(
872
+ "When using calculate_anova_limma(batch=True), not all values from"
873
+ ' qtable.design["Batch"] are allowed to be identical.'
874
+ )
875
+
876
+ if experiments is not None and any(
877
+ e not in qtable.design["Experiment"].unique() for e in experiments
878
+ ):
879
+ raise ValueError("Some specified experiments are not present in qtable.design.")
880
+ if experiments is None:
881
+ experiments = qtable.get_experiments()
882
+
883
+ valid_experiments = []
884
+ for experiment in experiments:
885
+ if len(qtable.get_samples(experiment)) >= min_replicates:
886
+ valid_experiments.append(experiment)
887
+ if len(valid_experiments) < 2:
888
+ raise ValueError(
889
+ f"At least two experiments with {min_replicates} or more replicates are "
890
+ "required for calculating moderated ANOVA statistics with LIMMA."
891
+ )
892
+
893
+ design = qtable.get_design()
894
+ design = design[design["Experiment"].isin(valid_experiments)]
895
+
896
+ table = qtable.make_expression_table(samples_as_columns=True)
897
+ table = table[design["Sample"]]
898
+
899
+ for experiment in valid_experiments:
900
+ samples = qtable.get_samples(experiment)
901
+ counts = table[samples].notna().sum(axis=1)
902
+ table.loc[counts < min_replicates, samples] = np.nan
903
+
904
+ if exclude_invalid:
905
+ mask = qtable["Valid"].to_numpy() & (table.notna().sum(axis=1) > 0)
906
+ else:
907
+ mask = table.notna().sum(axis=1) > 0
908
+
909
+ # At least one row with one condition with two values is required for LIMMA
910
+ valid_exp_rows = []
911
+ for exp in valid_experiments:
912
+ samples = qtable.get_samples(exp)
913
+ valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
914
+ if np.array(valid_exp_rows).any(axis=0).sum() == 0:
915
+ raise ValueError(
916
+ "No rows with sufficient data for ANOVA analysis remain"
917
+ " after applying 'min_replicates' and 'exclude_invalid' filters."
918
+ )
919
+
920
+ # Exchange experiment names with names that are guaranteed to be valid in R
921
+ experiment_to_r = {}
922
+ for i, experiment in enumerate(design["Experiment"].unique()):
923
+ experiment_to_r[experiment] = f".EXPERIMENT__{i:04d}"
924
+
925
+ design = design.replace({"Experiment": experiment_to_r})
926
+
927
+ limma_result = msreport.rinterface.limma.limma_anova(
928
+ table[mask], design, batch, limma_trend
929
+ )
930
+
931
+ # For adding expression features to the qtable it is necessary that the
932
+ # the limma_results have the same number of rows.
933
+ limma_table = pd.DataFrame(
934
+ index=table.index, columns=limma_result.columns, dtype="float64"
935
+ )
936
+ limma_table[mask] = limma_result
937
+ qtable.add_expression_features(limma_table)
938
+
939
+
940
+ def calculate_multi_group_ttest(
941
+ qtable: Qtable,
942
+ experiment_pairs: Sequence[Iterable[str]],
943
+ exclude_invalid: bool = True,
944
+ equal_var: bool = False,
945
+ ) -> None:
946
+ """Calculates t-tests for multiple experiment pairs.
947
+
948
+ For each experiment pair specified in 'experiment_pairs' the following new columns
949
+ are added to the qtable:
950
+ - "P-value Experiment_1 vs Experiment_2"
951
+ - "Adjusted p-value Experiment_1 vs Experiment_2"
952
+
953
+ Missing values are omitted and the t-test is calculated only for rows where both
954
+ experiment groups have at least two quantified values. Adjusted p-values are
955
+ calculated using the Benjamini-Hochberg (BH) method. Requires that expression
956
+ columns are set in the qtable.
957
+
958
+ Args:
959
+ qtable: Qtable instance that contains expression values for t-tests.
960
+ experiment_pairs: A list containing one or multiple experiment pairs for which
961
+ t-tests should be calculated. The specified experiments must correspond to
962
+ entries from qtable.design["Experiment"].
963
+ exclude_invalid: If true, the column "Valid" is used to determine which rows are
964
+ used for the t-tests; default True.
965
+ equal_var: If true, the two groups are assumed to have identical variances and
966
+ a standard independent 2 sample t-test is performed. If false, Welch's
967
+ t-test is performed; default False.
968
+
969
+ Raises:
970
+ ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
971
+ must have exactly two entries and the two entries must not be the same. All
972
+ experiments must be present in qtable.design. No duplicate experiment pairs
973
+ are allowed.
974
+ """
975
+ _validate_experiment_pairs(qtable, experiment_pairs)
976
+ min_required_values = 2
977
+
978
+ table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
979
+ comparison_tag = " vs "
980
+
981
+ if exclude_invalid:
982
+ valid = table["Valid"].to_numpy()
983
+ else:
984
+ valid = np.full(table.shape[0], True)
985
+
986
+ for experiment_pair in experiment_pairs:
987
+ group_expressions = []
988
+ for experiment in experiment_pair:
989
+ samples = qtable.get_samples(experiment)
990
+ group_expressions.append(table.loc[valid, samples])
991
+
992
+ # implement the at least two values per experiment rule here, set rows to nan
993
+ # where this is not the case
994
+ for i in range(2):
995
+ num_values = np.isfinite(group_expressions[i]).sum(axis=1)
996
+ insufficient_values = num_values < min_required_values
997
+ group_expressions[i].loc[insufficient_values, :] = np.nan
998
+
999
+ _, pvalues = scipy.stats.ttest_ind(
1000
+ group_expressions[0],
1001
+ group_expressions[1],
1002
+ axis=1,
1003
+ equal_var=equal_var,
1004
+ nan_policy="omit",
1005
+ )
1006
+
1007
+ finite_pvalues = pvalues[np.isfinite(pvalues)]
1008
+ _, finite_adjusted_pvalues, _, _ = statsmodels.stats.multitest.multipletests(
1009
+ finite_pvalues, method="fdr_bh"
1010
+ )
1011
+ adjusted_pvalues = pvalues.copy()
1012
+ adjusted_pvalues[np.isfinite(pvalues)] = finite_adjusted_pvalues
1013
+
1014
+ comparison_group = comparison_tag.join(experiment_pair)
1015
+ comparison_table = pd.DataFrame(
1016
+ {
1017
+ f"P-value {comparison_group}": np.full(table.shape[0], np.nan),
1018
+ f"Adjusted p-value {comparison_group}": np.full(table.shape[0], np.nan),
1019
+ }
1020
+ )
1021
+ comparison_table.loc[valid, f"P-value {comparison_group}"] = pvalues
1022
+ comparison_table.loc[valid, f"Adjusted p-value {comparison_group}"] = (
1023
+ adjusted_pvalues
1024
+ )
1025
+
1026
+ qtable.add_expression_features(comparison_table)
1027
+
1028
+
1029
+ def calculate_anova(
1030
+ qtable: Qtable,
1031
+ experiments: Iterable[str] | None = None,
1032
+ exclude_invalid: bool = True,
1033
+ equal_var: bool = False,
1034
+ ) -> None:
1035
+ """Calculates one-way ANOVA across multiple experiment groups.
1036
+
1037
+ New columns are added to the qtable:
1038
+ - "ANOVA p-value"
1039
+ - "ANOVA adjusted p-value"
1040
+
1041
+ Missing values are omitted, and ANOVA is calculated only for rows where all
1042
+ experiment groups have at least two quantified values. Adjusted p-values are
1043
+ calculated using the Benjamini-Hochberg (BH) method. Requires that expression
1044
+ columns are set in the qtable.
1045
+
1046
+ Args:
1047
+ qtable: Qtable instance that contains expression values for the analysis.
1048
+ experiments: A list of experiment names from qtable.design["Experiment"] to
1049
+ include in the ANOVA. If None, all experiments in the design are used;
1050
+ default None.
1051
+ exclude_invalid: If true, the column "Valid" is used to determine which rows are
1052
+ used for the ANOVA; default True.
1053
+ equal_var: If true, the groups are assumed to have identical variances and a
1054
+ standard one-way ANOVA is performed. If false, Welch's ANOVA is performed;
1055
+ default False.
1056
+
1057
+ Raises:
1058
+ ValueError: If 'experiments' contains entries not present in qtable.design.
1059
+ """
1060
+ if experiments is not None and any(
1061
+ e not in qtable.design["Experiment"].unique() for e in experiments
1062
+ ):
1063
+ raise ValueError("Some specified experiments are not present in qtable.design.")
1064
+ min_required_values = 2
1065
+ if experiments is None:
1066
+ experiments = qtable.get_experiments()
1067
+
1068
+ table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
1069
+ if exclude_invalid:
1070
+ valid = table["Valid"].to_numpy()
1071
+ else:
1072
+ valid = np.full(table.shape[0], True)
1073
+
1074
+ experiment_data = []
1075
+ for experiment in experiments:
1076
+ samples = qtable.get_samples(experiment)
1077
+ experiment_data.append(table[samples].to_numpy())
1078
+ experiment_array = np.array(experiment_data)
1079
+
1080
+ for replicate_data in experiment_data:
1081
+ valid_entries = np.isfinite(replicate_data).sum(axis=1) < min_required_values
1082
+ valid[valid_entries] = False
1083
+
1084
+ anova_pvalues = []
1085
+ for row_data in experiment_array[:, valid, :].swapaxes(0, 1):
1086
+ anova_input = []
1087
+ for values in row_data:
1088
+ anova_input.append(values[~np.isnan(values)])
1089
+ _, pvalue = scipy.stats.f_oneway(*anova_input, equal_var=equal_var)
1090
+ anova_pvalues.append(pvalue)
1091
+ _, anova_adjusted_pvalues, _, _ = statsmodels.stats.multitest.multipletests(
1092
+ anova_pvalues, method="fdr_bh"
1093
+ )
1094
+
1095
+ pvalues = np.empty((table.shape[0],))
1096
+ pvalues[:] = np.nan
1097
+ pvalues[valid] = anova_pvalues
1098
+
1099
+ adjusted_pvalues = np.empty((table.shape[0],))
1100
+ adjusted_pvalues[:] = np.nan
1101
+ adjusted_pvalues[valid] = anova_adjusted_pvalues
1102
+
1103
+ comparison_table = pd.DataFrame(
1104
+ {
1105
+ "ANOVA p-value": pvalues,
1106
+ "ANOVA adjusted p-value": adjusted_pvalues,
1107
+ }
1108
+ )
1109
+ qtable.add_expression_features(comparison_table)
1110
+
1111
+
737
1112
  def _validate_experiment_pairs(
738
1113
  qtable: Qtable, exp_pairs: Iterable[Iterable[str]]
739
1114
  ) -> None:
@@ -201,11 +201,11 @@ def experiment_ratios(
201
201
  experiment_data = pd.DataFrame(experiment_means)
202
202
 
203
203
  # Only consider rows with quantitative values in all experiments
204
- mask = np.all([(qtable.data[f"Events {exp}"] > 0) for exp in experiments], axis=0)
204
+ mask = experiment_data.isna().sum(axis=1) == 0
205
205
  if exclude_invalid:
206
- mask = mask & qtable["Valid"]
206
+ mask = mask.to_numpy() & qtable["Valid"].to_numpy()
207
207
  # Use `mask.to_numpy` to solve issue with different indices of mask and dataframe
208
- experiment_data = experiment_data[mask.to_numpy()]
208
+ experiment_data = experiment_data[mask]
209
209
  pseudo_reference = np.nanmean(experiment_data, axis=1)
210
210
  ratio_data = experiment_data.subtract(pseudo_reference, axis=0)
211
211
 
@@ -258,7 +258,6 @@ def expression_clustermap(
258
258
  raise ValueError("At least two samples are required to generate a clustermap.")
259
259
 
260
260
  data = qtable.make_expression_table(samples_as_columns=True, exclude_invalid=False)
261
- data = data[samples]
262
261
  data = data.fillna(0)
263
262
 
264
263
  if not mean_center: # Hide missing values in the heatmap, making them appear white