msreport 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +16 -7
- msreport/analyze.py +414 -39
- msreport/plot/distribution.py +3 -3
- msreport/plot/multivariate.py +0 -1
- msreport/plot/style_sheets/_all_relevant_styles.md +594 -0
- msreport/qtable.py +3 -2
- msreport/reader.py +9 -1
- msreport/rinterface/limma.py +68 -3
- msreport/rinterface/rscripts/limma.R +79 -18
- {msreport-0.0.32.dist-info → msreport-0.0.33.dist-info}/METADATA +153 -154
- {msreport-0.0.32.dist-info → msreport-0.0.33.dist-info}/RECORD +13 -13
- {msreport-0.0.32.dist-info → msreport-0.0.33.dist-info}/WHEEL +1 -2
- msreport-0.0.32.dist-info/top_level.txt +0 -1
- {msreport-0.0.32.dist-info → msreport-0.0.33.dist-info}/licenses/LICENSE.txt +0 -0
msreport/__init__.py
CHANGED
|
@@ -1,11 +1,20 @@
|
|
|
1
|
-
import
|
|
2
|
-
import msreport.export
|
|
3
|
-
import msreport.impute
|
|
4
|
-
import msreport.normalize
|
|
5
|
-
import msreport.plot
|
|
6
|
-
import msreport.reader
|
|
1
|
+
from msreport import analyze, export, impute, normalize, plot, reader
|
|
7
2
|
from msreport.fasta import import_protein_database
|
|
8
3
|
from msreport.qtable import Qtable
|
|
9
4
|
from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
|
|
10
5
|
|
|
11
|
-
__version__ = "0.0.
|
|
6
|
+
__version__ = "0.0.33"
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"analyze",
|
|
10
|
+
"export",
|
|
11
|
+
"impute",
|
|
12
|
+
"normalize",
|
|
13
|
+
"plot",
|
|
14
|
+
"reader",
|
|
15
|
+
"import_protein_database",
|
|
16
|
+
"Qtable",
|
|
17
|
+
"FragPipeReader",
|
|
18
|
+
"MaxQuantReader",
|
|
19
|
+
"SpectronautReader",
|
|
20
|
+
]
|
msreport/analyze.py
CHANGED
|
@@ -10,6 +10,8 @@ from typing import Iterable, Optional, Protocol, Sequence
|
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
12
|
import pandas as pd
|
|
13
|
+
import scipy.stats
|
|
14
|
+
import statsmodels.stats.multitest
|
|
13
15
|
from typing_extensions import Self
|
|
14
16
|
|
|
15
17
|
import msreport.normalize
|
|
@@ -533,7 +535,7 @@ def calculate_multi_group_comparison(
|
|
|
533
535
|
qtable.add_expression_features(comparison_table)
|
|
534
536
|
|
|
535
537
|
|
|
536
|
-
def
|
|
538
|
+
def calculate_two_group_comparison(
|
|
537
539
|
qtable: Qtable, experiment_pair: Iterable[str], exclude_invalid: bool = True
|
|
538
540
|
) -> None:
|
|
539
541
|
"""Calculates comparison values for two experiments.
|
|
@@ -560,19 +562,23 @@ def calculate_multi_group_limma(
|
|
|
560
562
|
exclude_invalid: bool = True,
|
|
561
563
|
batch: bool = False,
|
|
562
564
|
limma_trend: bool = True,
|
|
565
|
+
min_replicates: int = 2,
|
|
563
566
|
) -> None:
|
|
564
567
|
"""Uses limma to perform a differential expression analysis of multiple experiments.
|
|
565
568
|
|
|
566
569
|
For each experiment pair specified in 'experiment_pairs' the following new columns
|
|
567
570
|
are added to the qtable:
|
|
568
|
-
- "P-value Experiment_1 vs Experiment_2"
|
|
569
|
-
- "Adjusted p-value Experiment_1 vs Experiment_2"
|
|
570
|
-
- "Average expression Experiment_1 vs Experiment_2"
|
|
571
|
-
- "Ratio [log2] Experiment_1 vs Experiment_2"
|
|
572
|
-
|
|
573
|
-
Requires that expression columns
|
|
574
|
-
|
|
575
|
-
|
|
571
|
+
- "P-value 'Experiment_1' vs 'Experiment_2'"
|
|
572
|
+
- "Adjusted p-value 'Experiment_1' vs 'Experiment_2'"
|
|
573
|
+
- "Average expression 'Experiment_1' vs 'Experiment_2'"
|
|
574
|
+
- "Ratio [log2] 'Experiment_1' vs 'Experiment_2'"
|
|
575
|
+
|
|
576
|
+
Requires that the Qtable has defined expression columns, and expression values are
|
|
577
|
+
log2 transformed. For each experiment and row, if the number of non-missing values
|
|
578
|
+
is below 'min_replicates', all values for that experiment and row are set to NaN for
|
|
579
|
+
the LIMMA calculation. As a result, p-values are only calculated for rows where both
|
|
580
|
+
experiments have at least 'min_replicates' non-missing values. Adjusted p-values are
|
|
581
|
+
calculated using the Benjamini-Hochberg (BH) method.
|
|
576
582
|
|
|
577
583
|
Args:
|
|
578
584
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -588,8 +594,12 @@ def calculate_multi_group_limma(
|
|
|
588
594
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
589
595
|
variance during calculation of the moderated t-statistics, refer to
|
|
590
596
|
limma.eBayes for details; default True.
|
|
597
|
+
min_replicates: Minimum number of non-missing values required in the row of any
|
|
598
|
+
experiment to be included in the analysis; default 2.
|
|
591
599
|
|
|
592
600
|
Raises:
|
|
601
|
+
OptionalDependencyError: If the R interface is not available, which is required
|
|
602
|
+
for using LIMMA.
|
|
593
603
|
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
594
604
|
must have exactly two entries and the two entries must not be the same. All
|
|
595
605
|
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
@@ -620,12 +630,27 @@ def calculate_multi_group_limma(
|
|
|
620
630
|
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
621
631
|
comparison_tag = " vs "
|
|
622
632
|
|
|
633
|
+
# Apply min_replicates filter
|
|
634
|
+
for experiment in qtable.get_experiments():
|
|
635
|
+
samples = qtable.get_samples(experiment)
|
|
636
|
+
counts = table[samples].notna().sum(axis=1)
|
|
637
|
+
table.loc[counts < min_replicates, samples] = np.nan
|
|
638
|
+
|
|
623
639
|
if exclude_invalid:
|
|
624
|
-
|
|
640
|
+
mask = qtable["Valid"].copy().to_numpy() & (table.notna().sum(axis=1) > 0)
|
|
625
641
|
else:
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
642
|
+
mask = table.notna().sum(axis=1) > 0
|
|
643
|
+
|
|
644
|
+
# At least one row with one condition with two values is required for LIMMA
|
|
645
|
+
valid_exp_rows = []
|
|
646
|
+
for exp in qtable.get_experiments():
|
|
647
|
+
samples = qtable.get_samples(exp)
|
|
648
|
+
valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
|
|
649
|
+
if np.array(valid_exp_rows).any(axis=0).sum() == 0:
|
|
650
|
+
raise ValueError(
|
|
651
|
+
"No rows with sufficient data for differential expression analysis remain"
|
|
652
|
+
" after applying 'min_replicates' and 'exclude_invalid' filters."
|
|
653
|
+
)
|
|
629
654
|
|
|
630
655
|
# Exchange experiment names with names that are guaranteed to be valid in R
|
|
631
656
|
experiment_to_r = {}
|
|
@@ -637,7 +662,7 @@ def calculate_multi_group_limma(
|
|
|
637
662
|
for exp1, exp2 in experiment_pairs:
|
|
638
663
|
r_experiment_pairs.append(f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}")
|
|
639
664
|
|
|
640
|
-
design.replace({"Experiment": experiment_to_r}
|
|
665
|
+
design = design.replace({"Experiment": experiment_to_r})
|
|
641
666
|
|
|
642
667
|
# Run limma and join results for all comparison groups
|
|
643
668
|
limma_results = msreport.rinterface.multi_group_limma(
|
|
@@ -647,35 +672,43 @@ def calculate_multi_group_limma(
|
|
|
647
672
|
experiment_pair = [r_to_experiment[s] for s in r_comparison_group.split("-")]
|
|
648
673
|
comparison_group = comparison_tag.join(experiment_pair)
|
|
649
674
|
mapping = {col: f"{col} {comparison_group}" for col in limma_result.columns}
|
|
650
|
-
limma_result.rename(columns=mapping
|
|
675
|
+
limma_result = limma_result.rename(columns=mapping)
|
|
651
676
|
|
|
652
677
|
limma_table = pd.DataFrame(index=table.index)
|
|
653
678
|
limma_table = limma_table.join(list(limma_results.values()))
|
|
654
|
-
limma_table.
|
|
679
|
+
limma_table = limma_table.astype("float64")
|
|
655
680
|
qtable.add_expression_features(limma_table)
|
|
656
681
|
|
|
657
682
|
# Average expression from limma is the whole row mean, overwrite with the average
|
|
658
683
|
# expression of the experiment group
|
|
659
684
|
for experiment_pair in experiment_pairs:
|
|
660
|
-
|
|
685
|
+
calculate_two_group_comparison(
|
|
686
|
+
qtable, experiment_pair, exclude_invalid=exclude_invalid
|
|
687
|
+
)
|
|
661
688
|
|
|
662
689
|
|
|
663
690
|
def calculate_two_group_limma(
|
|
664
691
|
qtable: Qtable,
|
|
665
692
|
experiment_pair: Sequence[str],
|
|
666
693
|
exclude_invalid: bool = True,
|
|
694
|
+
batch: bool = False,
|
|
667
695
|
limma_trend: bool = True,
|
|
696
|
+
min_replicates: int = 2,
|
|
668
697
|
) -> None:
|
|
669
698
|
"""Uses limma to perform a differential expression analysis of two experiments.
|
|
670
699
|
|
|
671
|
-
|
|
672
|
-
"
|
|
673
|
-
"
|
|
674
|
-
"
|
|
700
|
+
New columns that are added to the qtable:
|
|
701
|
+
- "P-value 'Experiment_1' vs 'Experiment_2'"
|
|
702
|
+
- "Adjusted p-value 'Experiment_1' vs 'Experiment_2'"
|
|
703
|
+
- "Average expression 'Experiment_1' vs 'Experiment_2'"
|
|
704
|
+
- "Ratio [log2] 'Experiment_1' vs 'Experiment_2'"
|
|
675
705
|
|
|
676
|
-
Requires that expression columns
|
|
677
|
-
transformed.
|
|
678
|
-
|
|
706
|
+
Requires that the Qtable has defined expression columns, and expression values are
|
|
707
|
+
log2 transformed. For each experiment and row, if the number of non-missing values
|
|
708
|
+
is below 'min_replicates', all values for that experiment and row are set to NaN for
|
|
709
|
+
the LIMMA calculation. As a result, p-values are only calculated for rows where both
|
|
710
|
+
experiments have at least 'min_replicates' non-missing values. Adjusted p-values are
|
|
711
|
+
calculated using the Benjamini-Hochberg (BH) method.
|
|
679
712
|
|
|
680
713
|
Args:
|
|
681
714
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -684,56 +717,398 @@ def calculate_two_group_limma(
|
|
|
684
717
|
experiments must be present in qtable.design
|
|
685
718
|
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
686
719
|
used for the differential expression analysis; default True.
|
|
720
|
+
batch: If true batch effects are considered for the differential expression
|
|
721
|
+
analysis. Batches must be specified in the design in a "Batch" column.
|
|
687
722
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
688
723
|
variances; default True.
|
|
724
|
+
min_replicates: Minimum number of non-missing values required in both
|
|
725
|
+
experiments to include a row in the analysis; default 2.
|
|
726
|
+
|
|
689
727
|
Raises:
|
|
728
|
+
OptionalDependencyError: If the R interface is not available, which is required
|
|
729
|
+
for using LIMMA.
|
|
730
|
+
KeyError: If the "Batch" column is not present in the qtable.design when
|
|
731
|
+
'batch' is set to True.
|
|
690
732
|
ValueError: If 'experiment_pair' contains invalid entries. The experiment pair
|
|
691
733
|
must have exactly two entries and the two entries must not be the same. Both
|
|
692
734
|
experiments must be present in qtable.design.
|
|
735
|
+
ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
|
|
736
|
+
is set to True.
|
|
737
|
+
ValueError: If no rows with sufficient data for differential expression analysis
|
|
738
|
+
remain after applying 'min_replicates' and 'exclude_invalid' filters.
|
|
693
739
|
"""
|
|
694
740
|
if not _rinterface_available:
|
|
695
741
|
raise OptionalDependencyError(_rinterface_error)
|
|
696
742
|
|
|
697
743
|
_validate_experiment_pair(qtable, experiment_pair)
|
|
744
|
+
if batch and "Batch" not in qtable.get_design():
|
|
745
|
+
raise KeyError(
|
|
746
|
+
"When using calculate_two_group_limma(batch=True) a"
|
|
747
|
+
' "Batch" column must be present in qtable.design'
|
|
748
|
+
)
|
|
749
|
+
if batch and qtable.get_design()["Batch"].nunique() == 1:
|
|
750
|
+
raise ValueError(
|
|
751
|
+
"When using calculate_two_group_limma(batch=True), not all values from"
|
|
752
|
+
' qtable.design["Batch"] are allowed to be identical.'
|
|
753
|
+
)
|
|
698
754
|
# TODO: LIMMA function not tested #
|
|
699
755
|
table = qtable.make_expression_table(samples_as_columns=True)
|
|
700
756
|
comparison_tag = " vs "
|
|
701
757
|
|
|
702
|
-
if exclude_invalid:
|
|
703
|
-
valid = qtable["Valid"]
|
|
704
|
-
else:
|
|
705
|
-
valid = np.full(table.shape[0], True)
|
|
706
|
-
|
|
707
758
|
samples_to_experiment = {}
|
|
708
|
-
for
|
|
709
|
-
|
|
710
|
-
samples_to_experiment.update(mapping)
|
|
759
|
+
for exp in experiment_pair:
|
|
760
|
+
samples_to_experiment.update(dict.fromkeys(qtable.get_samples(exp), exp))
|
|
711
761
|
|
|
712
762
|
# Keep only samples that are present in the 'experiment_pair'
|
|
713
763
|
table = table[samples_to_experiment.keys()]
|
|
714
764
|
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
715
|
-
|
|
765
|
+
groups = [samples_to_experiment[s] for s in table.columns]
|
|
716
766
|
|
|
717
|
-
|
|
718
|
-
|
|
767
|
+
# Apply min_replicates filter
|
|
768
|
+
for experiment in experiment_pair:
|
|
769
|
+
samples = qtable.get_samples(experiment)
|
|
770
|
+
counts = table[samples].notna().sum(axis=1)
|
|
771
|
+
table.loc[counts < min_replicates, samples] = np.nan
|
|
719
772
|
|
|
773
|
+
if exclude_invalid:
|
|
774
|
+
mask = qtable["Valid"].copy().to_numpy() & (table.notna().sum(axis=1) > 0)
|
|
775
|
+
else:
|
|
776
|
+
mask = table.notna().sum(axis=1) > 0
|
|
777
|
+
|
|
778
|
+
# At least one row with one condition with two values is required for LIMMA
|
|
779
|
+
valid_exp_rows = []
|
|
780
|
+
for exp in experiment_pair:
|
|
781
|
+
samples = qtable.get_samples(exp)
|
|
782
|
+
valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
|
|
783
|
+
if np.array(valid_exp_rows).any(axis=0).sum() == 0:
|
|
784
|
+
raise ValueError(
|
|
785
|
+
"No rows with sufficient data for differential expression analysis remain"
|
|
786
|
+
" after applying 'min_replicates' and 'exclude_invalid' filters."
|
|
787
|
+
)
|
|
788
|
+
|
|
789
|
+
batch_groups = None
|
|
790
|
+
if batch:
|
|
791
|
+
design_df = qtable.get_design().set_index("Sample")
|
|
792
|
+
batch_groups = [str(design_df.loc[s, "Batch"]) for s in table.columns]
|
|
793
|
+
print(table[mask])
|
|
720
794
|
# Note that the order of experiments for calling limma is reversed
|
|
721
795
|
limma_result = msreport.rinterface.two_group_limma(
|
|
722
|
-
table[mask],
|
|
796
|
+
table[mask],
|
|
797
|
+
groups,
|
|
798
|
+
experiment_pair[1],
|
|
799
|
+
experiment_pair[0],
|
|
800
|
+
limma_trend,
|
|
801
|
+
batch_groups,
|
|
723
802
|
)
|
|
724
803
|
|
|
725
804
|
# For adding expression features to the qtable it is necessary that the
|
|
726
805
|
# limma_results have the same number of rows.
|
|
727
|
-
limma_table = pd.DataFrame(
|
|
806
|
+
limma_table = pd.DataFrame(
|
|
807
|
+
index=table.index, columns=limma_result.columns, dtype="float64"
|
|
808
|
+
)
|
|
728
809
|
limma_table[mask] = limma_result
|
|
729
|
-
limma_table.fillna(np.nan, inplace=True)
|
|
730
810
|
|
|
731
811
|
comparison_group = comparison_tag.join(experiment_pair)
|
|
732
812
|
mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
|
|
733
|
-
limma_table.rename(columns=mapping
|
|
813
|
+
limma_table = limma_table.rename(columns=mapping)
|
|
734
814
|
qtable.add_expression_features(limma_table)
|
|
735
815
|
|
|
736
816
|
|
|
817
|
+
def calculate_anova_limma(
|
|
818
|
+
qtable: Qtable,
|
|
819
|
+
experiments: Iterable[str] | None = None,
|
|
820
|
+
exclude_invalid: bool = True,
|
|
821
|
+
batch: bool = False,
|
|
822
|
+
limma_trend: bool = True,
|
|
823
|
+
min_replicates: int = 2,
|
|
824
|
+
) -> None:
|
|
825
|
+
"""Calculates one-way moderated ANOVA using LIMMA across multiple experiment groups.
|
|
826
|
+
|
|
827
|
+
New columns that are added to the qtable:
|
|
828
|
+
- "ANOVA p-value"
|
|
829
|
+
- "ANOVA adjusted p-value"
|
|
830
|
+
|
|
831
|
+
Requires that the Qtable has defined expression columns, and expression values are
|
|
832
|
+
log2 transformed. ANOVA is calculated for rows where at least two experiments meet
|
|
833
|
+
the 'min_replicates' threshold of non-missing values. For a given row, an experiment
|
|
834
|
+
group failing this threshold is treated as missing data for the calculation. At
|
|
835
|
+
least two valid experiments must remain for a p-value to be generated. Adjusted
|
|
836
|
+
p-values are calculated using the Benjamini-Hochberg (BH) method.
|
|
837
|
+
|
|
838
|
+
Args:
|
|
839
|
+
qtable: Qtable instance that contains expression values for the analysis.
|
|
840
|
+
experiments: A list of experiment names from qtable.design["Experiment"] to
|
|
841
|
+
include in the ANOVA. If None, all experiments in the design are used;
|
|
842
|
+
default None.
|
|
843
|
+
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
844
|
+
used for the ANOVA; default True.
|
|
845
|
+
batch: If true batch effects are considered for the differential expression
|
|
846
|
+
analysis. Batches must be specified in the design in a "Batch" column.
|
|
847
|
+
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
848
|
+
variances; default True.
|
|
849
|
+
min_replicates: Minimum number of non-missing values required per experiment to
|
|
850
|
+
include that experiment's data for a given row; default 2.
|
|
851
|
+
|
|
852
|
+
Raises:
|
|
853
|
+
OptionalDependencyError: If the R interface is not available, which is required
|
|
854
|
+
for using LIMMA.
|
|
855
|
+
KeyError: If the "Batch" column is not present in the qtable.design when
|
|
856
|
+
'batch' is set to True.
|
|
857
|
+
ValueError: If 'experiments' contains entries not present in qtable.design.
|
|
858
|
+
ValueError: If less than two experiments with at least 'min_replicates' are
|
|
859
|
+
present in the qtable.design when performing the ANOVA.
|
|
860
|
+
"""
|
|
861
|
+
if not _rinterface_available:
|
|
862
|
+
raise OptionalDependencyError(_rinterface_error)
|
|
863
|
+
|
|
864
|
+
# TODO: not tested #
|
|
865
|
+
if batch and "Batch" not in qtable.get_design():
|
|
866
|
+
raise KeyError(
|
|
867
|
+
'When using calculate_anova_limma(batch=True) a "Batch" column must be '
|
|
868
|
+
"present in qtable.design"
|
|
869
|
+
)
|
|
870
|
+
if batch and qtable.get_design()["Batch"].nunique() == 1:
|
|
871
|
+
raise ValueError(
|
|
872
|
+
"When using calculate_anova_limma(batch=True), not all values from"
|
|
873
|
+
' qtable.design["Batch"] are allowed to be identical.'
|
|
874
|
+
)
|
|
875
|
+
|
|
876
|
+
if experiments is not None and any(
|
|
877
|
+
e not in qtable.design["Experiment"].unique() for e in experiments
|
|
878
|
+
):
|
|
879
|
+
raise ValueError("Some specified experiments are not present in qtable.design.")
|
|
880
|
+
if experiments is None:
|
|
881
|
+
experiments = qtable.get_experiments()
|
|
882
|
+
|
|
883
|
+
valid_experiments = []
|
|
884
|
+
for experiment in experiments:
|
|
885
|
+
if len(qtable.get_samples(experiment)) >= min_replicates:
|
|
886
|
+
valid_experiments.append(experiment)
|
|
887
|
+
if len(valid_experiments) < 2:
|
|
888
|
+
raise ValueError(
|
|
889
|
+
f"At least two experiments with {min_replicates} or more replicates are "
|
|
890
|
+
"required for calculating moderated ANOVA statistics with LIMMA."
|
|
891
|
+
)
|
|
892
|
+
|
|
893
|
+
design = qtable.get_design()
|
|
894
|
+
design = design[design["Experiment"].isin(valid_experiments)]
|
|
895
|
+
|
|
896
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
897
|
+
table = table[design["Sample"]]
|
|
898
|
+
|
|
899
|
+
for experiment in valid_experiments:
|
|
900
|
+
samples = qtable.get_samples(experiment)
|
|
901
|
+
counts = table[samples].notna().sum(axis=1)
|
|
902
|
+
table.loc[counts < min_replicates, samples] = np.nan
|
|
903
|
+
|
|
904
|
+
if exclude_invalid:
|
|
905
|
+
mask = qtable["Valid"].to_numpy() & (table.notna().sum(axis=1) > 0)
|
|
906
|
+
else:
|
|
907
|
+
mask = table.notna().sum(axis=1) > 0
|
|
908
|
+
|
|
909
|
+
# At least one row with one condition with two values is required for LIMMA
|
|
910
|
+
valid_exp_rows = []
|
|
911
|
+
for exp in valid_experiments:
|
|
912
|
+
samples = qtable.get_samples(exp)
|
|
913
|
+
valid_exp_rows.append(table.loc[mask, samples].notna().sum(axis=1) >= 2)
|
|
914
|
+
if np.array(valid_exp_rows).any(axis=0).sum() == 0:
|
|
915
|
+
raise ValueError(
|
|
916
|
+
"No rows with sufficient data for ANOVA analysis remain"
|
|
917
|
+
" after applying 'min_replicates' and 'exclude_invalid' filters."
|
|
918
|
+
)
|
|
919
|
+
|
|
920
|
+
# Exchange experiment names with names that are guaranteed to be valid in R
|
|
921
|
+
experiment_to_r = {}
|
|
922
|
+
for i, experiment in enumerate(design["Experiment"].unique()):
|
|
923
|
+
experiment_to_r[experiment] = f".EXPERIMENT__{i:04d}"
|
|
924
|
+
|
|
925
|
+
design = design.replace({"Experiment": experiment_to_r})
|
|
926
|
+
|
|
927
|
+
limma_result = msreport.rinterface.limma.limma_anova(
|
|
928
|
+
table[mask], design, batch, limma_trend
|
|
929
|
+
)
|
|
930
|
+
|
|
931
|
+
# For adding expression features to the qtable it is necessary that the
|
|
932
|
+
# limma_results have the same number of rows.
|
|
933
|
+
limma_table = pd.DataFrame(
|
|
934
|
+
index=table.index, columns=limma_result.columns, dtype="float64"
|
|
935
|
+
)
|
|
936
|
+
limma_table[mask] = limma_result
|
|
937
|
+
qtable.add_expression_features(limma_table)
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def calculate_multi_group_ttest(
|
|
941
|
+
qtable: Qtable,
|
|
942
|
+
experiment_pairs: Sequence[Iterable[str]],
|
|
943
|
+
exclude_invalid: bool = True,
|
|
944
|
+
equal_var: bool = False,
|
|
945
|
+
) -> None:
|
|
946
|
+
"""Calculates t-tests for multiple experiment pairs.
|
|
947
|
+
|
|
948
|
+
For each experiment pair specified in 'experiment_pairs' the following new columns
|
|
949
|
+
are added to the qtable:
|
|
950
|
+
- "P-value Experiment_1 vs Experiment_2"
|
|
951
|
+
- "Adjusted p-value Experiment_1 vs Experiment_2"
|
|
952
|
+
|
|
953
|
+
Missing values are omitted and the t-test is calculated only for rows where both
|
|
954
|
+
experiment groups have at least two quantified values. Adjusted p-values are
|
|
955
|
+
calculated using the Benjamini-Hochberg (BH) method. Requires that expression
|
|
956
|
+
columns are set in the qtable.
|
|
957
|
+
|
|
958
|
+
Args:
|
|
959
|
+
qtable: Qtable instance that contains expression values for t-tests.
|
|
960
|
+
experiment_pairs: A list containing one or multiple experiment pairs for which
|
|
961
|
+
t-tests should be calculated. The specified experiments must correspond to
|
|
962
|
+
entries from qtable.design["Experiment"].
|
|
963
|
+
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
964
|
+
used for the t-tests; default True.
|
|
965
|
+
equal_var: If true, the two groups are assumed to have identical variances and
|
|
966
|
+
a standard independent 2 sample t-test is performed. If false, Welch's
|
|
967
|
+
t-test is performed; default False.
|
|
968
|
+
|
|
969
|
+
Raises:
|
|
970
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
971
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
972
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
973
|
+
are allowed.
|
|
974
|
+
"""
|
|
975
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
976
|
+
min_required_values = 2
|
|
977
|
+
|
|
978
|
+
table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
|
|
979
|
+
comparison_tag = " vs "
|
|
980
|
+
|
|
981
|
+
if exclude_invalid:
|
|
982
|
+
valid = table["Valid"].to_numpy()
|
|
983
|
+
else:
|
|
984
|
+
valid = np.full(table.shape[0], True)
|
|
985
|
+
|
|
986
|
+
for experiment_pair in experiment_pairs:
|
|
987
|
+
group_expressions = []
|
|
988
|
+
for experiment in experiment_pair:
|
|
989
|
+
samples = qtable.get_samples(experiment)
|
|
990
|
+
group_expressions.append(table.loc[valid, samples])
|
|
991
|
+
|
|
992
|
+
# implement the at least two values per experiment rule here, set rows to nan
|
|
993
|
+
# where this is not the case
|
|
994
|
+
for i in range(2):
|
|
995
|
+
num_values = np.isfinite(group_expressions[i]).sum(axis=1)
|
|
996
|
+
insufficient_values = num_values < min_required_values
|
|
997
|
+
group_expressions[i].loc[insufficient_values, :] = np.nan
|
|
998
|
+
|
|
999
|
+
_, pvalues = scipy.stats.ttest_ind(
|
|
1000
|
+
group_expressions[0],
|
|
1001
|
+
group_expressions[1],
|
|
1002
|
+
axis=1,
|
|
1003
|
+
equal_var=equal_var,
|
|
1004
|
+
nan_policy="omit",
|
|
1005
|
+
)
|
|
1006
|
+
|
|
1007
|
+
finite_pvalues = pvalues[np.isfinite(pvalues)]
|
|
1008
|
+
_, finite_adjusted_pvalues, _, _ = statsmodels.stats.multitest.multipletests(
|
|
1009
|
+
finite_pvalues, method="fdr_bh"
|
|
1010
|
+
)
|
|
1011
|
+
adjusted_pvalues = pvalues.copy()
|
|
1012
|
+
adjusted_pvalues[np.isfinite(pvalues)] = finite_adjusted_pvalues
|
|
1013
|
+
|
|
1014
|
+
comparison_group = comparison_tag.join(experiment_pair)
|
|
1015
|
+
comparison_table = pd.DataFrame(
|
|
1016
|
+
{
|
|
1017
|
+
f"P-value {comparison_group}": np.full(table.shape[0], np.nan),
|
|
1018
|
+
f"Adjusted p-value {comparison_group}": np.full(table.shape[0], np.nan),
|
|
1019
|
+
}
|
|
1020
|
+
)
|
|
1021
|
+
comparison_table.loc[valid, f"P-value {comparison_group}"] = pvalues
|
|
1022
|
+
comparison_table.loc[valid, f"Adjusted p-value {comparison_group}"] = (
|
|
1023
|
+
adjusted_pvalues
|
|
1024
|
+
)
|
|
1025
|
+
|
|
1026
|
+
qtable.add_expression_features(comparison_table)
|
|
1027
|
+
|
|
1028
|
+
|
|
1029
|
+
def calculate_anova(
|
|
1030
|
+
qtable: Qtable,
|
|
1031
|
+
experiments: Iterable[str] | None = None,
|
|
1032
|
+
exclude_invalid: bool = True,
|
|
1033
|
+
equal_var: bool = False,
|
|
1034
|
+
) -> None:
|
|
1035
|
+
"""Calculates one-way ANOVA across multiple experiment groups.
|
|
1036
|
+
|
|
1037
|
+
New columns are added to the qtable:
|
|
1038
|
+
- "ANOVA p-value"
|
|
1039
|
+
- "ANOVA adjusted p-value"
|
|
1040
|
+
|
|
1041
|
+
Missing values are omitted, and ANOVA is calculated only for rows where all
|
|
1042
|
+
experiment groups have at least two quantified values. Adjusted p-values are
|
|
1043
|
+
calculated using the Benjamini-Hochberg (BH) method. Requires that expression
|
|
1044
|
+
columns are set in the qtable.
|
|
1045
|
+
|
|
1046
|
+
Args:
|
|
1047
|
+
qtable: Qtable instance that contains expression values for the analysis.
|
|
1048
|
+
experiments: A list of experiment names from qtable.design["Experiment"] to
|
|
1049
|
+
include in the ANOVA. If None, all experiments in the design are used;
|
|
1050
|
+
default None.
|
|
1051
|
+
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
1052
|
+
used for the ANOVA; default True.
|
|
1053
|
+
equal_var: If true, the groups are assumed to have identical variances and a
|
|
1054
|
+
standard one-way ANOVA is performed. If false, Welch's ANOVA is performed;
|
|
1055
|
+
default False.
|
|
1056
|
+
|
|
1057
|
+
Raises:
|
|
1058
|
+
ValueError: If 'experiments' contains entries not present in qtable.design.
|
|
1059
|
+
"""
|
|
1060
|
+
if experiments is not None and any(
|
|
1061
|
+
e not in qtable.design["Experiment"].unique() for e in experiments
|
|
1062
|
+
):
|
|
1063
|
+
raise ValueError("Some specified experiments are not present in qtable.design.")
|
|
1064
|
+
min_required_values = 2
|
|
1065
|
+
if experiments is None:
|
|
1066
|
+
experiments = qtable.get_experiments()
|
|
1067
|
+
|
|
1068
|
+
table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
|
|
1069
|
+
if exclude_invalid:
|
|
1070
|
+
valid = table["Valid"].to_numpy()
|
|
1071
|
+
else:
|
|
1072
|
+
valid = np.full(table.shape[0], True)
|
|
1073
|
+
|
|
1074
|
+
experiment_data = []
|
|
1075
|
+
for experiment in experiments:
|
|
1076
|
+
samples = qtable.get_samples(experiment)
|
|
1077
|
+
experiment_data.append(table[samples].to_numpy())
|
|
1078
|
+
experiment_array = np.array(experiment_data)
|
|
1079
|
+
|
|
1080
|
+
for replicate_data in experiment_data:
|
|
1081
|
+
valid_entries = np.isfinite(replicate_data).sum(axis=1) < min_required_values
|
|
1082
|
+
valid[valid_entries] = False
|
|
1083
|
+
|
|
1084
|
+
anova_pvalues = []
|
|
1085
|
+
for row_data in experiment_array[:, valid, :].swapaxes(0, 1):
|
|
1086
|
+
anova_input = []
|
|
1087
|
+
for values in row_data:
|
|
1088
|
+
anova_input.append(values[~np.isnan(values)])
|
|
1089
|
+
_, pvalue = scipy.stats.f_oneway(*anova_input, equal_var=equal_var)
|
|
1090
|
+
anova_pvalues.append(pvalue)
|
|
1091
|
+
_, anova_adjusted_pvalues, _, _ = statsmodels.stats.multitest.multipletests(
|
|
1092
|
+
anova_pvalues, method="fdr_bh"
|
|
1093
|
+
)
|
|
1094
|
+
|
|
1095
|
+
pvalues = np.empty((table.shape[0],))
|
|
1096
|
+
pvalues[:] = np.nan
|
|
1097
|
+
pvalues[valid] = anova_pvalues
|
|
1098
|
+
|
|
1099
|
+
adjusted_pvalues = np.empty((table.shape[0],))
|
|
1100
|
+
adjusted_pvalues[:] = np.nan
|
|
1101
|
+
adjusted_pvalues[valid] = anova_adjusted_pvalues
|
|
1102
|
+
|
|
1103
|
+
comparison_table = pd.DataFrame(
|
|
1104
|
+
{
|
|
1105
|
+
"ANOVA p-value": pvalues,
|
|
1106
|
+
"ANOVA adjusted p-value": adjusted_pvalues,
|
|
1107
|
+
}
|
|
1108
|
+
)
|
|
1109
|
+
qtable.add_expression_features(comparison_table)
|
|
1110
|
+
|
|
1111
|
+
|
|
737
1112
|
def _validate_experiment_pairs(
|
|
738
1113
|
qtable: Qtable, exp_pairs: Iterable[Iterable[str]]
|
|
739
1114
|
) -> None:
|
msreport/plot/distribution.py
CHANGED
|
@@ -201,11 +201,11 @@ def experiment_ratios(
|
|
|
201
201
|
experiment_data = pd.DataFrame(experiment_means)
|
|
202
202
|
|
|
203
203
|
# Only consider rows with quantitative values in all experiments
|
|
204
|
-
mask =
|
|
204
|
+
mask = experiment_data.isna().sum(axis=1) == 0
|
|
205
205
|
if exclude_invalid:
|
|
206
|
-
mask = mask & qtable["Valid"]
|
|
206
|
+
mask = mask.to_numpy() & qtable["Valid"].to_numpy()
|
|
207
207
|
# Use `mask.to_numpy` to solve issue with different indices of mask and dataframe
|
|
208
|
-
experiment_data = experiment_data[mask
|
|
208
|
+
experiment_data = experiment_data[mask]
|
|
209
209
|
pseudo_reference = np.nanmean(experiment_data, axis=1)
|
|
210
210
|
ratio_data = experiment_data.subtract(pseudo_reference, axis=0)
|
|
211
211
|
|
msreport/plot/multivariate.py
CHANGED
|
@@ -258,7 +258,6 @@ def expression_clustermap(
|
|
|
258
258
|
raise ValueError("At least two samples are required to generate a clustermap.")
|
|
259
259
|
|
|
260
260
|
data = qtable.make_expression_table(samples_as_columns=True, exclude_invalid=False)
|
|
261
|
-
data = data[samples]
|
|
262
261
|
data = data.fillna(0)
|
|
263
262
|
|
|
264
263
|
if not mean_center: # Hide missing values in the heatmap, making them appear white
|