rosetta-bioc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_bioc-0.1.0/PKG-INFO +144 -0
- rosetta_bioc-0.1.0/README.md +116 -0
- rosetta_bioc-0.1.0/pyproject.toml +42 -0
- rosetta_bioc-0.1.0/rosetta/__init__.py +23 -0
- rosetta_bioc-0.1.0/rosetta/__main__.py +3 -0
- rosetta_bioc-0.1.0/rosetta/_bridge.py +64 -0
- rosetta_bioc-0.1.0/rosetta/_deps.py +19 -0
- rosetta_bioc-0.1.0/rosetta/_errors.py +24 -0
- rosetta_bioc-0.1.0/rosetta/example.py +65 -0
- rosetta_bioc-0.1.0/rosetta/pipelines.py +160 -0
- rosetta_bioc-0.1.0/rosetta/results.py +119 -0
- rosetta_bioc-0.1.0/rosetta/stats/__init__.py +0 -0
- rosetta_bioc-0.1.0/rosetta/stats/decide.py +32 -0
- rosetta_bioc-0.1.0/rosetta/stats/design.py +28 -0
- rosetta_bioc-0.1.0/rosetta/stats/treat.py +13 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/__init__.py +1 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/clusterprofiler.py +103 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/deseq2.py +176 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/edger.py +76 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/limma.py +81 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/phyloseq.py +85 -0
- rosetta_bioc-0.1.0/rosetta/wrappers/seurat.py +78 -0
- rosetta_bioc-0.1.0/rosetta_bioc.egg-info/PKG-INFO +144 -0
- rosetta_bioc-0.1.0/rosetta_bioc.egg-info/SOURCES.txt +34 -0
- rosetta_bioc-0.1.0/rosetta_bioc.egg-info/dependency_links.txt +1 -0
- rosetta_bioc-0.1.0/rosetta_bioc.egg-info/requires.txt +10 -0
- rosetta_bioc-0.1.0/rosetta_bioc.egg-info/top_level.txt +1 -0
- rosetta_bioc-0.1.0/setup.cfg +4 -0
- rosetta_bioc-0.1.0/tests/test_bridge.py +104 -0
- rosetta_bioc-0.1.0/tests/test_clusterprofiler.py +92 -0
- rosetta_bioc-0.1.0/tests/test_deps.py +26 -0
- rosetta_bioc-0.1.0/tests/test_errors.py +25 -0
- rosetta_bioc-0.1.0/tests/test_init.py +67 -0
- rosetta_bioc-0.1.0/tests/test_phyloseq.py +57 -0
- rosetta_bioc-0.1.0/tests/test_report.py +82 -0
- rosetta_bioc-0.1.0/tests/test_seurat.py +79 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rosetta-bioc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pythonic API for R/Bioconductor statistical methods — calls validated R code, returns pandas DataFrames.
|
|
5
|
+
Author: Catherine Chi Chung
|
|
6
|
+
Author-email: John Muirhead-Gould <john@nodes.bio>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/rosetta-bioc/rosetta
|
|
9
|
+
Project-URL: Documentation, https://github.com/rosetta-bioc/rosetta#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/rosetta-bioc/rosetta
|
|
11
|
+
Project-URL: Issues, https://github.com/rosetta-bioc/rosetta/issues
|
|
12
|
+
Keywords: bioinformatics,R,bioconductor,DESeq2,edgeR,limma,rpy2,RNA-seq,differential-expression,pathway-enrichment,clusterProfiler,genomics,pandas
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: rpy2>=3.5
|
|
21
|
+
Requires-Dist: pandas>=1.5
|
|
22
|
+
Requires-Dist: numpy>=1.23
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
25
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
26
|
+
Provides-Extra: posit
|
|
27
|
+
Requires-Dist: rpy2>=3.5; extra == "posit"
|
|
28
|
+
|
|
29
|
+
# 🪨 rosetta
|
|
30
|
+
|
|
31
|
+
**Python interface to R/Bioconductor — pandas in, pandas out, `.report()` when you're done.**
|
|
32
|
+
|
|
33
|
+
[PyPI](https://pypi.org/project/rosetta-bioc/)
|
|
34
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
35
|
+
[]()
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install rosetta-bioc
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## 30-second demo
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import rosetta as rb
|
|
45
|
+
|
|
46
|
+
# DESeq2 differential expression — one call, pandas out
|
|
47
|
+
results = rb.deseq2(counts_df, metadata_df, design="~ condition")
|
|
48
|
+
results.report()
|
|
49
|
+
```
|
|
50
|
+
```
|
|
51
|
+
DESeq2 Results Summary
|
|
52
|
+
──────────────────────────────
|
|
53
|
+
Total genes tested: 12,000
|
|
54
|
+
Significant (padj<0.05): 843 (7.0%)
|
|
55
|
+
↑ Upregulated: 428
|
|
56
|
+
↓ Downregulated: 415
|
|
57
|
+
LFC range: [-4.71, 3.50]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
That's it. No R code. No rpy2 boilerplate. No type conversion. Just results.
|
|
61
|
+
|
|
62
|
+
## What it wraps
|
|
63
|
+
|
|
64
|
+
| R Package | Python | What it does |
|
|
65
|
+
|-----------|--------|--------------|
|
|
66
|
+
| DESeq2 | `rb.deseq2()` | Differential expression (negative binomial) |
|
|
67
|
+
| edgeR | `rb.edger()` | Quasi-likelihood differential expression |
|
|
68
|
+
| limma | `rb.limma_voom()` | Linear models + TREAT significance |
|
|
69
|
+
| clusterProfiler | `rb.enrich_go()` | GO/KEGG/Reactome pathway enrichment |
|
|
70
|
+
| phyloseq | `rb.phyloseq()` | Microbiome diversity analysis |
|
|
71
|
+
| Seurat | `rb.seurat()` | Single-cell RNA-seq |
|
|
72
|
+
|
|
73
|
+
All functions return a `RosettaDataFrame` (pandas DataFrame subclass) with a `.report()` method.
|
|
74
|
+
|
|
75
|
+
## Modular DESeq2 API
|
|
76
|
+
|
|
77
|
+
For more control, use the step-by-step interface:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from rosetta.wrappers.deseq2 import run_deseq2, get_results, lfc_shrink
|
|
81
|
+
|
|
82
|
+
dds = run_deseq2(counts_df, metadata_df, design="~ condition")
|
|
83
|
+
res = get_results(dds, contrast=["condition", "treated", "control"], alpha=0.05)
|
|
84
|
+
shrunk = lfc_shrink(dds, coef="condition_treated_vs_control", type="apeglm")
|
|
85
|
+
|
|
86
|
+
res.report()
|
|
87
|
+
shrunk.report()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Enrichment analysis
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import rosetta as rb
|
|
94
|
+
|
|
95
|
+
# Over-representation analysis
|
|
96
|
+
go_results = rb.enrich_go(gene_list, org_db="org.Hs.eg.db", ont="BP")
|
|
97
|
+
go_results.report()
|
|
98
|
+
|
|
99
|
+
# KEGG pathways
|
|
100
|
+
kegg = rb.enrich_kegg(gene_list, organism="hsa")
|
|
101
|
+
kegg.report()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Setup
|
|
105
|
+
|
|
106
|
+
**Python side:**
|
|
107
|
+
```bash
|
|
108
|
+
pip install rosetta-bioc
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**R side** (one-time):
|
|
112
|
+
```bash
|
|
113
|
+
Rscript install.R
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Or manually:
|
|
117
|
+
```r
|
|
118
|
+
BiocManager::install(c("DESeq2", "edgeR", "limma", "clusterProfiler", "phyloseq", "Seurat", "org.Hs.eg.db"))
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Posit Cloud:** See [docs/posit-cloud.md](docs/posit-cloud.md) for zero-config setup.
|
|
122
|
+
|
|
123
|
+
## Requirements
|
|
124
|
+
|
|
125
|
+
- Python 3.9+
|
|
126
|
+
- R 4.0+ with Bioconductor
|
|
127
|
+
- rpy2 ≥ 3.5
|
|
128
|
+
|
|
129
|
+
## Philosophy
|
|
130
|
+
|
|
131
|
+
1. **Rosetta calls R — it doesn't reimplement it.** All statistics run in the original, validated R packages.
|
|
132
|
+
2. **Pandas in, pandas out.** No R objects leak into your Python workflow.
|
|
133
|
+
3. **Fail early, fail clearly.** Input validation happens in Python before crossing the R boundary.
|
|
134
|
+
4. **`.report()` everything.** Results should be immediately interpretable without manual inspection.
|
|
135
|
+
|
|
136
|
+
## Contributing
|
|
137
|
+
|
|
138
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Good first issues are labeled — start with [Issue #1: `report()` enhancements](https://github.com/rosetta-bioc/rosetta/issues/1).
|
|
139
|
+
|
|
140
|
+
## Acknowledgments
|
|
141
|
+
|
|
142
|
+
Built on [rpy2](https://rpy2.github.io/) and the extraordinary R/Bioconductor ecosystem. All credit for the statistical methods goes to the original R package authors.
|
|
143
|
+
|
|
144
|
+
GSoC 2026 · MIT License · [Nodes Bio](https://nodes.bio)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# 🪨 rosetta
|
|
2
|
+
|
|
3
|
+
**Python interface to R/Bioconductor — pandas in, pandas out, `.report()` when you're done.**
|
|
4
|
+
|
|
5
|
+
[PyPI](https://pypi.org/project/rosetta-bioc/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[]()
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install rosetta-bioc
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## 30-second demo
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import rosetta as rb
|
|
17
|
+
|
|
18
|
+
# DESeq2 differential expression — one call, pandas out
|
|
19
|
+
results = rb.deseq2(counts_df, metadata_df, design="~ condition")
|
|
20
|
+
results.report()
|
|
21
|
+
```
|
|
22
|
+
```
|
|
23
|
+
DESeq2 Results Summary
|
|
24
|
+
──────────────────────────────
|
|
25
|
+
Total genes tested: 12,000
|
|
26
|
+
Significant (padj<0.05): 843 (7.0%)
|
|
27
|
+
↑ Upregulated: 428
|
|
28
|
+
↓ Downregulated: 415
|
|
29
|
+
LFC range: [-4.71, 3.50]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
That's it. No R code. No rpy2 boilerplate. No type conversion. Just results.
|
|
33
|
+
|
|
34
|
+
## What it wraps
|
|
35
|
+
|
|
36
|
+
| R Package | Python | What it does |
|
|
37
|
+
|-----------|--------|--------------|
|
|
38
|
+
| DESeq2 | `rb.deseq2()` | Differential expression (negative binomial) |
|
|
39
|
+
| edgeR | `rb.edger()` | Quasi-likelihood differential expression |
|
|
40
|
+
| limma | `rb.limma_voom()` | Linear models + TREAT significance |
|
|
41
|
+
| clusterProfiler | `rb.enrich_go()` | GO/KEGG/Reactome pathway enrichment |
|
|
42
|
+
| phyloseq | `rb.phyloseq()` | Microbiome diversity analysis |
|
|
43
|
+
| Seurat | `rb.seurat()` | Single-cell RNA-seq |
|
|
44
|
+
|
|
45
|
+
All functions return a `RosettaDataFrame` (pandas DataFrame subclass) with a `.report()` method.
|
|
46
|
+
|
|
47
|
+
## Modular DESeq2 API
|
|
48
|
+
|
|
49
|
+
For more control, use the step-by-step interface:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from rosetta.wrappers.deseq2 import run_deseq2, get_results, lfc_shrink
|
|
53
|
+
|
|
54
|
+
dds = run_deseq2(counts_df, metadata_df, design="~ condition")
|
|
55
|
+
res = get_results(dds, contrast=["condition", "treated", "control"], alpha=0.05)
|
|
56
|
+
shrunk = lfc_shrink(dds, coef="condition_treated_vs_control", type="apeglm")
|
|
57
|
+
|
|
58
|
+
res.report()
|
|
59
|
+
shrunk.report()
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Enrichment analysis
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import rosetta as rb
|
|
66
|
+
|
|
67
|
+
# Over-representation analysis
|
|
68
|
+
go_results = rb.enrich_go(gene_list, org_db="org.Hs.eg.db", ont="BP")
|
|
69
|
+
go_results.report()
|
|
70
|
+
|
|
71
|
+
# KEGG pathways
|
|
72
|
+
kegg = rb.enrich_kegg(gene_list, organism="hsa")
|
|
73
|
+
kegg.report()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Setup
|
|
77
|
+
|
|
78
|
+
**Python side:**
|
|
79
|
+
```bash
|
|
80
|
+
pip install rosetta-bioc
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**R side** (one-time):
|
|
84
|
+
```bash
|
|
85
|
+
Rscript install.R
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or manually:
|
|
89
|
+
```r
|
|
90
|
+
BiocManager::install(c("DESeq2", "edgeR", "limma", "clusterProfiler", "phyloseq", "Seurat", "org.Hs.eg.db"))
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Posit Cloud:** See [docs/posit-cloud.md](docs/posit-cloud.md) for zero-config setup.
|
|
94
|
+
|
|
95
|
+
## Requirements
|
|
96
|
+
|
|
97
|
+
- Python 3.9+
|
|
98
|
+
- R 4.0+ with Bioconductor
|
|
99
|
+
- rpy2 ≥ 3.5
|
|
100
|
+
|
|
101
|
+
## Philosophy
|
|
102
|
+
|
|
103
|
+
1. **Rosetta calls R — it doesn't reimplement it.** All statistics run in the original, validated R packages.
|
|
104
|
+
2. **Pandas in, pandas out.** No R objects leak into your Python workflow.
|
|
105
|
+
3. **Fail early, fail clearly.** Input validation happens in Python before crossing the R boundary.
|
|
106
|
+
4. **`.report()` everything.** Results should be immediately interpretable without manual inspection.
|
|
107
|
+
|
|
108
|
+
## Contributing
|
|
109
|
+
|
|
110
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Good first issues are labeled — start with [Issue #1: `report()` enhancements](https://github.com/rosetta-bioc/rosetta/issues/1).
|
|
111
|
+
|
|
112
|
+
## Acknowledgments
|
|
113
|
+
|
|
114
|
+
Built on [rpy2](https://rpy2.github.io/) and the extraordinary R/Bioconductor ecosystem. All credit for the statistical methods goes to the original R package authors.
|
|
115
|
+
|
|
116
|
+
GSoC 2026 · MIT License · [Nodes Bio](https://nodes.bio)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77"]  # SPDX string form `license = "MIT"` needs setuptools 77+
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rosetta-bioc"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Pythonic API for R/Bioconductor statistical methods — calls validated R code, returns pandas DataFrames."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "John Muirhead-Gould", email = "john@nodes.bio"},
|
|
14
|
+
{name = "Catherine Chi Chung"},
|
|
15
|
+
]
|
|
16
|
+
keywords = ["bioinformatics", "R", "bioconductor", "DESeq2", "edgeR", "limma", "rpy2", "RNA-seq", "differential-expression", "pathway-enrichment", "clusterProfiler", "genomics", "pandas"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Medical Science Apps.",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"rpy2>=3.5",
|
|
26
|
+
"pandas>=1.5",
|
|
27
|
+
"numpy>=1.23",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=7", "ruff>=0.5"]
|
|
32
|
+
posit = ["rpy2>=3.5"]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/rosetta-bioc/rosetta"
|
|
36
|
+
Documentation = "https://github.com/rosetta-bioc/rosetta#readme"
|
|
37
|
+
Repository = "https://github.com/rosetta-bioc/rosetta"
|
|
38
|
+
Issues = "https://github.com/rosetta-bioc/rosetta/issues"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
include = ["rosetta*"]
|
|
42
|
+
exclude = ["tests*", "examples*"]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""rosetta — Seamless Python wrappers for R bioinformatics packages."""
|
|
2
|
+
|
|
3
|
+
from ._errors import RDataError, RFormulaError, RPackageMissing
|
|
4
|
+
from .results import RosettaDataFrame
|
|
5
|
+
from .wrappers.deseq2 import deseq2
|
|
6
|
+
from .wrappers.edger import edger
|
|
7
|
+
from .wrappers.limma import limma_voom
|
|
8
|
+
from .wrappers.clusterprofiler import enrich_go, enrich_kegg, enrich_pathway, enrich_custom
|
|
9
|
+
from .wrappers.phyloseq import phyloseq, phyloseq_richness
|
|
10
|
+
from .wrappers.seurat import seurat
|
|
11
|
+
from . import pipelines
|
|
12
|
+
|
|
13
|
+
# Alias for backward compatibility: ``rosetta.enrichment`` is the same
# callable as ``enrich_go``.
# NOTE(review): ``rosetta.pipelines.enrichment`` is a different function (a
# go/kegg/reactome dispatcher) that shares this name — confirm the overlap
# is intended.
enrichment = enrich_go

# Names exported by ``from rosetta import *``.
__all__ = [
    "deseq2", "edger", "limma_voom", "enrichment",
    "enrich_go", "enrich_kegg", "enrich_pathway", "enrich_custom",
    "phyloseq", "phyloseq_richness", "seurat",
    "pipelines",
    "RosettaDataFrame",
    "RDataError", "RFormulaError", "RPackageMissing",
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""R session management and bidirectional type conversion."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import rpy2.robjects as ro
|
|
6
|
+
from rpy2.robjects import numpy2ri, pandas2ri
|
|
7
|
+
from rpy2.robjects.conversion import Converter, localconverter
|
|
8
|
+
from rpy2.robjects.packages import importr
|
|
9
|
+
|
|
10
|
+
# Conversion rules shared by every helper in this module: the numpy and
# pandas converters combined with rpy2's default converter.
# NOTE(review): the default converter is added last here — confirm the
# resulting rule precedence is the intended one.
_converter = Converter("rosetta")
_converter += numpy2ri.converter
_converter += pandas2ri.converter
_converter += ro.default_converter

# Cached handle to R's ``base`` package; filled in lazily by _get_base().
_base = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_base():
    """Return the handle to R's ``base`` package, importing it on first use.

    The imported package is stored in the module-level ``_base`` so that
    later calls reuse it instead of calling ``importr`` again.
    """
    global _base
    if _base is not None:
        return _base
    _base = importr("base")
    return _base
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def to_r_matrix(df: pd.DataFrame):
    """Turn a pandas DataFrame into an R matrix.

    The frame is first converted with the module converter and the result is
    passed through ``base::as.matrix``.

    Raises:
        RDataError: If *df* is not a pandas DataFrame.
    """
    from ._errors import RDataError

    if isinstance(df, pd.DataFrame):
        with localconverter(_converter):
            r_frame = ro.conversion.get_conversion().py2rpy(df)
            return _get_base().as_matrix(r_frame)
    raise RDataError("Expected pandas DataFrame")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def to_r_dataframe(df: pd.DataFrame):
    """Turn a pandas DataFrame into its R counterpart via the module converter.

    Raises:
        RDataError: If *df* is not a pandas DataFrame.
    """
    from ._errors import RDataError

    if isinstance(df, pd.DataFrame):
        with localconverter(_converter):
            py2rpy = ro.conversion.get_conversion().py2rpy
            return py2rpy(df)
    raise RDataError("Expected pandas DataFrame")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def to_pandas(r_obj) -> "pd.DataFrame":
    """Convert an R object to Python using the module converter.

    When the converted value is a pandas DataFrame it is wrapped in
    ``RosettaDataFrame``; any other converted value is returned unchanged.
    """
    from .results import RosettaDataFrame

    with localconverter(_converter):
        converted = ro.conversion.get_conversion().rpy2py(r_obj)
    if not isinstance(converted, pd.DataFrame):
        return converted
    return RosettaDataFrame(converted)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def to_r_df(r_obj):
    """Coerce an R object with ``base::as.data.frame`` and return the result."""
    with localconverter(_converter):
        base = _get_base()
        return base.as_data_frame(r_obj)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def r_nrow(r_obj):
    """Return ``base::nrow(r_obj)`` as a Python ``int``."""
    with localconverter(_converter):
        base = _get_base()
        row_count = base.nrow(r_obj)
    # nrow() hands back a vector; its first element is the count.
    return int(row_count[0])
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""R package detection and installation via BiocManager."""
|
|
2
|
+
|
|
3
|
+
from rpy2.robjects.conversion import localconverter
|
|
4
|
+
|
|
5
|
+
from ._bridge import _converter, _get_base
|
|
6
|
+
from ._errors import RPackageMissing
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def is_installed(package: str) -> bool:
    """Report whether ``base::requireNamespace`` can load the R package *package*."""
    with localconverter(_converter):
        base = _get_base()
        found = base.requireNamespace(package, quietly=True)
    # The R call returns a vector; its first element is the flag.
    return bool(found[0])
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def ensure_installed(package: str) -> None:
    """Verify that the R package *package* is available.

    Raises:
        RPackageMissing: If ``is_installed(package)`` is false.
    """
    if is_installed(package):
        return
    raise RPackageMissing(package)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Rosetta exception classes for R error translation."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class RosettaError(Exception):
    """Base exception for rosetta."""


class RPackageMissing(RosettaError):
    """Required R package is not installed.

    Attributes are set in ``__init__``: ``package`` holds the name of the
    missing R package, and the exception message contains an install hint.
    """

    def __init__(self, package: str):
        self.package = package
        super().__init__(f"R package '{package}' is not installed. Install with: R -e 'BiocManager::install(\"{package}\")'")

    def __reduce__(self):
        # Exceptions are rebuilt from ``self.args`` by default, which here is
        # the formatted message rather than the package name. Rebuilding from
        # the package name keeps ``.package`` and the message intact across
        # pickle/copy (e.g. when an error crosses a process boundary).
        return (type(self), (self.package,))


class RFormulaError(RosettaError):
    """Invalid R design formula."""


class RDataError(RosettaError):
    """Incompatible input data for R function."""


class RosettaSecurityError(RosettaError):
    """Exception raised for security-related issues."""
|
|
24
|
+
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""rosetta quick-start example — synthetic data, no files needed.
|
|
2
|
+
|
|
3
|
+
Run: python -m rosetta.example
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _bh_adjust(pvalues):
    """Return Benjamini-Hochberg adjusted p-values for a 1-D array.

    Each p-value is scaled by ``n / rank`` (rank taken in ascending p-value
    order), monotonicity is enforced from the largest p-value downwards, the
    result is capped at 1.0 and returned in the input order.
    """
    n = pvalues.size
    order = np.argsort(pvalues)
    scaled = pvalues[order] * n / np.arange(1, n + 1)
    # Running minimum from the largest p-value down keeps adjusted values
    # non-decreasing with rank.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    adjusted = np.empty(n, dtype=float)
    adjusted[order] = np.minimum(monotone, 1.0)
    return adjusted


def main():
    """Demonstrate rosetta with synthetic RNA-seq count data.

    Builds a random count matrix and sample metadata, fabricates a
    DESeq2-shaped result table without calling R, prints its ``.report()``
    and the top significant genes by fold change.
    """
    from .results import RosettaDataFrame

    print("🪨 rosetta — quick demo with synthetic data\n")

    # Generate fake count matrix (1000 genes × 6 samples)
    np.random.seed(42)
    n_genes, n_samples = 1000, 6
    gene_names = [f"Gene_{i:04d}" for i in range(n_genes)]
    sample_names = [f"S{i+1}" for i in range(n_samples)]

    # Base expression + condition effect for first 100 genes
    base = np.random.negative_binomial(n=5, p=0.01, size=(n_genes, n_samples))
    base[:100, 3:] += np.random.negative_binomial(n=3, p=0.01, size=(100, 3))  # upregulated in treated

    counts = pd.DataFrame(base, index=gene_names, columns=sample_names)
    metadata = pd.DataFrame(
        {"condition": ["control"] * 3 + ["treated"] * 3},
        index=sample_names,
    )

    print(f"Count matrix: {counts.shape[0]} genes × {counts.shape[1]} samples")
    print(f"Conditions: {metadata['condition'].value_counts().to_dict()}\n")

    # Simulate DESeq2-like results (without R, for demo purposes)
    pvals = np.random.uniform(0, 1, n_genes)
    pvals[:100] = np.random.uniform(0, 0.001, 100)  # truly DE genes
    lfc = np.random.normal(0, 0.5, n_genes)
    lfc[:100] = np.random.normal(2.0, 0.8, 100)

    results = RosettaDataFrame({
        "baseMean": counts.mean(axis=1).values,
        "log2FoldChange": lfc,
        "lfcSE": np.abs(np.random.normal(0.3, 0.1, n_genes)),
        "stat": lfc / 0.3,  # uses the nominal SE of 0.3, not the sampled lfcSE
        "pvalue": pvals,
        # Rank-based BH adjustment; dividing by the positional index of
        # unsorted p-values would not be a valid FDR correction.
        "padj": _bh_adjust(pvals),
    }, index=gene_names)

    print("─" * 40)
    results.report()
    print("─" * 40)

    # Show top genes
    sig = results[results["padj"] < 0.05].sort_values("log2FoldChange", ascending=False)
    print("\nTop 5 upregulated genes:")
    print(sig[["log2FoldChange", "padj"]].head().to_string())

    print("\n✓ To run with real R packages:")
    print("  results = rb.deseq2(counts, metadata, design='~ condition')")
    print("  results.report()")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""rosetta.pipelines — Complete analysis workflows in one call.
|
|
2
|
+
|
|
3
|
+
These are the "I just want results" functions. Each one runs the full
|
|
4
|
+
statistical pipeline and returns a RosettaDataFrame with .report().
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from .results import RosettaDataFrame
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _pick_shrinkage_coef(coefs, contrast):
    """Choose the coefficient name to hand to ``lfc_shrink``.

    Prefers ``"<factor>_<numerator>_vs_<denominator>"`` built from a
    three-element *contrast* when that name is among *coefs*; otherwise
    falls back to the last entry of *coefs*. Returns ``None`` when *coefs*
    is empty.
    """
    if not coefs:
        return None
    if contrast is not None and len(contrast) == 3:
        factor, numerator, denominator = contrast
        wanted = f"{factor}_{numerator}_vs_{denominator}"
        if wanted in coefs:
            return wanted
        import warnings

        warnings.warn(
            f"Coefficient '{wanted}' not found in {coefs}; "
            f"shrinking '{coefs[-1]}' instead.",
            stacklevel=3,
        )
    # Use last coefficient (typically the treatment effect)
    return coefs[-1]


def diff_expr(
    counts: pd.DataFrame,
    metadata: pd.DataFrame,
    design: str = "~ condition",
    method: str = "deseq2",
    alpha: float = 0.05,
    lfc_threshold: float = 0.0,
    # Quoted so the ``X | None`` syntax is not evaluated at definition time;
    # the package declares support for Python 3.9, where it would raise.
    shrinkage: "str | None" = None,
    contrast: "list | None" = None,
) -> RosettaDataFrame:
    """Run differential expression — full pipeline, one call.

    Args:
        counts: Gene count matrix (genes × samples), raw integers.
        metadata: Sample metadata with index matching count columns.
        design: R formula string (e.g. "~ condition", "~ batch + treatment").
        method: One of "deseq2", "edger", "limma".
        alpha: FDR significance threshold. Only forwarded to the DESeq2
            results call; the edgeR and limma calls do not receive it.
        lfc_threshold: Minimum absolute log2 fold change. Forwarded to the
            DESeq2 results call and to edgeR (as ``lfc``); not passed to limma.
        shrinkage: For DESeq2: "apeglm", "ashr", or "normal". None = no shrinkage.
            When set and a coefficient is available, the shrunken table is
            returned and ``alpha`` / ``lfc_threshold`` are not applied.
        contrast: For DESeq2: [factor, numerator, denominator]. With
            ``shrinkage``, selects the coefficient
            ``"<factor>_<numerator>_vs_<denominator>"`` when the fitted
            model has it; otherwise the last coefficient is used and a
            warning is issued.

    Returns:
        RosettaDataFrame with .report() method.

    Raises:
        ValueError: If *method* is not one of the supported names.

    Example:
        >>> results = rb.pipelines.diff_expr(counts, meta, method="deseq2")
        >>> results.report()
        >>> sig_genes = results[results["padj"] < 0.05]
    """
    if method == "deseq2":
        from .wrappers.deseq2 import run_deseq2, get_results, lfc_shrink

        dds = run_deseq2(counts, metadata, design)

        if shrinkage:
            # Need coefficient name for shrinkage
            from rpy2.robjects.packages import importr
            from ._bridge import _converter
            from rpy2.robjects.conversion import localconverter
            deseq2_pkg = importr("DESeq2")
            with localconverter(_converter):
                coefs = [str(name) for name in deseq2_pkg.resultsNames(dds)]
            coef = _pick_shrinkage_coef(coefs, contrast)
            if coef:
                return lfc_shrink(dds, coef=coef, type=shrinkage)

        # No shrinkage requested, or no coefficient was available.
        return get_results(dds, contrast=contrast, lfc_threshold=lfc_threshold, alpha=alpha)

    elif method == "edger":
        from .wrappers.edger import edger
        return edger(counts, metadata, design, lfc=lfc_threshold)

    elif method == "limma":
        from .wrappers.limma import limma_voom
        return limma_voom(counts, metadata, design)

    else:
        raise ValueError(f"Unknown method '{method}'. Use 'deseq2', 'edger', or 'limma'.")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def enrichment(
    gene_list: list[str],
    method: str = "go",
    organism: str = "hsa",
    org_db: str = "org.Hs.eg.db",
    ont: str = "BP",
    **kwargs,
) -> RosettaDataFrame:
    """Dispatch a pathway-enrichment run to the matching clusterProfiler wrapper.

    Args:
        gene_list: List of gene IDs (Entrez by default).
        method: Which analysis to run: "go", "kegg", or "reactome".
        organism: KEGG organism code, used only for "kegg" (default "hsa").
        org_db: OrgDb name, used only for "go" (default "org.Hs.eg.db").
        ont: GO ontology, used only for "go" — "BP", "MF", or "CC".
        **kwargs: Passed through unchanged to the selected wrapper.

    Returns:
        RosettaDataFrame with .report() method.

    Raises:
        ValueError: If *method* is not one of the supported names.

    Example:
        >>> results = rb.pipelines.enrichment(sig_genes, method="kegg")
        >>> results.report()
    """
    if method == "go":
        from .wrappers.clusterprofiler import enrich_go

        # NOTE(review): the README calls enrich_go with ``org_db=...`` while
        # this passes the same value as ``organism=`` — confirm against the
        # wrapper's signature.
        return enrich_go(gene_list, organism=org_db, ont=ont, **kwargs)

    if method == "kegg":
        from .wrappers.clusterprofiler import enrich_kegg

        return enrich_kegg(gene_list, organism=organism, **kwargs)

    if method == "reactome":
        from .wrappers.clusterprofiler import enrich_pathway

        return enrich_pathway(gene_list, **kwargs)

    raise ValueError(f"Unknown method '{method}'. Use 'go', 'kegg', or 'reactome'.")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def compare(
    counts: pd.DataFrame,
    metadata: pd.DataFrame,
    design: str = "~ condition",
    # Quoted so the ``X | None`` syntax is not evaluated at definition time;
    # the package declares support for Python 3.9, where it would raise.
    methods: "list[str] | None" = None,
    alpha: float = 0.05,
) -> RosettaDataFrame:
    """Run multiple DE methods and return a comparison summary.

    This is the "which genes do all methods agree on?" function. Each
    method is run through ``diff_expr``; a method that raises, or whose
    result has no recognised adjusted p-value column, is reported on stdout
    and left out of the table.

    Args:
        counts: Gene count matrix (genes × samples).
        metadata: Sample metadata.
        design: R formula string.
        methods: List of methods to compare. Default: ["deseq2", "edger", "limma"].
        alpha: FDR significance threshold.

    Returns:
        RosettaDataFrame with one boolean significance column per method that
        produced a call, plus an 'n_methods' column (row-wise sum), sorted by
        'n_methods' in descending order.

    Raises:
        RuntimeError: If no method produced a significance call.

    Example:
        >>> consensus = rb.pipelines.compare(counts, meta)
        >>> robust_genes = consensus[consensus["n_methods"] == 3]
    """
    if methods is None:
        methods = ["deseq2", "edger", "limma"]

    # Adjusted p-value column names, checked in this order.
    padj_columns = ("padj", "FDR", "adj.P.Val")

    results = {}
    for method in methods:
        # Best-effort by design: one failing method must not abort the others.
        try:
            res = diff_expr(counts, metadata, design, method=method, alpha=alpha)
            column = next((name for name in padj_columns if name in res.columns), None)
            if column is None:
                print(f"  ⚠ {method} skipped: no adjusted p-value column in results")
                continue
            results[method] = res[column] < alpha
        except Exception as e:
            print(f"  ⚠ {method} failed: {e}")
            continue

    if not results:
        raise RuntimeError("All methods failed")

    comparison = pd.DataFrame(results)
    comparison["n_methods"] = comparison.sum(axis=1)
    comparison = comparison.sort_values("n_methods", ascending=False)
    return RosettaDataFrame(comparison)
|