scatrans 0.7.0.dev0__tar.gz → 0.8.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/.github/workflows/ci.yml +2 -0
- scatrans-0.8.0.dev0/.github/workflows/publish.yml +74 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/CHANGELOG.md +25 -0
- scatrans-0.8.0.dev0/MANIFEST.in +15 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/PKG-INFO +109 -144
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/README.md +108 -143
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/pyproject.toml +1 -1
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/__init__.py +13 -2
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_version.py +3 -3
- scatrans-0.8.0.dev0/src/scatrans/data/Hs_GO_Biological_Process_2026.txt +14209 -0
- scatrans-0.8.0.dev0/src/scatrans/data/Hs_KEGG_2026.txt +223 -0
- scatrans-0.8.0.dev0/src/scatrans/data/Mm_GO_Biological_Process_2026.txt +14957 -0
- scatrans-0.8.0.dev0/src/scatrans/data/Mm_KEGG_2026.txt +219 -0
- scatrans-0.8.0.dev0/src/scatrans/enrich.py +1449 -0
- scatrans-0.8.0.dev0/src/scatrans/pl.py +1754 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/tl.py +483 -225
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/PKG-INFO +109 -144
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/SOURCES.txt +6 -1
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/tests/test_basic.py +398 -0
- scatrans-0.7.0.dev0/.github/workflows/pypi-publish.yml +0 -28
- scatrans-0.7.0.dev0/src/scatrans/data/Hs_KEGG_2026.txt +0 -223
- scatrans-0.7.0.dev0/src/scatrans/data/Mm_KEGG_2026.txt +0 -219
- scatrans-0.7.0.dev0/src/scatrans/enrich.py +0 -735
- scatrans-0.7.0.dev0/src/scatrans/pl.py +0 -1168
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/.gitignore +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/LICENSE +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/memento_de_example.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/real_data_template.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/synthetic_active_transcription.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/setup.cfg +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_bias.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_de.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_permutation.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_utils.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_velocity.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/Mus_musculus.GRCm39.115_gene_features.parquet +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/README.md +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/mouse_2020A_gene_features.parquet +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/generate_gene_features.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/pp_bias.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/qc.py +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/dependency_links.txt +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/entry_points.txt +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/requires.txt +0 -0
- {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
release:
|
|
8
|
+
types: [published]
|
|
9
|
+
workflow_dispatch:
|
|
10
|
+
inputs:
|
|
11
|
+
version:
|
|
12
|
+
description: "Force a specific version (SETUPTOOLS_SCM_PRETEND_VERSION). Useful for dev releases when not on a tag."
|
|
13
|
+
required: false
|
|
14
|
+
default: ""
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
build:
|
|
18
|
+
name: Build distribution 📦
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
with:
|
|
23
|
+
fetch-depth: 0 # Critical for setuptools_scm to detect tags and produce correct version
|
|
24
|
+
|
|
25
|
+
- name: Set up Python
|
|
26
|
+
uses: actions/setup-python@v5
|
|
27
|
+
with:
|
|
28
|
+
python-version: "3.11"
|
|
29
|
+
|
|
30
|
+
- name: Install build tools
|
|
31
|
+
run: python -m pip install --upgrade build
|
|
32
|
+
|
|
33
|
+
- name: Build source and wheel distributions
|
|
34
|
+
run: |
|
|
35
|
+
if [ -n "${{ github.event.inputs.version }}" ]; then
|
|
36
|
+
echo "Using forced version: ${{ github.event.inputs.version }}"
|
|
37
|
+
SETUPTOOLS_SCM_PRETEND_VERSION="${{ github.event.inputs.version }}" python -m build
|
|
38
|
+
else
|
|
39
|
+
python -m build
|
|
40
|
+
fi
|
|
41
|
+
|
|
42
|
+
- name: Upload distribution artifacts
|
|
43
|
+
uses: actions/upload-artifact@v4
|
|
44
|
+
with:
|
|
45
|
+
name: python-package-distributions
|
|
46
|
+
path: dist/
|
|
47
|
+
|
|
48
|
+
publish:
|
|
49
|
+
name: Publish to PyPI
|
|
50
|
+
needs: build
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
|
|
53
|
+
# Required for Trusted Publishing (OIDC) - no API token secret needed
|
|
54
|
+
permissions:
|
|
55
|
+
id-token: write
|
|
56
|
+
|
|
57
|
+
# Recommended: tie to a protected GitHub Environment (create "pypi" environment in repo settings)
|
|
58
|
+
# You can add required reviewers or branch restrictions in the environment settings.
|
|
59
|
+
# environment:
|
|
60
|
+
# name: pypi
|
|
61
|
+
|
|
62
|
+
steps:
|
|
63
|
+
- name: Download all dists
|
|
64
|
+
uses: actions/download-artifact@v4
|
|
65
|
+
with:
|
|
66
|
+
name: python-package-distributions
|
|
67
|
+
path: dist/
|
|
68
|
+
|
|
69
|
+
- name: Publish distribution 📦 to PyPI
|
|
70
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
71
|
+
# For publishing to TestPyPI instead (for testing the workflow):
|
|
72
|
+
# with:
|
|
73
|
+
# repository-url: https://test.pypi.org/legacy/
|
|
74
|
+
# verbose: true
|
|
@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.8.0] - 2026-06-14
|
|
9
|
+
|
|
10
|
+
### Added (enrichment module — major paper-readiness upgrade)
|
|
11
|
+
- `run_go(ontology="BP"|"CC"|"MF"|"ALL", ...)` — direct wrapper analogous to clusterProfiler `enrichGO`. Supports `adjust_across_all=True` for a single BH correction across all GO terms when using "ALL".
|
|
12
|
+
- `save_enrichment_report(res, prefix=..., save_excel=True, save_csv=True, save_tsv=True, save_metadata=True, save_term_gene_table=True)` — one-call export of main table, term-gene long table (via `expand_enrichment_genes`), and rich `metadata.json` + xlsx sheet. Auto-creates parent directories. List columns (e.g. `Genes_list`) are sanitized to `;` strings for clean export.
|
|
13
|
+
- `expand_enrichment_genes(res)` — expands the `Genes` (semicolon) column into a long-format Term–Gene table (one row per gene). Preserves `Ontology` column when input came from `run_go(..., "ALL")`.
|
|
14
|
+
- Rich provenance in every result `.attrs` (success and empty):
|
|
15
|
+
- `analysis_info`: package, version, timestamp, module
|
|
16
|
+
- `gene_set_info`: `requested`/`resolved`, `requested_source` vs `actual_source` ("bundled", "gseapy", "gmt", "dict"), `library_name`, `n_terms`, `n_unique_genes`
|
|
17
|
+
- `universe_info`: full details of background handling (provided size, restricted, dropped_by_annotation, force_universe, mapping counts)
|
|
18
|
+
- Empty results now carry `reason` ("gene_list_empty", "universe_empty", "no_term_overlap_after_filters", ...) + the above fields so users can diagnose why nothing came back.
|
|
19
|
+
- New `run_enrichment` / `run_kegg` / `run_go` parameters: `padj_cutoff` (preferred modern name), `include_gene_list` (adds `Genes_list` python-list column), `adjust_across_all`.
|
|
20
|
+
- `list_bundled_gene_sets()` now clearly documents the 2026 organism-specific defaults.
|
|
21
|
+
- Improved low-mapping-rate warning (includes input examples + gene-set examples).
|
|
22
|
+
- `background` is now a documented deprecated alias of `universe`; passing both raises immediately.
|
|
23
|
+
- All empty-result DataFrames preserve consistent columns (including optional `Genes_list` when requested) and full diagnostic attrs.
|
|
24
|
+
|
|
25
|
+
### Changed / Improved
|
|
26
|
+
- `_load_gene_sets` now returns `(term_to_genes, term_to_desc, load_info)` so `actual_source` is always recorded accurately (even on gseapy fallback after bundled attempt).
|
|
27
|
+
- `run_kegg` fully synchronized with new parameters (`padj_cutoff`, `include_gene_list`, etc.).
|
|
28
|
+
- `enrich_dotplot` (pl.py) and various tl.py flows updated for new columns/attrs.
|
|
29
|
+
- Version unified to 0.8.0 for this release.
|
|
30
|
+
- README and docstrings extensively updated with manuscript-export examples, `run_go`, provenance details, and `adjust_across_all` guidance.
|
|
31
|
+
- Full test coverage for new paths (per-ontology attrs, within_ontology p.adjust, save+tsv+dir creation, expand with Ontology, dual-cutoff warning, etc.). All tests pass.
|
|
32
|
+
|
|
8
33
|
## [Unreleased]
|
|
9
34
|
|
|
10
35
|
### Added
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Control what goes into the source distribution (sdist).
|
|
2
|
+
# The wheel only contains the runtime package (src/scatrans + data).
|
|
3
|
+
|
|
4
|
+
# Standard important files
|
|
5
|
+
include LICENSE
|
|
6
|
+
include README.md
|
|
7
|
+
include CHANGELOG.md
|
|
8
|
+
include pyproject.toml
|
|
9
|
+
|
|
10
|
+
# Include the GitHub Actions workflows (requested)
|
|
11
|
+
include .github/workflows/ci.yml
|
|
12
|
+
include .github/workflows/publish.yml
|
|
13
|
+
|
|
14
|
+
# If more workflows are added in the future, this will catch them:
|
|
15
|
+
include .github/workflows/*.yml
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scatrans
|
|
3
|
-
Version: 0.7.0.dev0
|
|
3
|
+
Version: 0.8.0.dev0
|
|
4
4
|
Summary: Single-cell Active Transcription Analysis
|
|
5
5
|
Author: scATrans Developers
|
|
6
6
|
License: MIT
|
|
@@ -42,9 +42,9 @@ Dynamic: license-file
|
|
|
42
42
|
|
|
43
43
|
# scATrans
|
|
44
44
|
|
|
45
|
-
scATrans computes a composite score
|
|
45
|
+
scATrans computes a composite score from differential expression and reference-based excess unspliced (nascent) RNA between groups. It ranks genes in single-cell spliced/unspliced or mature/nascent data.
|
|
46
46
|
|
|
47
|
-
|
|
47
|
+
Results must be interpreted using the provided diagnostics. The method has known limitations and does not guarantee recovery of truly active genes.
|
|
48
48
|
|
|
49
49
|
## Installation
|
|
50
50
|
|
|
@@ -88,109 +88,60 @@ ufrac = scat.qc.unspliced_global(adata) # logs INFO + WARNING if > 50%
|
|
|
88
88
|
|
|
89
89
|
`active_score` automatically runs this check and records the value in diagnostics.
|
|
90
90
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
scATrans (especially the Memento backend and velocity/active-transcription calculations) works best when you still have access to the original raw counts and the original spliced/unspliced (or mature/nascent) matrices on as many genes as possible.
|
|
94
|
-
|
|
95
|
-
Call this **early** (right after loading + basic QC, before any HVG, normalize or log1p):
|
|
91
|
+
## Quick Start (Minimal Default Flow)
|
|
96
92
|
|
|
97
93
|
```python
|
|
94
|
+
import scanpy as sc
|
|
98
95
|
import scatrans as scat
|
|
99
96
|
|
|
100
|
-
|
|
97
|
+
adata = sc.read_h5ad("your_data.h5ad")
|
|
98
|
+
|
|
99
|
+
# Preserve original counts and spliced/unspliced layers before HVG or normalization.
|
|
101
100
|
scat.store_raw_counts(adata, layer="counts", save_raw=False)
|
|
102
101
|
|
|
103
|
-
# Now you can safely do the usual Scanpy preprocessing for visualization
|
|
104
102
|
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
|
|
105
|
-
# ...
|
|
106
|
-
```
|
|
103
|
+
# ... normalize, log1p, neighbors, UMAP, clustering ...
|
|
107
104
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
- If your adata contains `"spliced"` / `"unspliced"` (or `"mature"` / `"nascent"`) layers, it also saves them under `raw_spliced`, `raw_unspliced` etc. These preserved layers survive later HVG subsetting of the main object.
|
|
111
|
-
- `save_raw=False` is now the default (we do **not** automatically set `adata.raw` unless you explicitly ask for it with `save_raw=True`).
|
|
105
|
+
# Optional: attach bundled gene features for bias correction.
|
|
106
|
+
adata = scat.add_gene_features(adata)
|
|
112
107
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
108
|
+
adata_res, significant, all_results = scat.active_score(
|
|
109
|
+
adata_input=adata,
|
|
110
|
+
groupby="condition",
|
|
111
|
+
target_group="Disease",
|
|
112
|
+
reference_group="Control",
|
|
113
|
+
)
|
|
117
114
|
|
|
118
|
-
|
|
115
|
+
print(all_results.head())
|
|
116
|
+
```
|
|
119
117
|
|
|
120
|
-
|
|
118
|
+
Default parameters require no choices for bias correction, effective gamma, or mixed models. Pseudobulk mode and DE method (`de_method`) are configurable options. The built-in `significant` list is strict and often small or empty; use the full ranked table in `all_results`.
|
|
121
119
|
|
|
122
|
-
|
|
120
|
+
### Preserving raw counts and layers
|
|
123
121
|
|
|
124
|
-
|
|
125
|
-
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
|
|
126
|
-
adata = adata[:, adata.var.highly_variable].copy()
|
|
127
|
-
```
|
|
122
|
+
Call `store_raw_counts` immediately after loading and basic QC, before HVG selection or normalization. It preserves the full raw counts and the original spliced/unspliced (or mature/nascent) layers.
|
|
128
123
|
|
|
129
|
-
|
|
124
|
+
`store_raw_counts` writes the current `.X` to `layers["counts"]` and copies spliced/unspliced layers to `raw_spliced` / `raw_unspliced`. These layers are carried along through HVG subsetting (restricted to the retained genes). The default `save_raw=False` avoids setting `adata.raw`.
|
|
130
125
|
|
|
131
|
-
-
|
|
132
|
-
- This is standard AnnData behavior and is usually **desired**, because velocity calculations (gamma estimation, unspliced excess, active_score) require the same gene set as the main expression matrix.
|
|
133
|
-
- If you want to use HVGs only for **visualization/clustering**, but use more genes (the full post-QC gene set or a large collection) for **differential analysis (especially Memento)**, the recommended workflow is:
|
|
134
|
-
1. Immediately after loading + basic QC, call `scat.store_raw_counts(adata)` (preserves the full/large gene raw counts into the layer + .raw at that time).
|
|
135
|
-
2. Make a copy for HVG + visualization: `adata_viz = adata.copy(); ... HVG on adata_viz ...`
|
|
136
|
-
3. For DE, use the **original adata** (or the restored version), at which point it can still retrieve the corresponding raw counts from the layer (the number of genes depends on how many genes the adata had when you called store).
|
|
137
|
-
4. If you have already performed HVG subset on the main adata, the layer will also only contain raw counts for those HVGs. In that case DE can only be performed on these genes (consistent with the principle of "user performs filtering before store").
|
|
126
|
+
After HVG-based visualization on a copy, restore or use the preserved layers for full-gene DE, active scoring, or enrichment (pass `adata=` to `run_enrichment` or `run_kegg` to use the stored gene list as background).
|
|
138
127
|
|
|
139
|
-
|
|
128
|
+
HVG subsetting also subsets the saved layers. This keeps velocity calculations consistent with `.X`. To analyze more genes than the HVG set, call `store_raw_counts` before subsetting and run HVG selection on a copy, keeping the unfiltered object for DE and enrichment steps.
|
|
140
129
|
|
|
141
|
-
|
|
130
|
+
To restore raw counts into `.X` for the current gene set:
|
|
142
131
|
|
|
143
132
|
```python
|
|
144
|
-
# Restore raw counts into .X (non-destructive by default)
|
|
145
133
|
adata_raw = scat.restore_raw_counts(adata, layer="counts", inplace=False)
|
|
146
|
-
# or inplace=True to modify the current adata
|
|
147
134
|
```
|
|
148
135
|
|
|
149
|
-
See
|
|
136
|
+
See the standalone differential expression section for the no-velocity use case.
|
|
150
137
|
|
|
151
138
|
---
|
|
152
139
|
|
|
153
|
-
## Core
|
|
154
|
-
|
|
155
|
-
scATrans helps users extract **condition-wise nascent RNA relative excess** signals (a lightweight proxy for differential active transcription) from single-cell velocity-style data.
|
|
156
|
-
|
|
157
|
-
- **Basic pipeline (on by default):** DE + unspliced excess after reference gamma correction + optional light bias correction for length/intron number + composite scoring + gene filtering + enrichment + plotting.
|
|
158
|
-
- **Advanced options are opt-in:** They are powerful but add complexity and information overload. New users should start with defaults.
|
|
159
|
-
- **Honest by design:** The default `significant` list is deliberately strict (often empty or very small on real data). The primary deliverable is the full ranked table (`all_results`). Diagnostics are always provided so you can judge whether the signals are trustworthy in your data.
|
|
160
|
-
|
|
161
|
-
---
|
|
162
|
-
|
|
163
|
-
## Quick Start (Minimal Default Flow) — Recommended
|
|
164
|
-
|
|
165
|
-
```python
|
|
166
|
-
import scanpy as sc
|
|
167
|
-
import scatrans as scat
|
|
168
|
-
|
|
169
|
-
# 1. Load data that contains spliced/unspliced or mature/nascent layers
|
|
170
|
-
adata = sc.read_h5ad("your_data.h5ad")
|
|
171
|
-
|
|
172
|
-
# 2. (Optional but recommended) Attach gene features for bias correction
|
|
173
|
-
# Uses the bundled mouse table by default.
|
|
174
|
-
adata = scat.add_gene_features(adata)
|
|
175
|
-
|
|
176
|
-
# 3. Run the analysis with default parameters — no need to worry about
|
|
177
|
-
# bias_correction, effective_gamma, mixed models, etc.
|
|
178
|
-
adata_res, significant, all_results = scat.active_score(
|
|
179
|
-
adata_input=adata,
|
|
180
|
-
groupby="condition",
|
|
181
|
-
target_group="Disease",
|
|
182
|
-
reference_group="Control",
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
# 4. The most important output for almost everyone is all_results (full ranked table)
|
|
186
|
-
print(all_results.head())
|
|
187
|
-
```
|
|
188
|
-
|
|
189
|
-
**Key point:** The default settings run a basic analysis without requiring decisions about `bias_correction`, `effective_gamma`, `use_mixed_model`, or `use_permutation`.
|
|
140
|
+
## Core Workflow
|
|
190
141
|
|
|
191
|
-
|
|
142
|
+
The default path performs differential expression, reference-gamma unspliced excess, optional length/intron bias correction, composite scoring, gene filtering, enrichment, and plotting.
|
|
192
143
|
|
|
193
|
-
The
|
|
144
|
+
Advanced options are disabled by default. The internal `significant` list applies strict thresholds and is frequently empty or small. Use the complete ranked table in `all_results` and apply custom filters. Diagnostics are stored in `adata_res.uns["scatrans"]["diagnostics"]`.
|
|
194
145
|
|
|
195
146
|
---
|
|
196
147
|
|
|
@@ -208,16 +159,11 @@ adata_res, significant, all_results = scat.active_score(
|
|
|
208
159
|
)
|
|
209
160
|
```
|
|
210
161
|
|
|
211
|
-
This
|
|
212
|
-
- Differential expression between the two groups
|
|
213
|
-
- Velocity delta (nascent excess) using a reference-group gamma
|
|
214
|
-
- Light Huber bias correction on gene length + intron number (default)
|
|
215
|
-
- Composite active_score (0–100)
|
|
216
|
-
- Rich diagnostics written to `adata_res.uns["scatrans"]["diagnostics"]`
|
|
162
|
+
This computes differential expression, reference-group gamma excess for the unspliced layer, optional Huber bias correction on gene length and intron number, a composite active score, and stores diagnostics in `adata_res.uns["scatrans"]["diagnostics"]`.
|
|
217
163
|
|
|
218
164
|
### 3.1.1 Common basic switches: pseudobulk and DE test method
|
|
219
165
|
|
|
220
|
-
These
|
|
166
|
+
These are standard options available for most analyses.
|
|
221
167
|
|
|
222
168
|
**Pseudobulk mode** (recommended when you have multiple biological replicates per condition):
|
|
223
169
|
|
|
@@ -258,7 +204,7 @@ The `filter_active_genes` helper has a `preset="pseudobulk"` that applies more l
|
|
|
258
204
|
|
|
259
205
|
### 3.2 Gene filtering with filter_active_genes (core output tool)
|
|
260
206
|
|
|
261
|
-
|
|
207
|
+
The internal `significant` list is strict. Most users filter the full table returned in `all_results` with `filter_active_genes`.
|
|
262
208
|
|
|
263
209
|
```python
|
|
264
210
|
# Start permissive, then tighten based on your data
|
|
@@ -314,14 +260,7 @@ kegg_res = scat.run_kegg(
|
|
|
314
260
|
|
|
315
261
|
### Default: use the package's bundled gene sets (clearest logic)
|
|
316
262
|
|
|
317
|
-
The package
|
|
318
|
-
|
|
319
|
-
- `Hs_GO_Biological_Process_2026.txt` + `Hs_KEGG_2026.txt` for human
|
|
320
|
-
- `Mm_GO_Biological_Process_2026.txt` + `Mm_KEGG_2026.txt` for mouse
|
|
321
|
-
|
|
322
|
-
You only need to specify `organism=` (for KEGG especially). Base names like "GO_Biological_Process", "KEGG", "GO_BP" are automatically resolved to the correct organism + 2026 built-in file.
|
|
323
|
-
|
|
324
|
-
If you want a specific historical Enrichr version (e.g. GO_Biological_Process_2023), just write the full name — it will be treated as an Enrichr request.
|
|
263
|
+
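The package defaults to organism-specific bundled sets (`Hs_GO_Biological_Process_2026.txt`, `Hs_KEGG_2026.txt`, and the corresponding mouse files). Specify `organism=` when using base names such as `KEGG`, `GO_Biological_Process`, or `GO_BP`; they are resolved automatically to the matching organism-specific 2026 bundled file. Historical Enrichr names (e.g., `GO_Biological_Process_2023`) are passed through as Enrichr requests when supplied explicitly.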
|
|
325
264
|
|
|
326
265
|
```python
|
|
327
266
|
# KEGG — just specify organism, gets the correct built-in (Hs/Mm_2026) automatically
|
|
@@ -391,6 +330,61 @@ simplified = scat.simplify_enrichment(
|
|
|
391
330
|
|
|
392
331
|
`run_kegg` and `simplify_enrichment` are convenience wrappers around the core `run_enrichment` function.
|
|
393
332
|
|
|
333
|
+
### run_go (GO enrichment, clusterProfiler-style)
|
|
334
|
+
|
|
335
|
+
```python
|
|
336
|
+
# Biological Process (defaults to the bundled Mm/Hs_GO_Biological_Process_2026)
|
|
337
|
+
go_bp = scat.run_go(
|
|
338
|
+
gene_list=markers,
|
|
339
|
+
ontology="BP", # "BP", "CC", "MF", or "ALL"
|
|
340
|
+
organism="mouse",
|
|
341
|
+
adata=adata, # recommended for correct universe
|
|
342
|
+
return_all=True,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
# ALL three ontologies + unified multiple-testing correction across them
|
|
346
|
+
go_all = scat.run_go(
|
|
347
|
+
markers, ontology="ALL", organism="mouse",
|
|
348
|
+
return_all=True,
|
|
349
|
+
adjust_across_all=True, # re-compute BH on all terms together (stricter)
|
|
350
|
+
)
|
|
351
|
+
# go_all.attrs["per_ontology_attrs"] contains full diagnostics for BP/CC/MF separately
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
`run_go` automatically resolves to the organism-specific bundled sets when possible (BP is bundled; CC/MF fall back to gseapy/Enrichr if the library is installed).
|
|
355
|
+
|
|
356
|
+
### Exporting results for manuscripts / supplementary materials
|
|
357
|
+
|
|
358
|
+
The new helpers make it trivial to produce clean, reproducible tables:
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
res = scat.run_kegg(genes, organism="mouse", return_all=True, include_gene_list=True)
|
|
362
|
+
|
|
363
|
+
saved = scat.save_enrichment_report(
|
|
364
|
+
res,
|
|
365
|
+
prefix="cluster1_kegg", # or "results/suppl/my_enrich" (directories created automatically)
|
|
366
|
+
save_excel=True,
|
|
367
|
+
save_csv=True,
|
|
368
|
+
save_tsv=True, # often preferred for gene symbols + Excel locale safety
|
|
369
|
+
save_metadata=True,
|
|
370
|
+
save_term_gene_table=True,
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
# saved -> {'results_csv': ..., 'results_tsv': ..., 'term_gene_table_csv': ..., 'metadata_json': ..., 'results_xlsx': ...}
|
|
374
|
+
|
|
375
|
+
# Long-format term–gene table (one row per gene; perfect for networks, follow-up stats, etc.)
|
|
376
|
+
long_table = scat.expand_enrichment_genes(res)
|
|
377
|
+
# If the input was from run_go(ontology="ALL"), long_table will have an "Ontology" column first.
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
`save_enrichment_report` also writes a rich `metadata.json` (and a "metadata" sheet in the xlsx) containing:
|
|
381
|
+
- `analysis_info` (package, version, timestamp)
|
|
382
|
+
- `gene_set_info` (requested/resolved + `requested_source` vs `actual_source`: "bundled", "gseapy", "gmt", "dict")
|
|
383
|
+
- `universe_info` (effective N, dropped genes, restrict behavior, etc.)
|
|
384
|
+
- Full `.attrs` from the enrichment call (including per-ontology details for GO ALL)
|
|
385
|
+
|
|
386
|
+
All empty results still carry diagnostic `.attrs` (`reason`, `gene_set_info`, `universe_info`, etc.) so you never lose information when a call returns no terms.
|
|
387
|
+
|
|
394
388
|
### 3.4 Visualization
|
|
395
389
|
|
|
396
390
|
```python
|
|
@@ -501,28 +495,11 @@ This function looks for common gene list columns (`Genes`, `Lead_genes`, etc.) a
|
|
|
501
495
|
|
|
502
496
|
---
|
|
503
497
|
|
|
504
|
-
## Result Interpretation
|
|
505
|
-
|
|
506
|
-
### Default `significant` is often empty or very small — this is normal
|
|
507
|
-
|
|
508
|
-
The internal significance mask is a strict conjunction:
|
|
509
|
-
- `p_adj < pval_cutoff`
|
|
510
|
-
- `logFC > logfc_cutoff`
|
|
511
|
-
- `velocity_residual > 0`
|
|
512
|
-
- sufficient expression
|
|
513
|
-
- `active_score > 0`
|
|
514
|
-
- (if `use_permutation`) `active_score_fdr < active_fdr_cutoff`
|
|
515
|
-
- (if `use_delta_variance_pval`) `delta_var_pval < cutoff`
|
|
516
|
-
|
|
517
|
-
On real data this frequently returns 0–few genes. **Use `all_results`** and apply your own biologically motivated filters.
|
|
498
|
+
## Result Interpretation
|
|
518
499
|
|
|
519
|
-
|
|
500
|
+
The internal significance mask applies a strict conjunction of thresholds. On real data it often returns zero or few genes. Use the full table in `all_results`, which is sorted by `active_score` descending and retains every gene that passed initial expression filters.
|
|
520
501
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
### Diagnostics (always inspect these)
|
|
524
|
-
|
|
525
|
-
After every run look at:
|
|
502
|
+
After each run inspect the diagnostics:
|
|
526
503
|
|
|
527
504
|
```python
|
|
528
505
|
meta = adata_res.uns["scatrans"]
|
|
@@ -531,25 +508,23 @@ print(meta["diagnostics"]["bias_correction"])
|
|
|
531
508
|
print(meta.get("permutation_approximation_note"))
|
|
532
509
|
```
|
|
533
510
|
|
|
534
|
-
|
|
535
|
-
- **bias_correction**: number of genes used for the fit, coefficients, whether median fallback was used.
|
|
536
|
-
- **permutation_approximation_note**: only present when `use_permutation=True`. Records that velocity layers/gamma were fixed for speed.
|
|
511
|
+
Global unspliced fractions above ~50% frequently indicate technical issues. Bias-correction diagnostics report the number of genes used and any fallback behavior. The permutation note records that velocity layers and the reference gamma were fixed for speed.
|
|
537
512
|
|
|
538
513
|
---
|
|
539
514
|
|
|
540
|
-
## Optional Advanced Features
|
|
515
|
+
## Optional Advanced Features
|
|
541
516
|
|
|
542
|
-
The following
|
|
517
|
+
The following flags are disabled by default and should be enabled only when required by the experimental design:
|
|
543
518
|
|
|
544
|
-
- `use_permutation=True
|
|
545
|
-
- `bias_correction="none"
|
|
546
|
-
- `show_effective_gamma=True
|
|
547
|
-
- `use_mixed_model=True
|
|
548
|
-
- `prioritize_velocity=True
|
|
519
|
+
- `use_permutation=True`
|
|
520
|
+
- `bias_correction="none"`
|
|
521
|
+
- `show_effective_gamma=True`
|
|
522
|
+
- `use_mixed_model=True`
|
|
523
|
+
- `prioritize_velocity=True`
|
|
549
524
|
|
|
550
|
-
|
|
525
|
+
`diagnose_design` summarizes cell and sample counts plus global unspliced fraction and returns warnings and a suggested `filter_active_genes` preset. It runs automatically when `sample_col` or `use_pseudobulk=True` is supplied.
|
|
551
526
|
|
|
552
|
-
|
|
527
|
+
Inspect the corresponding diagnostics after enabling any advanced option.
|
|
553
528
|
|
|
554
529
|
### use_permutation=True
|
|
555
530
|
|
|
@@ -598,17 +573,11 @@ Recommended only when you have a reasonable number of cells and want noise reduc
|
|
|
598
573
|
|
|
599
574
|
## Limitations
|
|
600
575
|
|
|
601
|
-
The
|
|
576
|
+
The unspliced excess term is a group-contrast proxy derived from a reference-group gamma calculation. It is not a full stochastic or dynamical model.
|
|
602
577
|
|
|
603
|
-
- The
|
|
604
|
-
- The approach is most straightforward to interpret for clear binary group contrasts. Heterogeneity within the target group can reduce the observed signal.
|
|
605
|
-
- When `use_permutation=True`, only the group labels are permuted; the velocity layers and reference gamma are computed once on the original data for computational efficiency. This approximation is recorded in the results metadata.
|
|
606
|
-
- Global unspliced fractions above ~50% are flagged by the package, as they may indicate technical issues affecting the velocity layers.
|
|
607
|
-
- Bias correction performance depends on the number and quality of genes with length and intron annotations.
|
|
608
|
-
- With small numbers of biological replicates, power for the velocity component and for permutation-based FDR is limited. Users should examine the full distributions in `all_results`.
|
|
609
|
-
- `delta_variance` and the associated mixed-model p-values tend to be conservative in the presence of substantial between-sample variation.
|
|
578
|
+
Interpretation is simplest for clear binary contrasts. Within-group heterogeneity reduces observed signal. When `use_permutation=True`, only the group labels are permuted; the velocity layers and the reference gamma are computed once on the original data, and this approximation is noted in the results metadata. Global unspliced fractions above ~50% are flagged as potential technical artifacts. Bias-correction quality depends on the number of genes with length and intron annotations. With few biological replicates, power for the velocity term and permutation-based FDR is limited. Mixed-model statistics tend to be conservative when between-sample variation is large.
|
|
610
579
|
|
|
611
|
-
|
|
580
|
+
Always examine diagnostics, score distributions, and (when available) the original spliced/unspliced counts before biological interpretation.
|
|
612
581
|
|
|
613
582
|
---
|
|
614
583
|
|
|
@@ -651,7 +620,7 @@ Full signatures and all parameters are documented in the function docstrings and
|
|
|
651
620
|
- `add_gene_features(adata, organism="mouse", ...)` — attach length/intron info
|
|
652
621
|
- `list_available_gene_features()`
|
|
653
622
|
- `diagnose_design(adata, groupby, target_group, reference_group, sample_col=None)` — analyzes cell/sample counts and global unspliced fraction; returns warnings, recommendations, and a suggested `filter_active_genes` preset. Automatically called internally when `sample_col` or `use_pseudobulk=True` is used.
|
|
654
|
-
- `run_enrichment(...)`, `run_kegg(...)`, `simplify_enrichment(...)`, `list_bundled_gene_sets()`
|
|
623
|
+
- `run_enrichment(...)`, `run_kegg(...)`, `run_go(...)`, `simplify_enrichment(...)`, `save_enrichment_report(...)`, `expand_enrichment_genes(...)`, `list_bundled_gene_sets()`
|
|
655
624
|
- `scat.pl.*` plotting functions (comet_plot, volcano_plot, bias_diagnostic_plot, ...)
|
|
656
625
|
- `scat.qc.unspliced_global(adata)`
|
|
657
626
|
|
|
@@ -691,12 +660,10 @@ All `scat.pl.*` functions support `ax=` / `axes=` (for embedding in multi-panel
|
|
|
691
660
|
Recommended: log fold change vs. bias-corrected unspliced residual (velocity_residual), sized and colored by active_score.
|
|
692
661
|
- `s=3` (or 1-5): force **fixed** small point size for everything (direct, simple control).
|
|
693
662
|
- `point_scale=0.2` + `min_size=1`: for variable sizing, make tiniest background points truly small.
|
|
694
|
-
(Size API modeled after flexible controls seen in omicverse.pl.* )
|
|
695
663
|
|
|
696
664
|
- `scat.pl.volcano_plot(results_df, top_n=10, label_genes=None, point_scale=1.0, min_size=2, s=None, ...)`
|
|
697
665
|
2D volcano (logFC vs. -log10(p_adj)). Supports `label_genes=[...]` for manual gene labels
|
|
698
|
-
(combined with top_n)
|
|
699
|
-
not using active_score. See https://github.com/BioSenior/ggVolcano for style inspiration.
|
|
666
|
+
(combined with top_n). Classic up/down/ns coloring when not using active_score.
|
|
700
667
|
Use `s=2` for uniformly small points, or min_size + point_scale for score/p-value sized tiny backgrounds.
|
|
701
668
|
Especially helpful for pure DE results (no active_score).
|
|
702
669
|
|
|
@@ -778,9 +745,7 @@ scat.pl.enrich_dotplot(enrich)
|
|
|
778
745
|
|
|
779
746
|
`differential_expression` supports the same flexible backends as `active_score` (scanpy methods, PyDESeq2 pseudobulk, mixed models, and optionally Memento as a method-of-moments estimator). The returned table is directly compatible with `filter_active_genes`, enrichment functions, and all `scat.pl.*` plotting helpers.
|
|
780
747
|
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
See `examples/memento_de_example.py` for a complete demonstration of both the velocity-focused and pure-DE usage patterns.
|
|
748
|
+
The package therefore supports both velocity-based active transcription analysis and conventional DE + enrichment workflows. See `examples/memento_de_example.py` for a complete demonstration of the pure-DE path.
|
|
784
749
|
|
|
785
750
|
**Important: raw counts requirement**
|
|
786
751
|
|
|
@@ -823,4 +788,4 @@ MIT License.
|
|
|
823
788
|
|
|
824
789
|
---
|
|
825
790
|
|
|
826
|
-
|
|
791
|
+
|