scatrans 0.7.0.dev0__tar.gz → 0.8.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/.github/workflows/ci.yml +2 -0
  2. scatrans-0.8.0.dev0/.github/workflows/publish.yml +74 -0
  3. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/CHANGELOG.md +25 -0
  4. scatrans-0.8.0.dev0/MANIFEST.in +15 -0
  5. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/PKG-INFO +109 -144
  6. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/README.md +108 -143
  7. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/pyproject.toml +1 -1
  8. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/__init__.py +13 -2
  9. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_version.py +3 -3
  10. scatrans-0.8.0.dev0/src/scatrans/data/Hs_GO_Biological_Process_2026.txt +14209 -0
  11. scatrans-0.8.0.dev0/src/scatrans/data/Hs_KEGG_2026.txt +223 -0
  12. scatrans-0.8.0.dev0/src/scatrans/data/Mm_GO_Biological_Process_2026.txt +14957 -0
  13. scatrans-0.8.0.dev0/src/scatrans/data/Mm_KEGG_2026.txt +219 -0
  14. scatrans-0.8.0.dev0/src/scatrans/enrich.py +1449 -0
  15. scatrans-0.8.0.dev0/src/scatrans/pl.py +1754 -0
  16. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/tl.py +483 -225
  17. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/PKG-INFO +109 -144
  18. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/SOURCES.txt +6 -1
  19. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/tests/test_basic.py +398 -0
  20. scatrans-0.7.0.dev0/.github/workflows/pypi-publish.yml +0 -28
  21. scatrans-0.7.0.dev0/src/scatrans/data/Hs_KEGG_2026.txt +0 -223
  22. scatrans-0.7.0.dev0/src/scatrans/data/Mm_KEGG_2026.txt +0 -219
  23. scatrans-0.7.0.dev0/src/scatrans/enrich.py +0 -735
  24. scatrans-0.7.0.dev0/src/scatrans/pl.py +0 -1168
  25. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/.gitignore +0 -0
  26. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/LICENSE +0 -0
  27. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/memento_de_example.py +0 -0
  28. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/real_data_template.py +0 -0
  29. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/examples/synthetic_active_transcription.py +0 -0
  30. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/setup.cfg +0 -0
  31. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_bias.py +0 -0
  32. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_de.py +0 -0
  33. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_permutation.py +0 -0
  34. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_utils.py +0 -0
  35. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/_velocity.py +0 -0
  36. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/Mus_musculus.GRCm39.115_gene_features.parquet +0 -0
  37. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/README.md +0 -0
  38. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/data/mouse_2020A_gene_features.parquet +0 -0
  39. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/generate_gene_features.py +0 -0
  40. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/pp_bias.py +0 -0
  41. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans/qc.py +0 -0
  42. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/dependency_links.txt +0 -0
  43. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/entry_points.txt +0 -0
  44. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/requires.txt +0 -0
  45. {scatrans-0.7.0.dev0 → scatrans-0.8.0.dev0}/src/scatrans.egg-info/top_level.txt +0 -0
@@ -22,6 +22,8 @@ jobs:
22
22
 
23
23
  steps:
24
24
  - uses: actions/checkout@v4
25
+ with:
26
+ fetch-depth: 0 # Good practice for setuptools_scm (version detection)
25
27
 
26
28
  - name: Set up Python ${{ matrix.python-version }}
27
29
  uses: actions/setup-python@v5
@@ -0,0 +1,74 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ release:
8
+ types: [published]
9
+ workflow_dispatch:
10
+ inputs:
11
+ version:
12
+ description: "Force a specific version (SETUPTOOLS_SCM_PRETEND_VERSION). Useful for dev releases when not on a tag."
13
+ required: false
14
+ default: ""
15
+
16
+ jobs:
17
+ build:
18
+ name: Build distribution 📦
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ with:
23
+ fetch-depth: 0 # Critical for setuptools_scm to detect tags and produce correct version
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.11"
29
+
30
+ - name: Install build tools
31
+ run: python -m pip install --upgrade build
32
+
33
+ - name: Build source and wheel distributions
34
+ run: |
35
+ if [ -n "${{ github.event.inputs.version }}" ]; then
36
+ echo "Using forced version: ${{ github.event.inputs.version }}"
37
+ SETUPTOOLS_SCM_PRETEND_VERSION="${{ github.event.inputs.version }}" python -m build
38
+ else
39
+ python -m build
40
+ fi
41
+
42
+ - name: Upload distribution artifacts
43
+ uses: actions/upload-artifact@v4
44
+ with:
45
+ name: python-package-distributions
46
+ path: dist/
47
+
48
+ publish:
49
+ name: Publish to PyPI
50
+ needs: build
51
+ runs-on: ubuntu-latest
52
+
53
+ # Required for Trusted Publishing (OIDC) - no API token secret needed
54
+ permissions:
55
+ id-token: write
56
+
57
+ # Recommended: tie to a protected GitHub Environment (create "pypi" environment in repo settings)
58
+ # You can add required reviewers or branch restrictions in the environment settings.
59
+ # environment:
60
+ # name: pypi
61
+
62
+ steps:
63
+ - name: Download all dists
64
+ uses: actions/download-artifact@v4
65
+ with:
66
+ name: python-package-distributions
67
+ path: dist/
68
+
69
+ - name: Publish distribution 📦 to PyPI
70
+ uses: pypa/gh-action-pypi-publish@release/v1
71
+ # For publishing to TestPyPI instead (for testing the workflow):
72
+ # with:
73
+ # repository-url: https://test.pypi.org/legacy/
74
+ # verbose: true
@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.8.0] - 2026-06-14
9
+
10
+ ### Added (enrichment module — major paper-readiness upgrade)
11
+ - `run_go(ontology="BP"|"CC"|"MF"|"ALL", ...)` — direct wrapper analogous to clusterProfiler `enrichGO`. Supports `adjust_across_all=True` for a single BH correction across all GO terms when using "ALL".
12
+ - `save_enrichment_report(res, prefix=..., save_excel=True, save_csv=True, save_tsv=True, save_metadata=True, save_term_gene_table=True)` — one-call export of main table, term-gene long table (via `expand_enrichment_genes`), and rich `metadata.json` + xlsx sheet. Auto-creates parent directories. List columns (e.g. `Genes_list`) are sanitized to `;` strings for clean export.
13
+ - `expand_enrichment_genes(res)` — expands the `Genes` (semicolon) column into a long-format Term–Gene table (one row per gene). Preserves `Ontology` column when input came from `run_go(..., "ALL")`.
14
+ - Rich provenance in every result `.attrs` (success and empty):
15
+ - `analysis_info`: package, version, timestamp, module
16
+ - `gene_set_info`: `requested`/`resolved`, `requested_source` vs `actual_source` ("bundled", "gseapy", "gmt", "dict"), `library_name`, `n_terms`, `n_unique_genes`
17
+ - `universe_info`: full details of background handling (provided size, restricted, dropped_by_annotation, force_universe, mapping counts)
18
+ - Empty results now carry `reason` ("gene_list_empty", "universe_empty", "no_term_overlap_after_filters", ...) + the above fields so users can diagnose why nothing came back.
19
+ - New `run_enrichment` / `run_kegg` / `run_go` parameters: `padj_cutoff` (preferred modern name), `include_gene_list` (adds `Genes_list` python-list column), `adjust_across_all`.
20
+ - `list_bundled_gene_sets()` now clearly documents the 2026 organism-specific defaults.
21
+ - Improved low-mapping-rate warning (includes input examples + gene-set examples).
22
+ - `background` is now a documented deprecated alias of `universe`; passing both raises immediately.
23
+ - All empty-result DataFrames preserve consistent columns (including optional `Genes_list` when requested) and full diagnostic attrs.
24
+
25
+ ### Changed / Improved
26
+ - `_load_gene_sets` now returns `(term_to_genes, term_to_desc, load_info)` so `actual_source` is always recorded accurately (even on gseapy fallback after bundled attempt).
27
+ - `run_kegg` fully synchronized with new parameters (`padj_cutoff`, `include_gene_list`, etc.).
28
+ - `enrich_dotplot` (pl.py) and various tl.py flows updated for new columns/attrs.
29
+ - Version unified to 0.8.0 for this release.
30
+ - README and docstrings extensively updated with manuscript-export examples, `run_go`, provenance details, and `adjust_across_all` guidance.
31
+ - Full test coverage for new paths (per-ontology attrs, within_ontology p.adjust, save+tsv+dir creation, expand with Ontology, dual-cutoff warning, etc.). All tests pass.
32
+
8
33
  ## [Unreleased]
9
34
 
10
35
  ### Added
@@ -0,0 +1,15 @@
1
+ # Control what goes into the source distribution (sdist).
2
+ # The wheel only contains the runtime package (src/scatrans + data).
3
+
4
+ # Standard important files
5
+ include LICENSE
6
+ include README.md
7
+ include CHANGELOG.md
8
+ include pyproject.toml
9
+
10
+ # Include the GitHub Actions workflows (requested)
11
+ include .github/workflows/ci.yml
12
+ include .github/workflows/publish.yml
13
+
14
+ # If more workflows are added in the future, this will catch them:
15
+ include .github/workflows/*.yml
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scatrans
3
- Version: 0.7.0.dev0
3
+ Version: 0.8.0.dev0
4
4
  Summary: Single-cell Active Transcription Analysis
5
5
  Author: scATrans Developers
6
6
  License: MIT
@@ -42,9 +42,9 @@ Dynamic: license-file
42
42
 
43
43
  # scATrans
44
44
 
45
- scATrans computes a composite score that integrates differential expression with a simple reference-based measure of excess unspliced (nascent) RNA between two groups. It is designed for users working with single-cell spliced/unspliced or mature/nascent data who want to rank genes according to this combined signal.
45
+ scATrans computes a composite score from differential expression and reference-based excess unspliced (nascent) RNA between groups. It ranks genes in single-cell spliced/unspliced or mature/nascent data.
46
46
 
47
- The package supplies a basic analysis path together with several optional extensions. All methods have limitations; results should be interpreted in light of the diagnostics and the experimental design. The tool does not claim to be a gold standard or to recover "truly active" genes in an absolute sense.
47
+ Results must be interpreted using the provided diagnostics. The method has known limitations and does not guarantee recovery of truly active genes.
48
48
 
49
49
  ## Installation
50
50
 
@@ -88,109 +88,60 @@ ufrac = scat.qc.unspliced_global(adata) # logs INFO + WARNING if > 50%
88
88
 
89
89
  `active_score` automatically runs this check and records the value in diagnostics.
90
90
 
91
- ### Preserving raw counts + original spliced/unspliced layers (strongly recommended)
92
-
93
- scATrans (especially the Memento backend and velocity/active-transcription calculations) works best when you still have access to the original raw counts and the original spliced/unspliced (or mature/nascent) matrices on as many genes as possible.
94
-
95
- Call this **early** (right after loading + basic QC, before any HVG, normalize or log1p):
91
+ ## Quick Start (Minimal Default Flow)
96
92
 
97
93
  ```python
94
+ import scanpy as sc
98
95
  import scatrans as scat
99
96
 
100
- # Save raw counts + the original velocity layers for later use
97
+ adata = sc.read_h5ad("your_data.h5ad")
98
+
99
+ # Preserve original counts and spliced/unspliced layers before HVG or normalization.
101
100
  scat.store_raw_counts(adata, layer="counts", save_raw=False)
102
101
 
103
- # Now you can safely do the usual Scanpy preprocessing for visualization
104
102
  sc.pp.highly_variable_genes(adata, n_top_genes=3000)
105
- # ... normalize_total, log1p, neighbors, umap, leiden ...
106
- ```
103
+ # ... normalize, log1p, neighbors, UMAP, clustering ...
107
104
 
108
- What `store_raw_counts` does:
109
- - Saves the current `.X` (your raw counts at that moment) into `layers["counts"]`.
110
- - If your adata contains `"spliced"` / `"unspliced"` (or `"mature"` / `"nascent"`) layers, it also saves them under `raw_spliced`, `raw_unspliced` etc. These preserved layers survive later HVG subsetting of the main object.
111
- - `save_raw=False` is now the default (we do **not** automatically set `adata.raw` unless you explicitly ask for it with `save_raw=True`).
105
+ # Optional: attach bundled gene features for bias correction.
106
+ adata = scat.add_gene_features(adata)
112
107
 
113
- This way:
114
- - Your visualization pipeline can use a small HVG + log1p `.X`.
115
- - Later you can still run `differential_expression(..., use_memento_de=True)` or `active_score` using the full-gene raw counts and the original spliced/unspliced data from the saved layers.
116
- - When doing enrichment, pass the gene list from the preserved full set as `universe` (see the enrichment section below for details and warnings).
108
+ adata_res, significant, all_results = scat.active_score(
109
+ adata_input=adata,
110
+ groupby="condition",
111
+ target_group="Disease",
112
+ reference_group="Control",
113
+ )
117
114
 
118
- See also the "Additional Capability: Standalone Differential Expression" section and the HVG-vs-velocity-layers note below.
115
+ print(all_results.head())
116
+ ```
119
117
 
120
- **Impact of HVG filtering on spliced/unspliced layers (important)**
118
+ With the default parameters you do not need to make any decisions about bias correction, effective gamma, or mixed models. Pseudobulk mode and the DE method (`de_method`) are standard configurable options. The built-in `significant` list is strict and often small or empty; work from the full ranked table in `all_results` instead.
121
119
 
122
- In standard Scanpy operations:
120
+ ### Preserving raw counts and layers
123
121
 
124
- ```python
125
- sc.pp.highly_variable_genes(adata, n_top_genes=3000)
126
- adata = adata[:, adata.var.highly_variable].copy()
127
- ```
122
+ Call `store_raw_counts` immediately after loading and basic QC, before HVG selection or normalization. It preserves the full raw counts and the original spliced/unspliced (or mature/nascent) layers.
128
123
 
129
- **This will also affect the spliced/unspliced layers**:
124
+ `store_raw_counts` writes the current `.X` to `layers["counts"]` and copies spliced/unspliced layers to `raw_spliced` / `raw_unspliced`. These layers stay attached to the object through later HVG subsetting, but they are restricted to the retained genes like every other layer. The default `save_raw=False` avoids setting `adata.raw`.
130
125
 
131
- - AnnData's `.layers` (including the "spliced" and "unspliced" you stored) are automatically subset together with the genes.
132
- - This is standard AnnData behavior and is usually **desired**, because velocity calculations (gamma estimation, unspliced excess, active_score) require the same gene set as the main expression matrix.
133
- - If you want to use HVGs only for **visualization/clustering**, but use more genes (the full post-QC gene set or a large collection) for **differential analysis (especially Memento)**, the recommended workflow is:
134
- 1. Immediately after loading + basic QC, call `scat.store_raw_counts(adata)` (preserves the full/large gene raw counts into the layer + .raw at that time).
135
- 2. Make a copy for HVG + visualization: `adata_viz = adata.copy(); ... HVG on adata_viz ...`
136
- 3. For DE, use the **original adata** (or the restored version), at which point it can still retrieve the corresponding raw counts from the layer (the number of genes depends on how many genes the adata had when you called store).
137
- 4. If you have already performed HVG subset on the main adata, the layer will also only contain raw counts for those HVGs. In that case DE can only be performed on these genes (consistent with the principle of "user performs filtering before store").
126
+ After HVG-based visualization on a copy, restore or use the preserved layers for full-gene DE, active scoring, or enrichment (pass `adata=` to `run_enrichment` or `run_kegg` to use the stored gene list as background).
138
127
 
139
- In short: HVG subset will reduce the genes retained in spliced/unspliced, keeping it consistent with .X. If you want to use more genes for DE, you should call the DE function before HVG subset (or on a copy that has not been subset).
128
+ HVG subsetting also subsets the saved layers. This keeps velocity calculations consistent with `.X`. To analyze more genes than the HVG set, store before subsetting or operate on the unfiltered object for DE and enrichment steps.
140
129
 
141
- Optionally, if you have done HVG + log1p for visualization but later want the raw counts back in `.X` (for the genes currently selected), you can use:
130
+ To restore raw counts into `.X` for the current gene set:
142
131
 
143
132
  ```python
144
- # Restore raw counts into .X (non-destructive by default)
145
133
  adata_raw = scat.restore_raw_counts(adata, layer="counts", inplace=False)
146
- # or inplace=True to modify the current adata
147
134
  ```
148
135
 
149
- See also the "Additional Capability: Standalone Differential Expression" section below for the pure-DE (no velocity) use case.
136
+ See the standalone differential expression section for the no-velocity use case.
150
137
 
151
138
  ---
152
139
 
153
- ## Core Positioning
154
-
155
- scATrans helps users extract **condition-wise nascent RNA relative excess** signals (a lightweight proxy for differential active transcription) from single-cell velocity-style data.
156
-
157
- - **Basic pipeline (on by default):** DE + unspliced excess after reference gamma correction + optional light bias correction for length/intron number + composite scoring + gene filtering + enrichment + plotting.
158
- - **Advanced options are opt-in:** They are powerful but add complexity and information overload. New users should start with defaults.
159
- - **Honest by design:** The default `significant` list is deliberately strict (often empty or very small on real data). The primary deliverable is the full ranked table (`all_results`). Diagnostics are always provided so you can judge whether the signals are trustworthy in your data.
160
-
161
- ---
162
-
163
- ## Quick Start (Minimal Default Flow) — Recommended
164
-
165
- ```python
166
- import scanpy as sc
167
- import scatrans as scat
168
-
169
- # 1. Load data that contains spliced/unspliced or mature/nascent layers
170
- adata = sc.read_h5ad("your_data.h5ad")
171
-
172
- # 2. (Optional but recommended) Attach gene features for bias correction
173
- # Uses the bundled mouse table by default.
174
- adata = scat.add_gene_features(adata)
175
-
176
- # 3. Run the analysis with default parameters — no need to worry about
177
- # bias_correction, effective_gamma, mixed models, etc.
178
- adata_res, significant, all_results = scat.active_score(
179
- adata_input=adata,
180
- groupby="condition",
181
- target_group="Disease",
182
- reference_group="Control",
183
- )
184
-
185
- # 4. The most important output for almost everyone is all_results (full ranked table)
186
- print(all_results.head())
187
- ```
188
-
189
- **Key point:** The default settings run a basic analysis without requiring decisions about `bias_correction`, `effective_gamma`, `use_mixed_model`, or `use_permutation`.
140
+ ## Core Workflow
190
141
 
191
- Pseudobulk analysis (`use_pseudobulk`) and choice of differential expression test (`de_method`, e.g. "wilcoxon") are standard configuration options that can be selected according to the experimental design (see the section on common basic switches).
142
+ The default path performs differential expression, reference-gamma unspliced excess, optional length/intron bias correction, composite scoring, gene filtering, enrichment, and plotting.
192
143
 
193
- The built-in `significant` list uses a strict conjunction of thresholds and is frequently small or empty. This behavior is expected. The primary output for most users is the full ranked table returned as `all_results`.
144
+ Advanced options are disabled by default. The internal `significant` list applies strict thresholds and is frequently empty or small. Use the complete ranked table in `all_results` and apply your own filters. Diagnostics are stored in `adata_res.uns["scatrans"]["diagnostics"]`.
194
145
 
195
146
  ---
196
147
 
@@ -208,16 +159,11 @@ adata_res, significant, all_results = scat.active_score(
208
159
  )
209
160
  ```
210
161
 
211
- This performs:
212
- - Differential expression between the two groups
213
- - Velocity delta (nascent excess) using a reference-group gamma
214
- - Light Huber bias correction on gene length + intron number (default)
215
- - Composite active_score (0–100)
216
- - Rich diagnostics written to `adata_res.uns["scatrans"]["diagnostics"]`
162
+ This computes differential expression, the reference-group gamma excess for the unspliced layer, an optional Huber bias correction on gene length and intron number, and a composite active score, then stores diagnostics in `adata_res.uns["scatrans"]["diagnostics"]`.
217
163
 
218
164
  ### 3.1.1 Common basic switches: pseudobulk and DE test method
219
165
 
220
- These two are **standard basic options**, not advanced exploration features. You can turn them on freely depending on your data and analysis preferences:
166
+ These are standard options available for most analyses.
221
167
 
222
168
  **Pseudobulk mode** (recommended when you have multiple biological replicates per condition):
223
169
 
@@ -258,7 +204,7 @@ The `filter_active_genes` helper has a `preset="pseudobulk"` that applies more l
258
204
 
259
205
  ### 3.2 Gene filtering with filter_active_genes (core output tool)
260
206
 
261
- Because the built-in `significant` list is strict, most users derive their final list from `all_results` using `filter_active_genes`.
207
+ The internal `significant` list is strict. Most users filter the full table returned in `all_results` with `filter_active_genes`.
262
208
 
263
209
  ```python
264
210
  # Start permissive, then tighten based on your data
@@ -314,14 +260,7 @@ kegg_res = scat.run_kegg(
314
260
 
315
261
  ### Default: use the package's bundled gene sets (clearest logic)
316
262
 
317
- The package now **defaults to the new organism-specific built-in libraries** (4 files added to data/):
318
-
319
- - `Hs_GO_Biological_Process_2026.txt` + `Hs_KEGG_2026.txt` for human
320
- - `Mm_GO_Biological_Process_2026.txt` + `Mm_KEGG_2026.txt` for mouse
321
-
322
- You only need to specify `organism=` (for KEGG especially). Base names like "GO_Biological_Process", "KEGG", "GO_BP" are automatically resolved to the correct organism + 2026 built-in file.
323
-
324
- If you want a specific historical Enrichr version (e.g. GO_Biological_Process_2023), just write the full name — it will be treated as an Enrichr request.
263
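+ The package defaults to organism-specific bundled sets (`Hs_GO_Biological_Process_2026.txt`, `Hs_KEGG_2026.txt`, and the corresponding mouse files `Mm_GO_Biological_Process_2026.txt` and `Mm_KEGG_2026.txt`). Specify `organism=`, and base names such as "KEGG", "GO_BP", or "GO_Biological_Process" resolve automatically to the matching bundled 2026 file. A full historical Enrichr name (e.g., `GO_Biological_Process_2023`) is treated as an Enrichr request when supplied explicitly.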
325
264
 
326
265
  ```python
327
266
  # KEGG — just specify organism, gets the correct built-in (Hs/Mm_2026) automatically
@@ -391,6 +330,61 @@ simplified = scat.simplify_enrichment(
391
330
 
392
331
  `run_kegg` and `simplify_enrichment` are convenience wrappers around the core `run_enrichment` function.
393
332
 
333
+ ### run_go (GO enrichment, clusterProfiler-style)
334
+
335
+ ```python
336
+ # Biological Process (defaults to the bundled Mm/Hs_GO_Biological_Process_2026)
337
+ go_bp = scat.run_go(
338
+ gene_list=markers,
339
+ ontology="BP", # "BP", "CC", "MF", or "ALL"
340
+ organism="mouse",
341
+ adata=adata, # recommended for correct universe
342
+ return_all=True,
343
+ )
344
+
345
+ # ALL three ontologies + unified multiple-testing correction across them
346
+ go_all = scat.run_go(
347
+ markers, ontology="ALL", organism="mouse",
348
+ return_all=True,
349
+ adjust_across_all=True, # re-compute BH on all terms together (stricter)
350
+ )
351
+ # go_all.attrs["per_ontology_attrs"] contains full diagnostics for BP/CC/MF separately
352
+ ```
353
+
354
+ `run_go` automatically resolves to the organism-specific bundled sets when possible (BP is bundled; CC/MF fall back to gseapy/Enrichr if the library is installed).
355
+
356
+ ### Exporting results for manuscripts / supplementary materials
357
+
358
+ The new helpers make it trivial to produce clean, reproducible tables:
359
+
360
+ ```python
361
+ res = scat.run_kegg(genes, organism="mouse", return_all=True, include_gene_list=True)
362
+
363
+ saved = scat.save_enrichment_report(
364
+ res,
365
+ prefix="cluster1_kegg", # or "results/suppl/my_enrich" (directories created automatically)
366
+ save_excel=True,
367
+ save_csv=True,
368
+ save_tsv=True, # often preferred for gene symbols + Excel locale safety
369
+ save_metadata=True,
370
+ save_term_gene_table=True,
371
+ )
372
+
373
+ # saved -> {'results_csv': ..., 'results_tsv': ..., 'term_gene_table_csv': ..., 'metadata_json': ..., 'results_xlsx': ...}
374
+
375
+ # Long-format term–gene table (one row per gene; perfect for networks, follow-up stats, etc.)
376
+ long_table = scat.expand_enrichment_genes(res)
377
+ # If the input was from run_go(ontology="ALL"), long_table will have an "Ontology" column first.
378
+ ```
379
+
380
+ `save_enrichment_report` also writes a rich `metadata.json` (and a "metadata" sheet in the xlsx) containing:
381
+ - `analysis_info` (package, version, timestamp)
382
+ - `gene_set_info` (requested/resolved + `requested_source` vs `actual_source`: "bundled", "gseapy", "gmt", "dict")
383
+ - `universe_info` (effective N, dropped genes, restrict behavior, etc.)
384
+ - Full `.attrs` from the enrichment call (including per-ontology details for GO ALL)
385
+
386
+ All empty results still carry diagnostic `.attrs` (`reason`, `gene_set_info`, `universe_info`, etc.) so you never lose information when a call returns no terms.
387
+
394
388
  ### 3.4 Visualization
395
389
 
396
390
  ```python
@@ -501,28 +495,11 @@ This function looks for common gene list columns (`Genes`, `Lead_genes`, etc.) a
501
495
 
502
496
  ---
503
497
 
504
- ## Result Interpretation and Notes
505
-
506
- ### Default `significant` is often empty or very small — this is normal
507
-
508
- The internal significance mask is a strict conjunction:
509
- - `p_adj < pval_cutoff`
510
- - `logFC > logfc_cutoff`
511
- - `velocity_residual > 0`
512
- - sufficient expression
513
- - `active_score > 0`
514
- - (if `use_permutation`) `active_score_fdr < active_fdr_cutoff`
515
- - (if `use_delta_variance_pval`) `delta_var_pval < cutoff`
516
-
517
- On real data this frequently returns 0–few genes. **Use `all_results`** and apply your own biologically motivated filters.
498
+ ## Result Interpretation
518
499
 
519
- ### Always start from `all_results`
500
+ The internal significance mask applies a strict conjunction of thresholds. On real data it often returns zero or few genes. Use the full table in `all_results`, which is sorted by `active_score` descending and retains every gene that passed initial expression filters.
520
501
 
521
- It is already sorted by `active_score` descending and contains every gene that passed basic expression filters together with all computed values.
522
-
523
- ### Diagnostics (always inspect these)
524
-
525
- After every run look at:
502
+ After each run inspect the diagnostics:
526
503
 
527
504
  ```python
528
505
  meta = adata_res.uns["scatrans"]
@@ -531,25 +508,23 @@ print(meta["diagnostics"]["bias_correction"])
531
508
  print(meta.get("permutation_approximation_note"))
532
509
  ```
533
510
 
534
- - **unspliced_global_fraction**: > ~50% often indicates technical problems (nuclear enrichment, gDNA contamination).
535
- - **bias_correction**: number of genes used for the fit, coefficients, whether median fallback was used.
536
- - **permutation_approximation_note**: only present when `use_permutation=True`. Records that velocity layers/gamma were fixed for speed.
511
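+ Global unspliced fractions above ~50% frequently indicate technical issues (for example nuclear enrichment or gDNA contamination). Bias-correction diagnostics report the number of genes used for the fit, the coefficients, and whether the median fallback was used. The permutation note is present only when `use_permutation=True` and records that velocity layers and the reference gamma were fixed for speed.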
537
512
 
538
513
  ---
539
514
 
540
- ## Optional Advanced Features (Opt-in)
515
+ ## Optional Advanced Features
541
516
 
542
- The following options can be enabled when relevant to the analysis goals:
517
+ The following flags are disabled by default and should be enabled only when required by the experimental design:
543
518
 
544
- - `use_permutation=True`: compute a permutation-based FDR for the composite score. When enabled, a note describing the approximation (velocity layers and reference gamma are fixed from the original labeling) is stored in the results.
545
- - `bias_correction="none"`: disable the length/intron correction on the velocity delta. The raw delta is then used directly as `velocity_residual`.
546
- - `show_effective_gamma=True`: include the per-gene reference-group U/S ratio (used internally for the delta calculation) in the output tables.
547
- - `use_mixed_model=True`: fit a mixed linear model with sample as random intercept and obtain `delta_variance` (fraction of modeled variance attributed to condition) along with a likelihood-ratio p-value.
548
- - `prioritize_velocity=True`: convenience flag that increases the relative weight given to the velocity_residual (nascent excess) term while decreasing the weights on the differential expression terms. This option is provided for analyses whose primary goal is to highlight differences in unspliced abundance after reference correction. It is documented under advanced features because it changes the balance of the composite score.
519
+ - `use_permutation=True`
520
+ - `bias_correction="none"`
521
+ - `show_effective_gamma=True`
522
+ - `use_mixed_model=True`
523
+ - `prioritize_velocity=True`
549
524
 
550
- A helper function `diagnose_design` is available to summarize cell and sample counts, global unspliced fraction, and to surface warnings and suggestions before or between runs of `active_score`.
525
+ `diagnose_design` summarizes cell and sample counts plus global unspliced fraction and returns warnings and a suggested `filter_active_genes` preset. It runs automatically when `sample_col` or `use_pseudobulk=True` is supplied.
551
526
 
552
- These options are not enabled by default. When used, the corresponding diagnostics should be examined.
527
+ Inspect the corresponding diagnostics after enabling any advanced option.
553
528
 
554
529
  ### use_permutation=True
555
530
 
@@ -598,17 +573,11 @@ Recommended only when you have a reasonable number of cells and want noise reduc
598
573
 
599
574
  ## Limitations
600
575
 
601
- The method implements a composite score based on a simplified, reference-group gamma excess calculation for the unspliced layer together with standard differential expression statistics.
576
+ The unspliced excess term is a group-contrast proxy derived from a reference-group gamma calculation. It is not a full stochastic or dynamical model.
602
577
 
603
- - The unspliced excess term is a group-contrast proxy and is not equivalent to scVelo's full stochastic or dynamical models.
604
- - The approach is most straightforward to interpret for clear binary group contrasts. Heterogeneity within the target group can reduce the observed signal.
605
- - When `use_permutation=True`, only the group labels are permuted; the velocity layers and reference gamma are computed once on the original data for computational efficiency. This approximation is recorded in the results metadata.
606
- - Global unspliced fractions above ~50% are flagged by the package, as they may indicate technical issues affecting the velocity layers.
607
- - Bias correction performance depends on the number and quality of genes with length and intron annotations.
608
- - With small numbers of biological replicates, power for the velocity component and for permutation-based FDR is limited. Users should examine the full distributions in `all_results`.
609
- - `delta_variance` and the associated mixed-model p-values tend to be conservative in the presence of substantial between-sample variation.
578
+ Interpretation is simplest for clear binary contrasts. Within-group heterogeneity reduces the observed signal. When `use_permutation=True`, only the group labels are permuted; the velocity layers and the reference gamma are computed once from the original labels, and a note describing this approximation is recorded in the results. Global unspliced fractions above ~50% are flagged as potential technical artifacts. Bias-correction quality depends on the number of genes with length and intron annotations. With few biological replicates, power for the velocity term and permutation-based FDR is limited. Mixed-model statistics tend to be conservative when between-sample variation is large.
610
579
 
611
- Users should examine the diagnostics stored under `adata.uns["scatrans"]["diagnostics"]`, the distributions of scores in the returned tables, and (where possible) the raw spliced/unspliced counts for candidate genes before biological interpretation.
580
+ Always examine the diagnostics (stored under `adata.uns["scatrans"]["diagnostics"]`), the score distributions, and (when available) the original spliced/unspliced counts before biological interpretation.
612
581
 
613
582
  ---
614
583
 
@@ -651,7 +620,7 @@ Full signatures and all parameters are documented in the function docstrings and
651
620
  - `add_gene_features(adata, organism="mouse", ...)` — attach length/intron info
652
621
  - `list_available_gene_features()`
653
622
  - `diagnose_design(adata, groupby, target_group, reference_group, sample_col=None)` — analyzes cell/sample counts and global unspliced fraction; returns warnings, recommendations, and a suggested `filter_active_genes` preset. Automatically called internally when `sample_col` or `use_pseudobulk=True` is used.
654
- - `run_enrichment(...)`, `run_kegg(...)`, `simplify_enrichment(...)`, `list_bundled_gene_sets()`
623
+ - `run_enrichment(...)`, `run_kegg(...)`, `run_go(...)`, `simplify_enrichment(...)`, `save_enrichment_report(...)`, `expand_enrichment_genes(...)`, `list_bundled_gene_sets()`
655
624
  - `scat.pl.*` plotting functions (comet_plot, volcano_plot, bias_diagnostic_plot, ...)
656
625
  - `scat.qc.unspliced_global(adata)`
657
626
 
@@ -691,12 +660,10 @@ All `scat.pl.*` functions support `ax=` / `axes=` (for embedding in multi-panel
691
660
  Recommended: log fold change vs. bias-corrected unspliced residual (velocity_residual), sized and colored by active_score.
692
661
  - `s=3` (or 1-5): force **fixed** small point size for everything (direct, simple control).
693
662
  - `point_scale=0.2` + `min_size=1`: for variable sizing, make the tiniest background points truly small.
694
- (Size API modeled after flexible controls seen in omicverse.pl.* )
695
663
 
696
664
  - `scat.pl.volcano_plot(results_df, top_n=10, label_genes=None, point_scale=1.0, min_size=2, s=None, ...)`
697
665
  2D volcano (logFC vs. -log10(p_adj)). Supports `label_genes=[...]` for manual gene labels
698
- (combined with top_n) — ggVolcano style flexibility. Classic up/down/ns coloring when
699
- not using active_score. See https://github.com/BioSenior/ggVolcano for style inspiration.
666
+ (combined with top_n). Classic up/down/ns coloring when not using active_score.
700
667
  Use `s=2` for uniformly small points, or min_size + point_scale for score/p-value sized tiny backgrounds.
701
668
  Especially helpful for pure DE results (no active_score).
702
669
 
@@ -778,9 +745,7 @@ scat.pl.enrich_dotplot(enrich)
778
745
 
779
746
  `differential_expression` supports the same flexible backends as `active_score` (scanpy methods, PyDESeq2 pseudobulk, mixed models, and optionally Memento as a method-of-moments estimator). The returned table is directly compatible with `filter_active_genes`, enrichment functions, and all `scat.pl.*` plotting helpers.
780
747
 
781
- This makes the package useful even if you only need modern DE + enrichment + visualization, while the core `active_score` workflow remains the recommended path when you have velocity information.
782
-
783
- See `examples/memento_de_example.py` for a complete demonstration of both the velocity-focused and pure-DE usage patterns.
748
+ The package therefore supports both velocity-based active transcription analysis and conventional DE + enrichment workflows. See `examples/memento_de_example.py` for a complete demonstration of the pure-DE path.
784
749
 
785
750
  **Important: raw counts requirement**
786
751
 
@@ -823,4 +788,4 @@ MIT License.
823
788
 
824
789
  ---
825
790
 
826
- *This README emphasizes the basic, honest, low-ceremony workflow centered on active transcription analysis from velocity data. Advanced capabilities (including standalone DE with Memento support) remain available for users who need them.*
791
+