cis-gs 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. cis_gs-1.1.0/LICENSE +21 -0
  2. cis_gs-1.1.0/MANIFEST.in +4 -0
  3. cis_gs-1.1.0/PKG-INFO +359 -0
  4. cis_gs-1.1.0/README.md +296 -0
  5. cis_gs-1.1.0/animaltfdb_importer.py +1461 -0
  6. cis_gs-1.1.0/app_v4_open.py +10218 -0
  7. cis_gs-1.1.0/chromosome_utils.py +418 -0
  8. cis_gs-1.1.0/cis_gs/__init__.py +12 -0
  9. cis_gs-1.1.0/cis_gs/__main__.py +5 -0
  10. cis_gs-1.1.0/cis_gs/animaltfdb_importer.py +1461 -0
  11. cis_gs-1.1.0/cis_gs/app.py +6356 -0
  12. cis_gs-1.1.0/cis_gs/assets/banner.png +0 -0
  13. cis_gs-1.1.0/cis_gs/assets/favicon.ico +0 -0
  14. cis_gs-1.1.0/cis_gs/assets/favicon.png +0 -0
  15. cis_gs-1.1.0/cis_gs/assets/logo.png +0 -0
  16. cis_gs-1.1.0/cis_gs/chromosome_utils.py +418 -0
  17. cis_gs-1.1.0/cis_gs/cli.py +1870 -0
  18. cis_gs-1.1.0/cis_gs/cli_enrichment.py +169 -0
  19. cis_gs-1.1.0/cis_gs/cli_interactive.py +865 -0
  20. cis_gs-1.1.0/cis_gs/create_assets.py +104 -0
  21. cis_gs-1.1.0/cis_gs/enrichment/__init__.py +36 -0
  22. cis_gs-1.1.0/cis_gs/enrichment/core.py +281 -0
  23. cis_gs-1.1.0/cis_gs/enrichment/idmap.py +474 -0
  24. cis_gs-1.1.0/cis_gs/enrichment/kegg.py +225 -0
  25. cis_gs-1.1.0/cis_gs/enrichment/plots.py +87 -0
  26. cis_gs-1.1.0/cis_gs/planttfdb_importer.py +1056 -0
  27. cis_gs-1.1.0/cis_gs.egg-info/PKG-INFO +359 -0
  28. cis_gs-1.1.0/cis_gs.egg-info/SOURCES.txt +35 -0
  29. cis_gs-1.1.0/cis_gs.egg-info/dependency_links.txt +1 -0
  30. cis_gs-1.1.0/cis_gs.egg-info/entry_points.txt +5 -0
  31. cis_gs-1.1.0/cis_gs.egg-info/requires.txt +29 -0
  32. cis_gs-1.1.0/cis_gs.egg-info/top_level.txt +5 -0
  33. cis_gs-1.1.0/planttfdb_importer.py +1056 -0
  34. cis_gs-1.1.0/pyproject.toml +104 -0
  35. cis_gs-1.1.0/requirements.txt +35 -0
  36. cis_gs-1.1.0/setup.cfg +4 -0
  37. cis_gs-1.1.0/tests/test_smoke.py +59 -0
cis_gs-1.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Plant Signaling Lab, IISER Tirupati
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
cis_gs-1.1.0/MANIFEST.in ADDED
@@ -0,0 +1,4 @@
1
+ include LICENSE
2
+ include README.md
3
+ include requirements.txt
4
+ recursive-include cis_gs/assets *
cis_gs-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,359 @@
1
+ Metadata-Version: 2.4
2
+ Name: cis-gs
3
+ Version: 1.1.0
4
+ Summary: Cis-regulatory Element Genome Scanner - whole-genome cis-element discovery, expression coupling, and KEGG enrichment in one PyQt5 + CLI pipeline.
5
+ Author: Plant Signaling Lab, IISER Tirupati
6
+ Author-email: Ayushman Mallick <ayushmania2002@gmail.com>
7
+ Maintainer-email: Ayushman Mallick <ayushmania2002@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/Ayushmania2002/Cis-GS
10
+ Project-URL: Documentation, https://Ayushmania2002.github.io/Cis-GS/
11
+ Project-URL: Repository, https://github.com/Ayushmania2002/Cis-GS
12
+ Project-URL: Issues, https://github.com/Ayushmania2002/Cis-GS/issues
13
+ Project-URL: Changelog, https://github.com/Ayushmania2002/Cis-GS/releases
14
+ Keywords: bioinformatics,genomics,cis-regulatory-elements,promoter-analysis,motif-discovery,transcription-factor,TFBS,gene-expression,co-expression,WGCNA,KEGG,enrichment,PlantTFDB,AnimalTFDB,JASPAR,HOCOMOCO
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Intended Audience :: Education
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Classifier: Topic :: Scientific/Engineering :: Visualization
20
+ Classifier: License :: OSI Approved :: MIT License
21
+ Classifier: Programming Language :: Python :: 3
22
+ Classifier: Programming Language :: Python :: 3.9
23
+ Classifier: Programming Language :: Python :: 3.10
24
+ Classifier: Programming Language :: Python :: 3.11
25
+ Classifier: Programming Language :: Python :: 3.12
26
+ Classifier: Operating System :: OS Independent
27
+ Classifier: Operating System :: Microsoft :: Windows
28
+ Classifier: Operating System :: MacOS
29
+ Classifier: Operating System :: POSIX :: Linux
30
+ Classifier: Environment :: X11 Applications :: Qt
31
+ Classifier: Environment :: Console
32
+ Requires-Python: >=3.9
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Requires-Dist: PyQt5>=5.15
36
+ Requires-Dist: numpy>=1.23
37
+ Requires-Dist: pandas>=1.5
38
+ Requires-Dist: scipy>=1.9
39
+ Requires-Dist: scikit-learn>=1.1
40
+ Requires-Dist: matplotlib>=3.6
41
+ Requires-Dist: seaborn>=0.12
42
+ Requires-Dist: matplotlib-venn>=0.11
43
+ Requires-Dist: biopython>=1.79
44
+ Requires-Dist: bcbio-gff>=0.6
45
+ Requires-Dist: logomaker>=0.8
46
+ Requires-Dist: networkx>=3.0
47
+ Requires-Dist: python-louvain>=0.16
48
+ Requires-Dist: Pillow>=9.0
49
+ Requires-Dist: requests>=2.28
50
+ Requires-Dist: tqdm>=4.64
51
+ Requires-Dist: PyMuPDF>=1.22
52
+ Provides-Extra: dev
53
+ Requires-Dist: pyinstaller>=6.0; extra == "dev"
54
+ Requires-Dist: build; extra == "dev"
55
+ Requires-Dist: twine; extra == "dev"
56
+ Requires-Dist: pytest>=7.0; extra == "dev"
57
+ Provides-Extra: docs
58
+ Requires-Dist: sphinx>=7.0; extra == "docs"
59
+ Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
60
+ Requires-Dist: myst-parser>=2.0; extra == "docs"
61
+ Requires-Dist: sphinx-copybutton>=0.5; extra == "docs"
62
+ Dynamic: license-file
63
+
64
+ <div align="center">
65
+
66
+ <img src="assets/banner.png" alt="Cis-GS banner" width="780"/>
67
+
68
+ # Cis-GS &nbsp;&middot;&nbsp; Cis-regulatory Element Genome Scanner
69
+
70
+ **A whole-genome pipeline for discovering cis-regulatory elements, coupling them to expression, and finishing with KEGG enrichment — in one PyQt5 desktop app *and* one interactive CLI.**
71
+
72
+ [![PyPI version](https://img.shields.io/pypi/v/cis-gs.svg?color=16A085)](https://pypi.org/project/cis-gs/)
73
+ [![Python](https://img.shields.io/pypi/pyversions/cis-gs.svg)](https://pypi.org/project/cis-gs/)
74
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
75
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-16A085)](https://Ayushmania2002.github.io/Cis-GS/)
76
+ [![Build](https://github.com/Ayushmania2002/Cis-GS/actions/workflows/docs.yml/badge.svg)](https://github.com/Ayushmania2002/Cis-GS/actions)
77
+ [![DOI](https://img.shields.io/badge/DOI-pending-lightgrey)](#citation)
78
+
79
+ </div>
80
+
81
+ ---
82
+
83
+ ## Table of Contents
84
+
85
+ - [What Cis-GS Does](#what-cis-gs-does)
86
+ - [Highlights of v1.1](#highlights-of-v11)
87
+ - [Installation](#installation)
88
+ - [Quick Start](#quick-start)
89
+ - [The 7-Step Workflow](#the-7-step-workflow)
90
+ - [Supported Motif Databases](#supported-motif-databases)
91
+ - [CLI Reference](#cli-reference)
92
+ - [Programmatic API](#programmatic-api)
93
+ - [Screenshots](#screenshots)
94
+ - [Troubleshooting](#troubleshooting)
95
+ - [Contributing](#contributing)
96
+ - [Citation](#citation)
97
+ - [License](#license)
98
+ - [Contact](#contact)
99
+
100
+ ---
101
+
102
+ ## What Cis-GS Does
103
+
104
+ Cis-GS automates the full **promoter &rarr; motif &rarr; expression &rarr; function** journey that
105
+ plant- and animal-genomics labs run by hand today:
106
+
107
+ 1. **Fetch** a reference genome + annotation directly from NCBI (live Assembly search).
108
+ 2. **Extract** promoter sequences (configurable length, strand-aware, intergenic-clipped) from any GFF3.
109
+ 3. **Scan** those promoters for transcription-factor binding motifs imported from PlantTFDB, AnimalTFDB, JASPAR 2024, or HOCOMOCO v11 — or any user-supplied IUPAC consensus.
110
+ 4. **Render** publication-ready sequence logos and per-gene hit tables with hypergeometric p-values and BH-FDR.
111
+ 5. **Couple** the hits to your expression table (RNA-seq, microarray, qPCR) to flag motifs whose presence tracks expression direction.
112
+ 6. **Build** a co-expression network (Pearson / Spearman / WGCNA-style soft-thresholding), detect modules via Louvain or hierarchical clustering, and visualise eigengenes.
113
+ 7. **Enrich** the top module / cluster against KEGG (live REST queries, 11 700+ organisms) with one-sided hypergeometric ORA + Benjamini-Hochberg FDR.
114
+
115
+ Everything runs **locally**, is **offline-friendly after the first network fetch**, and exports CSV / SVG / PDF at every step.
116
+
117
+ ---
118
+
119
+ ## Highlights of v1.1
120
+
121
+ - **Live KEGG dropdown** — every one of the 11 700+ organisms KEGG knows about, fetched on demand. No more stale species tables.
122
+ - **Live NCBI Taxonomy search** — type any common or Latin name; results stream back as you type.
123
+ - **60× faster ID conversion** — MyGene.info batched POST + progress bar (previously 60+ minutes for 10 k genes; now ~60 s).
124
+ - **Interactive CLI wizards** — `cis-gs wizard` walks you through every step with arrow-key menus. Every subcommand also accepts `-i / --interactive`.
125
+ - **Fuzzy "did you mean...?" suggestions** for CLI typos.
126
+ - **Brand-icon Contact tab** with real-website logos (LinkedIn, GitHub, KEGG, NCBI, PlantTFDB, AnimalTFDB, MyGene).
127
+ - **Modern single-color theme** (teal `#16A085`) with instant light / dark toggle — no more 1-2 s freeze.
128
+ - **First-run NCBI email prompt** — required by the Entrez API, stored only on your machine.
129
+ - **Three Gene-ID-Mapping methods** for the annoying NCBI-LOC vs species-database mismatch (column swap, mapping CSV, GFF3 Dbxref expansion).
130
+
131
+ See the [full release notes](https://github.com/Ayushmania2002/Cis-GS/releases) for the v1.0 &rarr; v1.1 diff.
132
+
133
+ ---
134
+
135
+ ## Installation
136
+
137
+ ### Option 1 — PyPI (Linux / macOS / Windows)
138
+
139
+ ```bash
140
+ pip install cis-gs
141
+ cis-gs --help # CLI
142
+ cis-gs-gui # GUI
143
+ ```
144
+
145
+ Python 3.9+ required. The first GUI launch will pop up a one-time NCBI email prompt.
146
+
147
+ ### Option 2 — Standalone Windows executable
148
+
149
+ Download `Cis-GS.exe` from the [latest release](https://github.com/Ayushmania2002/Cis-GS/releases) page.
150
+ Double-click. No Python install needed. Roughly 120 MB.
151
+
152
+ ### Option 3 — From source
153
+
154
+ ```bash
155
+ git clone https://github.com/Ayushmania2002/Cis-GS.git
156
+ cd Cis-GS
157
+ pip install -e ".[dev,docs]"
158
+ python app_v4_open.py # GUI
159
+ python -m cis_gs --help # CLI
160
+ ```
161
+
162
+ Full build details (PyInstaller spec, build scripts for all 3 OSes, PyPI release workflow):
163
+ see [`BUILD.md`](BUILD.md).
164
+
165
+ ---
166
+
167
+ ## Quick Start
168
+
169
+ ### GUI (one minute)
170
+
171
+ ```bash
172
+ cis-gs-gui
173
+ ```
174
+
175
+ 1. **Step 1 — Promoters**: drop a FASTA + GFF3, set promoter length (default 2 kb), click *Extract*.
176
+ 2. **Step 2 — Motif Search**: click *Import from PlantTFDB* (or AnimalTFDB), pick your species, tick the motifs you want, *Import Selected*.
177
+ 3. **Step 7 — KEGG Enrichment**: pick a KEGG organism from the live dropdown, paste your gene list, run.
178
+
179
+ Done. CSVs and SVGs land in `~/CisGS-Workspace/`.
180
+
181
+ ### CLI (interactive wizard)
182
+
183
+ ```bash
184
+ cis-gs wizard
185
+ ```
186
+
187
+ The wizard auto-detects what you've already produced and offers the next sensible step.
188
+
189
+ ### CLI (one-liners)
190
+
191
+ ```bash
192
+ # Extract 2 kb promoters from a GFF3 + FASTA
193
+ cis-gs extract --fasta genome.fa --gff annot.gff3 --upstream 2000 --out promoters.fa
194
+
195
+ # Scan promoters with a MEME motif file
196
+ cis-gs search --promoters promoters.fa --motifs motifs.meme --out hits.csv
197
+
198
+ # KEGG enrichment
199
+ cis-gs enrich-kegg --organism ath --genes top_module.txt --out kegg.csv
200
+ ```
201
+
202
+ Every command supports `-i / --interactive` if you want to be walked through it.
203
+
204
+ ---
205
+
206
+ ## The 7-Step Workflow
207
+
208
+ | Step | What it does | Output |
209
+ |---|---|---|
210
+ | **1. Promoters** | Strand-aware promoter extraction from any FASTA + GFF3 | `promoters.fa` |
211
+ | **2. Motif Search** | IUPAC / MEME / PlantTFDB / AnimalTFDB scanning with hypergeometric p-values + BH-FDR | `hits.csv`, significance summary |
212
+ | **3. Motif Logos** | logomaker sequence logos with information-content shading | per-motif SVG / PNG |
213
+ | **4. Expression Feeding** | Joins hits with an expression CSV via three Gene-ID-Mapping methods (LOC swap, mapping CSV, GFF3 Dbxref expansion) | `expression_matched.csv` |
214
+ | **5. Coexpression** | Pearson / Spearman / WGCNA-style soft-thresholding, Louvain / hierarchical module detection | `network.gexf`, eigengene plot |
215
+ | **6. K-means** | Elbow + silhouette, deterministic seeding, exportable per-cluster gene lists | `clusters/*.txt` |
216
+ | **7. KEGG Enrichment** | Live REST query against any of 11 700+ KEGG organisms, hypergeometric ORA, BH-FDR, fold-enrichment | `kegg_enrichment.csv` |
217
+
218
+ A full description of each step's algorithm and parameters lives in the
219
+ [online documentation](https://Ayushmania2002.github.io/Cis-GS/).
220
+
221
+ ---
222
+
223
+ ## Supported Motif Databases
224
+
225
+ | Database | Coverage | Access |
226
+ |---|---|---|
227
+ | **PlantTFDB v5** | 157 plant species, ~6 000 motifs | Built-in importer with live species list |
228
+ | **AnimalTFDB v4** | Human, mouse, zebrafish, insects | Built-in importer |
229
+ | **JASPAR 2024 (non-redundant)** | 575 vertebrate + 99 insect motifs | Direct REST download |
230
+ | **HOCOMOCO v11** | ~700 human + ~400 mouse ChIP-Seq motifs | Direct REST download |
231
+ | **Custom IUPAC / MEME** | Anything you can write down | Paste into Step 2 |
232
+
233
+ ---
234
+
235
+ ## CLI Reference
236
+
237
+ ```text
238
+ cis-gs --help
239
+
240
+ usage: cis-gs [-h] {wizard,fetch,extract,search,feed,coexpr,kmeans,enrich-kegg,id-convert} ...
241
+
242
+ wizard Step-by-step wizard (recommended for new users)
243
+ fetch Download a genome + annotation from NCBI
244
+ extract Extract promoter sequences from FASTA + GFF3
245
+ search Scan promoters for motif occurrences
246
+ feed Couple motif hits with an expression table
247
+ coexpr Build a co-expression network
248
+ kmeans K-means clustering with elbow / silhouette
249
+ enrich-kegg KEGG over-representation analysis
250
+ id-convert Convert gene IDs across namespaces (MyGene.info, batched)
251
+ ```
252
+
253
+ Every subcommand accepts `-i / --interactive` for a guided run, and `--help` for full flags.
254
+
255
+ ---
256
+
257
+ ## Programmatic API
258
+
259
+ ```python
260
+ from cis_gs.enrichment import KEGGEnricher
261
+
262
+ e = KEGGEnricher(organism="ath") # Arabidopsis
263
+ result = e.enrich(["AT1G01010", "AT2G18790", "AT3G09600"])
264
+ print(result.table.head())
265
+ ```
266
+
267
+ ```python
268
+ from cis_gs.enrichment.idmap import IDConverter
269
+
270
+ idc = IDConverter(species="human")
271
+ mapping = idc.convert(["TP53", "BRCA1", "MYC"], target="entrez")
272
+ ```
273
+
274
+ Full API reference: [`Ayushmania2002.github.io/Cis-GS/api`](https://Ayushmania2002.github.io/Cis-GS/api.html).
275
+
276
+ ---
277
+
278
+ ## Screenshots
279
+
280
+ <div align="center">
281
+
282
+ | Step 1: Promoter extraction | Step 2: Motif search |
283
+ |---|---|
284
+ | <img src="docs/source/_static/screenshot_step1.png" width="350"/> | <img src="docs/source/_static/screenshot_step2.png" width="350"/> |
285
+
286
+ | Step 5: Coexpression network | Step 7: KEGG enrichment |
287
+ |---|---|
288
+ | <img src="docs/source/_static/screenshot_step5.png" width="350"/> | <img src="docs/source/_static/screenshot_step7.png" width="350"/> |
289
+
290
+ </div>
291
+
292
+ > *Screenshots are placeholder paths; replace with actual PNGs in `docs/source/_static/` before publishing.*
293
+
294
+ ---
295
+
296
+ ## Troubleshooting
297
+
298
+ | Symptom | Likely cause | Fix |
299
+ |---|---|---|
300
+ | `cis-gs-gui: command not found` after `pip install` | Scripts dir not on `PATH` | Run `python -m cis_gs` instead, or add the scripts directory used by `pip install --user` to `PATH` |
301
+ | First NCBI Fetch returns 0 results | NCBI email not set | Settings &rarr; Set NCBI Email, then retry |
302
+ | `KEGG REST unreachable` | Firewall or VPN | Set `HTTPS_PROXY` env var, or use the *Browse & Import* tab with a manually downloaded MEME |
303
+ | Motif hits CSV has empty `gene_symbol` column | Annotation GFF3 not loaded in Step 2 | Re-run with the same GFF3 from Step 1 in *Gene ID Resolution* |
304
+ | Coexpression freezes on > 30k genes | All-vs-all correlation is O(n²) | Pre-filter to expressed genes (TPM > 1) before Step 5 |
305
+
306
+ Open an [issue](https://github.com/Ayushmania2002/Cis-GS/issues) with the log file from `~/CisGS-Workspace/cisgs.log` if you hit anything else.
307
+
308
+ ---
309
+
310
+ ## Contributing
311
+
312
+ Bug reports, feature requests, and pull requests are welcome.
313
+ For substantial contributions please open an issue first to discuss the change.
314
+
315
+ ```bash
316
+ git clone https://github.com/Ayushmania2002/Cis-GS.git
317
+ cd Cis-GS
318
+ pip install -e ".[dev]"
319
+ pytest # run the test suite
320
+ ```
321
+
322
+ ---
323
+
324
+ ## Citation
325
+
326
+ If Cis-GS contributes to a publication, please cite:
327
+
328
+ > Mallick A. *Cis-GS: a unified pipeline for whole-genome cis-regulatory element
329
+ > discovery, expression coupling, and KEGG enrichment.* (manuscript in preparation,
330
+ > Plant Signaling Lab, IISER Tirupati, 2026).
331
+
332
+ BibTeX:
333
+
334
+ ```bibtex
335
+ @software{mallick_cisgs_2026,
336
+ author = {Mallick, Ayushman},
337
+ title = {{Cis-GS}: Cis-regulatory Element Genome Scanner},
338
+ year = {2026},
339
+ url = {https://github.com/Ayushmania2002/Cis-GS},
340
+ version = {1.1.0}
341
+ }
342
+ ```
343
+
344
+ A `CITATION.cff` is included for GitHub's automatic citation widget.
345
+
346
+ ---
347
+
348
+ ## License
349
+
350
+ Released under the [MIT License](LICENSE). Free for academic and commercial use.
351
+
352
+ ---
353
+
354
+ ## Contact
355
+
356
+ **Ayushman Mallick** &middot; ayushmania2002@gmail.com
357
+ Plant Signaling Lab, [IISER Tirupati](https://www.iisertirupati.ac.in/)
358
+
359
+ <sub>&copy; 2026 Ayushman Mallick &middot; Plant Signaling Lab &middot; Cis-GS</sub>
cis_gs-1.1.0/README.md ADDED
@@ -0,0 +1,296 @@
1
+ <div align="center">
2
+
3
+ <img src="assets/banner.png" alt="Cis-GS banner" width="780"/>
4
+
5
+ # Cis-GS &nbsp;&middot;&nbsp; Cis-regulatory Element Genome Scanner
6
+
7
+ **A whole-genome pipeline for discovering cis-regulatory elements, coupling them to expression, and finishing with KEGG enrichment — in one PyQt5 desktop app *and* one interactive CLI.**
8
+
9
+ [![PyPI version](https://img.shields.io/pypi/v/cis-gs.svg?color=16A085)](https://pypi.org/project/cis-gs/)
10
+ [![Python](https://img.shields.io/pypi/pyversions/cis-gs.svg)](https://pypi.org/project/cis-gs/)
11
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
12
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-16A085)](https://Ayushmania2002.github.io/Cis-GS/)
13
+ [![Build](https://github.com/Ayushmania2002/Cis-GS/actions/workflows/docs.yml/badge.svg)](https://github.com/Ayushmania2002/Cis-GS/actions)
14
+ [![DOI](https://img.shields.io/badge/DOI-pending-lightgrey)](#citation)
15
+
16
+ </div>
17
+
18
+ ---
19
+
20
+ ## Table of Contents
21
+
22
+ - [What Cis-GS Does](#what-cis-gs-does)
23
+ - [Highlights of v1.1](#highlights-of-v11)
24
+ - [Installation](#installation)
25
+ - [Quick Start](#quick-start)
26
+ - [The 7-Step Workflow](#the-7-step-workflow)
27
+ - [Supported Motif Databases](#supported-motif-databases)
28
+ - [CLI Reference](#cli-reference)
29
+ - [Programmatic API](#programmatic-api)
30
+ - [Screenshots](#screenshots)
31
+ - [Troubleshooting](#troubleshooting)
32
+ - [Contributing](#contributing)
33
+ - [Citation](#citation)
34
+ - [License](#license)
35
+ - [Contact](#contact)
36
+
37
+ ---
38
+
39
+ ## What Cis-GS Does
40
+
41
+ Cis-GS automates the full **promoter &rarr; motif &rarr; expression &rarr; function** journey that
42
+ plant- and animal-genomics labs run by hand today:
43
+
44
+ 1. **Fetch** a reference genome + annotation directly from NCBI (live Assembly search).
45
+ 2. **Extract** promoter sequences (configurable length, strand-aware, intergenic-clipped) from any GFF3.
46
+ 3. **Scan** those promoters for transcription-factor binding motifs imported from PlantTFDB, AnimalTFDB, JASPAR 2024, or HOCOMOCO v11 — or any user-supplied IUPAC consensus.
47
+ 4. **Render** publication-ready sequence logos and per-gene hit tables with hypergeometric p-values and BH-FDR.
48
+ 5. **Couple** the hits to your expression table (RNA-seq, microarray, qPCR) to flag motifs whose presence tracks expression direction.
49
+ 6. **Build** a co-expression network (Pearson / Spearman / WGCNA-style soft-thresholding), detect modules via Louvain or hierarchical clustering, and visualise eigengenes.
50
+ 7. **Enrich** the top module / cluster against KEGG (live REST queries, 11 700+ organisms) with one-sided hypergeometric ORA + Benjamini-Hochberg FDR.
51
+
52
+ Everything runs **locally**, is **offline-friendly after the first network fetch**, and exports CSV / SVG / PDF at every step.
53
+
54
+ ---
55
+
56
+ ## Highlights of v1.1
57
+
58
+ - **Live KEGG dropdown** — every one of the 11 700+ organisms KEGG knows about, fetched on demand. No more stale species tables.
59
+ - **Live NCBI Taxonomy search** — type any common or Latin name; results stream back as you type.
60
+ - **60× faster ID conversion** — MyGene.info batched POST + progress bar (previously 60+ minutes for 10 k genes; now ~60 s).
61
+ - **Interactive CLI wizards** — `cis-gs wizard` walks you through every step with arrow-key menus. Every subcommand also accepts `-i / --interactive`.
62
+ - **Fuzzy "did you mean...?" suggestions** for CLI typos.
63
+ - **Brand-icon Contact tab** with real-website logos (LinkedIn, GitHub, KEGG, NCBI, PlantTFDB, AnimalTFDB, MyGene).
64
+ - **Modern single-color theme** (teal `#16A085`) with instant light / dark toggle — no more 1-2 s freeze.
65
+ - **First-run NCBI email prompt** — required by the Entrez API, stored only on your machine.
66
+ - **Three Gene-ID-Mapping methods** for the annoying NCBI-LOC vs species-database mismatch (column swap, mapping CSV, GFF3 Dbxref expansion).
67
+
68
+ See the [full release notes](https://github.com/Ayushmania2002/Cis-GS/releases) for the v1.0 &rarr; v1.1 diff.
69
+
70
+ ---
71
+
72
+ ## Installation
73
+
74
+ ### Option 1 — PyPI (Linux / macOS / Windows)
75
+
76
+ ```bash
77
+ pip install cis-gs
78
+ cis-gs --help # CLI
79
+ cis-gs-gui # GUI
80
+ ```
81
+
82
+ Python 3.9+ required. The first GUI launch will pop up a one-time NCBI email prompt.
83
+
84
+ ### Option 2 — Standalone Windows executable
85
+
86
+ Download `Cis-GS.exe` from the [latest release](https://github.com/Ayushmania2002/Cis-GS/releases) page.
87
+ Double-click. No Python install needed. Roughly 120 MB.
88
+
89
+ ### Option 3 — From source
90
+
91
+ ```bash
92
+ git clone https://github.com/Ayushmania2002/Cis-GS.git
93
+ cd Cis-GS
94
+ pip install -e ".[dev,docs]"
95
+ python app_v4_open.py # GUI
96
+ python -m cis_gs --help # CLI
97
+ ```
98
+
99
+ Full build details (PyInstaller spec, build scripts for all 3 OSes, PyPI release workflow):
100
+ see [`BUILD.md`](BUILD.md).
101
+
102
+ ---
103
+
104
+ ## Quick Start
105
+
106
+ ### GUI (one minute)
107
+
108
+ ```bash
109
+ cis-gs-gui
110
+ ```
111
+
112
+ 1. **Step 1 — Promoters**: drop a FASTA + GFF3, set promoter length (default 2 kb), click *Extract*.
113
+ 2. **Step 2 — Motif Search**: click *Import from PlantTFDB* (or AnimalTFDB), pick your species, tick the motifs you want, *Import Selected*.
114
+ 3. **Step 7 — KEGG Enrichment**: pick a KEGG organism from the live dropdown, paste your gene list, run.
115
+
116
+ Done. CSVs and SVGs land in `~/CisGS-Workspace/`.
117
+
118
+ ### CLI (interactive wizard)
119
+
120
+ ```bash
121
+ cis-gs wizard
122
+ ```
123
+
124
+ The wizard auto-detects what you've already produced and offers the next sensible step.
125
+
126
+ ### CLI (one-liners)
127
+
128
+ ```bash
129
+ # Extract 2 kb promoters from a GFF3 + FASTA
130
+ cis-gs extract --fasta genome.fa --gff annot.gff3 --upstream 2000 --out promoters.fa
131
+
132
+ # Scan promoters with a MEME motif file
133
+ cis-gs search --promoters promoters.fa --motifs motifs.meme --out hits.csv
134
+
135
+ # KEGG enrichment
136
+ cis-gs enrich-kegg --organism ath --genes top_module.txt --out kegg.csv
137
+ ```
138
+
139
+ Every command supports `-i / --interactive` if you want to be walked through it.
140
+
141
+ ---
142
+
143
+ ## The 7-Step Workflow
144
+
145
+ | Step | What it does | Output |
146
+ |---|---|---|
147
+ | **1. Promoters** | Strand-aware promoter extraction from any FASTA + GFF3 | `promoters.fa` |
148
+ | **2. Motif Search** | IUPAC / MEME / PlantTFDB / AnimalTFDB scanning with hypergeometric p-values + BH-FDR | `hits.csv`, significance summary |
149
+ | **3. Motif Logos** | logomaker sequence logos with information-content shading | per-motif SVG / PNG |
150
+ | **4. Expression Feeding** | Joins hits with an expression CSV via three Gene-ID-Mapping methods (LOC swap, mapping CSV, GFF3 Dbxref expansion) | `expression_matched.csv` |
151
+ | **5. Coexpression** | Pearson / Spearman / WGCNA-style soft-thresholding, Louvain / hierarchical module detection | `network.gexf`, eigengene plot |
152
+ | **6. K-means** | Elbow + silhouette, deterministic seeding, exportable per-cluster gene lists | `clusters/*.txt` |
153
+ | **7. KEGG Enrichment** | Live REST query against any of 11 700+ KEGG organisms, hypergeometric ORA, BH-FDR, fold-enrichment | `kegg_enrichment.csv` |
154
+
155
+ A full description of each step's algorithm and parameters lives in the
156
+ [online documentation](https://Ayushmania2002.github.io/Cis-GS/).
157
+
158
+ ---
159
+
160
+ ## Supported Motif Databases
161
+
162
+ | Database | Coverage | Access |
163
+ |---|---|---|
164
+ | **PlantTFDB v5** | 157 plant species, ~6 000 motifs | Built-in importer with live species list |
165
+ | **AnimalTFDB v4** | Human, mouse, zebrafish, insects | Built-in importer |
166
+ | **JASPAR 2024 (non-redundant)** | 575 vertebrate + 99 insect motifs | Direct REST download |
167
+ | **HOCOMOCO v11** | ~700 human + ~400 mouse ChIP-Seq motifs | Direct REST download |
168
+ | **Custom IUPAC / MEME** | Anything you can write down | Paste into Step 2 |
169
+
170
+ ---
171
+
172
+ ## CLI Reference
173
+
174
+ ```text
175
+ cis-gs --help
176
+
177
+ usage: cis-gs [-h] {wizard,fetch,extract,search,feed,coexpr,kmeans,enrich-kegg,id-convert} ...
178
+
179
+ wizard Step-by-step wizard (recommended for new users)
180
+ fetch Download a genome + annotation from NCBI
181
+ extract Extract promoter sequences from FASTA + GFF3
182
+ search Scan promoters for motif occurrences
183
+ feed Couple motif hits with an expression table
184
+ coexpr Build a co-expression network
185
+ kmeans K-means clustering with elbow / silhouette
186
+ enrich-kegg KEGG over-representation analysis
187
+ id-convert Convert gene IDs across namespaces (MyGene.info, batched)
188
+ ```
189
+
190
+ Every subcommand accepts `-i / --interactive` for a guided run, and `--help` for full flags.
191
+
192
+ ---
193
+
194
+ ## Programmatic API
195
+
196
+ ```python
197
+ from cis_gs.enrichment import KEGGEnricher
198
+
199
+ e = KEGGEnricher(organism="ath") # Arabidopsis
200
+ result = e.enrich(["AT1G01010", "AT2G18790", "AT3G09600"])
201
+ print(result.table.head())
202
+ ```
203
+
204
+ ```python
205
+ from cis_gs.enrichment.idmap import IDConverter
206
+
207
+ idc = IDConverter(species="human")
208
+ mapping = idc.convert(["TP53", "BRCA1", "MYC"], target="entrez")
209
+ ```
210
+
211
+ Full API reference: [`Ayushmania2002.github.io/Cis-GS/api`](https://Ayushmania2002.github.io/Cis-GS/api.html).
212
+
213
+ ---
214
+
215
+ ## Screenshots
216
+
217
+ <div align="center">
218
+
219
+ | Step 1: Promoter extraction | Step 2: Motif search |
220
+ |---|---|
221
+ | <img src="docs/source/_static/screenshot_step1.png" width="350"/> | <img src="docs/source/_static/screenshot_step2.png" width="350"/> |
222
+
223
+ | Step 5: Coexpression network | Step 7: KEGG enrichment |
224
+ |---|---|
225
+ | <img src="docs/source/_static/screenshot_step5.png" width="350"/> | <img src="docs/source/_static/screenshot_step7.png" width="350"/> |
226
+
227
+ </div>
228
+
229
+ > *Screenshots are placeholder paths; replace with actual PNGs in `docs/source/_static/` before publishing.*
230
+
231
+ ---
232
+
233
+ ## Troubleshooting
234
+
235
+ | Symptom | Likely cause | Fix |
236
+ |---|---|---|
237
+ | `cis-gs-gui: command not found` after `pip install` | Scripts dir not on `PATH` | Run `python -m cis_gs` instead, or add the scripts directory used by `pip install --user` to `PATH` |
238
+ | First NCBI Fetch returns 0 results | NCBI email not set | Settings &rarr; Set NCBI Email, then retry |
239
+ | `KEGG REST unreachable` | Firewall or VPN | Set `HTTPS_PROXY` env var, or use the *Browse & Import* tab with a manually downloaded MEME |
240
+ | Motif hits CSV has empty `gene_symbol` column | Annotation GFF3 not loaded in Step 2 | Re-run with the same GFF3 from Step 1 in *Gene ID Resolution* |
241
+ | Coexpression freezes on > 30k genes | All-vs-all correlation is O(n²) | Pre-filter to expressed genes (TPM > 1) before Step 5 |
242
+
243
+ Open an [issue](https://github.com/Ayushmania2002/Cis-GS/issues) with the log file from `~/CisGS-Workspace/cisgs.log` if you hit anything else.
244
+
245
+ ---
246
+
247
+ ## Contributing
248
+
249
+ Bug reports, feature requests, and pull requests are welcome.
250
+ For substantial contributions please open an issue first to discuss the change.
251
+
252
+ ```bash
253
+ git clone https://github.com/Ayushmania2002/Cis-GS.git
254
+ cd Cis-GS
255
+ pip install -e ".[dev]"
256
+ pytest # run the test suite
257
+ ```
258
+
259
+ ---
260
+
261
+ ## Citation
262
+
263
+ If Cis-GS contributes to a publication, please cite:
264
+
265
+ > Mallick A. *Cis-GS: a unified pipeline for whole-genome cis-regulatory element
266
+ > discovery, expression coupling, and KEGG enrichment.* (manuscript in preparation,
267
+ > Plant Signaling Lab, IISER Tirupati, 2026).
268
+
269
+ BibTeX:
270
+
271
+ ```bibtex
272
+ @software{mallick_cisgs_2026,
273
+ author = {Mallick, Ayushman},
274
+ title = {{Cis-GS}: Cis-regulatory Element Genome Scanner},
275
+ year = {2026},
276
+ url = {https://github.com/Ayushmania2002/Cis-GS},
277
+ version = {1.1.0}
278
+ }
279
+ ```
280
+
281
+ A `CITATION.cff` is included for GitHub's automatic citation widget.
282
+
283
+ ---
284
+
285
+ ## License
286
+
287
+ Released under the [MIT License](LICENSE). Free for academic and commercial use.
288
+
289
+ ---
290
+
291
+ ## Contact
292
+
293
+ **Ayushman Mallick** &middot; ayushmania2002@gmail.com
294
+ Plant Signaling Lab, [IISER Tirupati](https://www.iisertirupati.ac.in/)
295
+
296
+ <sub>&copy; 2026 Ayushman Mallick &middot; Plant Signaling Lab &middot; Cis-GS</sub>