@yangfei_93sky/biocli 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/dist/cli-manifest.json +102 -0
- package/dist/clis/aggregate/workflow-annotate.d.ts +15 -0
- package/dist/clis/aggregate/workflow-annotate.js +323 -0
- package/dist/clis/aggregate/workflow-profile.d.ts +17 -0
- package/dist/clis/aggregate/workflow-profile.js +326 -0
- package/dist/clis/geo/download.js +19 -1
- package/dist/clis/sra/download.js +24 -1
- package/dist/commander-adapter.js +62 -11
- package/dist/output.js +52 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -5,13 +5,13 @@ Query biological databases from the terminal. Agent-first design.
|
|
|
5
5
|
```
|
|
6
6
|
biocli v0.2.0
|
|
7
7
|
NCBI · UniProt · KEGG · STRING · Ensembl · Enrichr
|
|
8
|
-
|
|
8
|
+
44 commands · 6 database backends · 10 workflow commands · 4 download commands
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
## Install
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
npm install -g @biocli
|
|
14
|
+
npm install -g @yangfei_93sky/biocli
|
|
15
15
|
```
|
|
16
16
|
|
|
17
17
|
Requires Node.js >= 20. No API keys needed (optional NCBI key increases rate limit).
|
|
@@ -75,7 +75,7 @@ Designed for **AI agents** (Claude Code, Codex CLI, etc.) — structured JSON ou
|
|
|
75
75
|
|
|
76
76
|
</details>
|
|
77
77
|
|
|
78
|
-
> All three tools were installed (`npm install -g @biocli
|
|
78
|
+
> All three tools were installed (`npm install -g @yangfei_93sky/biocli`, `pip install gget==0.30.3`, `uv tool install biomcp-cli==0.8.19`) and executed on the same machine with the same inputs. Raw stdout/stderr, scoring scripts, and runner scripts are in [`benchmarks/`](benchmarks/). BioMCP excels at biomedical entity breadth (drugs, trials, diseases) not covered by this task set; gget excels at sequence analysis (BLAST, AlphaFold) not covered here.
|
|
79
79
|
|
|
80
80
|
## Quick start
|
|
81
81
|
|
|
@@ -118,6 +118,8 @@ biocli aggregate gene-profile TP53
|
|
|
118
118
|
| `aggregate gene-profile <gene>` | NCBI+UniProt+KEGG+STRING | Gene profile (no literature) |
|
|
119
119
|
| `aggregate workflow-scout <query>` | GEO+SRA | Scout datasets for a research question |
|
|
120
120
|
| `aggregate workflow-prepare <dataset>` | GEO+NCBI+UniProt+KEGG | Prepare research-ready directory with data + annotations |
|
|
121
|
+
| `aggregate workflow-annotate <genes>` | NCBI+UniProt+KEGG+Enrichr | Annotate gene list → genes.csv + pathways.csv + enrichment.csv + report.md |
|
|
122
|
+
| `aggregate workflow-profile <genes>` | NCBI+UniProt+KEGG+STRING+Enrichr | Gene set functional profile → shared pathways, interactions, GO terms |
|
|
121
123
|
|
|
122
124
|
### Database commands (atomic)
|
|
123
125
|
|
package/dist/cli-manifest.json
CHANGED
|
@@ -224,6 +224,57 @@
|
|
|
224
224
|
"type": "ts",
|
|
225
225
|
"modulePath": "aggregate/variant-interpret.js"
|
|
226
226
|
},
|
|
227
|
+
{
|
|
228
|
+
"site": "aggregate",
|
|
229
|
+
"name": "workflow-annotate",
|
|
230
|
+
"description": "Annotate a gene list into a research-ready directory",
|
|
231
|
+
"database": "aggregate",
|
|
232
|
+
"strategy": "public",
|
|
233
|
+
"args": [
|
|
234
|
+
{
|
|
235
|
+
"name": "genes",
|
|
236
|
+
"type": "str",
|
|
237
|
+
"required": true,
|
|
238
|
+
"positional": true,
|
|
239
|
+
"help": "Gene symbols: comma-separated (TP53,BRCA1) or use --input file"
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"name": "outdir",
|
|
243
|
+
"type": "str",
|
|
244
|
+
"required": true,
|
|
245
|
+
"help": "Output directory for annotation results"
|
|
246
|
+
},
|
|
247
|
+
{
|
|
248
|
+
"name": "organism",
|
|
249
|
+
"type": "str",
|
|
250
|
+
"default": "human",
|
|
251
|
+
"required": false,
|
|
252
|
+
"help": "Organism (human, mouse, rat, etc.)"
|
|
253
|
+
},
|
|
254
|
+
{
|
|
255
|
+
"name": "library",
|
|
256
|
+
"type": "str",
|
|
257
|
+
"default": "KEGG_2021_Human",
|
|
258
|
+
"required": false,
|
|
259
|
+
"help": "Enrichr library for enrichment analysis"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"name": "plan",
|
|
263
|
+
"type": "boolean",
|
|
264
|
+
"default": false,
|
|
265
|
+
"required": false,
|
|
266
|
+
"help": "Preview steps without executing"
|
|
267
|
+
}
|
|
268
|
+
],
|
|
269
|
+
"columns": [
|
|
270
|
+
"step",
|
|
271
|
+
"status",
|
|
272
|
+
"detail"
|
|
273
|
+
],
|
|
274
|
+
"timeout": 120,
|
|
275
|
+
"type": "ts",
|
|
276
|
+
"modulePath": "aggregate/workflow-annotate.js"
|
|
277
|
+
},
|
|
227
278
|
{
|
|
228
279
|
"site": "aggregate",
|
|
229
280
|
"name": "workflow-prepare",
|
|
@@ -267,6 +318,57 @@
|
|
|
267
318
|
"type": "ts",
|
|
268
319
|
"modulePath": "aggregate/workflow-prepare.js"
|
|
269
320
|
},
|
|
321
|
+
{
|
|
322
|
+
"site": "aggregate",
|
|
323
|
+
"name": "workflow-profile",
|
|
324
|
+
"description": "Functional profile for a gene set (interactions, GO terms, shared pathways)",
|
|
325
|
+
"database": "aggregate",
|
|
326
|
+
"strategy": "public",
|
|
327
|
+
"args": [
|
|
328
|
+
{
|
|
329
|
+
"name": "genes",
|
|
330
|
+
"type": "str",
|
|
331
|
+
"required": true,
|
|
332
|
+
"positional": true,
|
|
333
|
+
"help": "Gene symbols: comma-separated (TP53,BRCA1,EGFR,MYC,CDK2)"
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
"name": "outdir",
|
|
337
|
+
"type": "str",
|
|
338
|
+
"required": true,
|
|
339
|
+
"help": "Output directory"
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
"name": "organism",
|
|
343
|
+
"type": "str",
|
|
344
|
+
"default": "human",
|
|
345
|
+
"required": false,
|
|
346
|
+
"help": "Organism (human, mouse, rat, etc.)"
|
|
347
|
+
},
|
|
348
|
+
{
|
|
349
|
+
"name": "library",
|
|
350
|
+
"type": "str",
|
|
351
|
+
"default": "KEGG_2021_Human",
|
|
352
|
+
"required": false,
|
|
353
|
+
"help": "Enrichr library"
|
|
354
|
+
},
|
|
355
|
+
{
|
|
356
|
+
"name": "plan",
|
|
357
|
+
"type": "boolean",
|
|
358
|
+
"default": false,
|
|
359
|
+
"required": false,
|
|
360
|
+
"help": "Preview steps without executing"
|
|
361
|
+
}
|
|
362
|
+
],
|
|
363
|
+
"columns": [
|
|
364
|
+
"step",
|
|
365
|
+
"status",
|
|
366
|
+
"detail"
|
|
367
|
+
],
|
|
368
|
+
"timeout": 180,
|
|
369
|
+
"type": "ts",
|
|
370
|
+
"modulePath": "aggregate/workflow-profile.js"
|
|
371
|
+
},
|
|
270
372
|
{
|
|
271
373
|
"site": "aggregate",
|
|
272
374
|
"name": "workflow-scout",
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* aggregate/workflow-annotate — Annotate a gene list into a research-ready directory.
|
|
3
|
+
*
|
|
4
|
+
* Input: gene list (comma-separated, --input file, or stdin)
|
|
5
|
+
* Output directory:
|
|
6
|
+
* summary.json — high-level overview (gene count, sources, warnings)
|
|
7
|
+
* genes.csv — per-gene annotations (symbol, name, function, chromosome, etc.)
|
|
8
|
+
* pathways.csv — all KEGG pathways linked to any input gene
|
|
9
|
+
* enrichment.csv — Enrichr pathway enrichment results
|
|
10
|
+
* report.md — human-readable Markdown report
|
|
11
|
+
* manifest.json — full provenance (biocli version, run timestamp, sources, inputs)
|
|
12
|
+
*
|
|
13
|
+
* Cross-queries: NCBI Gene + UniProt + KEGG + Enrichr
|
|
14
|
+
*/
|
|
15
|
+
export {};
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* aggregate/workflow-annotate — Annotate a gene list into a research-ready directory.
|
|
3
|
+
*
|
|
4
|
+
* Input: gene list (comma-separated, --input file, or stdin)
|
|
5
|
+
* Output directory:
|
|
6
|
+
* summary.json — high-level overview (gene count, sources, warnings)
|
|
7
|
+
* genes.csv — per-gene annotations (symbol, name, function, chromosome, etc.)
|
|
8
|
+
* pathways.csv — all KEGG pathways linked to any input gene
|
|
9
|
+
* enrichment.csv — Enrichr pathway enrichment results
|
|
10
|
+
* report.md — human-readable Markdown report
|
|
11
|
+
* manifest.json — full provenance (biocli version, run timestamp, sources, inputs)
|
|
12
|
+
*
|
|
13
|
+
* Cross-queries: NCBI Gene + UniProt + KEGG + Enrichr
|
|
14
|
+
*/
|
|
15
|
+
import { cli, Strategy } from '../../registry.js';
|
|
16
|
+
import { CliError } from '../../errors.js';
|
|
17
|
+
import { wrapResult } from '../../types.js';
|
|
18
|
+
import { createHttpContextForDatabase } from '../../databases/index.js';
|
|
19
|
+
import { buildEutilsUrl } from '../../databases/ncbi.js';
|
|
20
|
+
import { buildUniprotUrl } from '../../databases/uniprot.js';
|
|
21
|
+
import { buildKeggUrl, parseKeggTsv } from '../../databases/kegg.js';
|
|
22
|
+
import { submitGeneList, getEnrichment } from '../../databases/enrichr.js';
|
|
23
|
+
import { parseGeneSummaries } from '../_shared/xml-helpers.js';
|
|
24
|
+
import { resolveOrganism } from '../_shared/organism-db.js';
|
|
25
|
+
import { mkdirSync, existsSync, writeFileSync } from 'node:fs';
|
|
26
|
+
import { join } from 'node:path';
|
|
27
|
+
import { getVersion } from '../../version.js';
|
|
28
|
+
// ── CSV helper ───────────────────────────────────────────────────────────────
|
|
29
|
+
/**
 * Serialize an array of row objects into CSV text.
 *
 * @param {string[]} headers - Column names; also the property keys read from each row.
 * @param {Array<Record<string, unknown>>} rows - One object per output line.
 * @returns {string} Header line followed by one line per row, each terminated by `\n`.
 *
 * A cell is wrapped in double quotes (with embedded quotes doubled) when it
 * contains a comma, a double quote, a line feed, or a carriage return.
 * `null` / `undefined` cells are written as empty strings.
 */
function toCsv(headers, rows) {
    // A bare \r splits a record in most CSV readers, so it must force quoting too.
    const needsQuoting = /[",\r\n]/;
    const formatCell = (value) => {
        const text = String(value ?? '');
        return needsQuoting.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
    };
    // Header cells go through the same escaping so an unusual column name cannot corrupt the file.
    const lines = [headers.map(formatCell).join(',')];
    for (const row of rows) {
        lines.push(headers.map((h) => formatCell(row[h])).join(','));
    }
    return lines.join('\n') + '\n';
}
|
|
41
|
+
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
42
|
+
cli({
    site: 'aggregate',
    name: 'workflow-annotate',
    description: 'Annotate a gene list into a research-ready directory',
    database: 'aggregate',
    strategy: Strategy.PUBLIC,
    defaultFormat: 'json',
    timeoutSeconds: 120,
    args: [
        { name: 'genes', positional: true, required: true, help: 'Gene symbols: comma-separated (TP53,BRCA1) or use --input file' },
        { name: 'outdir', required: true, help: 'Output directory for annotation results' },
        { name: 'organism', default: 'human', help: 'Organism (human, mouse, rat, etc.)' },
        { name: 'library', default: 'KEGG_2021_Human', help: 'Enrichr library for enrichment analysis' },
        { name: 'plan', type: 'boolean', default: false, help: 'Preview steps without executing' },
    ],
    columns: ['step', 'status', 'detail'],
    /**
     * Run the annotation workflow for a comma-separated gene list.
     *
     * Steps, in order: per-gene NCBI Gene + UniProt lookup (genes.csv), KEGG
     * pathway links (pathways.csv), Enrichr enrichment (enrichment.csv),
     * Markdown report (report.md), then summary.json and manifest.json.
     * Failures of individual remote lookups are recorded in `warnings` and
     * never abort the run; file-system errors propagate.
     *
     * @param {unknown} _ctx - Unused command context.
     * @param {object} args - Parsed CLI arguments (genes, outdir, organism, library, plan).
     * @returns The `wrapResult(...)` envelope: the plan when `--plan` is set,
     *          otherwise `{ outdir, genes, steps, summary }`.
     * @throws {CliError} 'ARGUMENT' when no gene symbol remains after parsing.
     */
    func: async (_ctx, args) => {
        // Turn any thrown value into a printable warning message.
        const describeError = (err) => (err instanceof Error ? err.message : String(err));
        // Keep free text inside one Markdown table cell: no line breaks, no raw pipes.
        const mdCell = (value) => String(value ?? '').replace(/\r\n|\r|\n/g, ' ').replace(/\|/g, '\\|');
        const geneInput = String(args.genes);
        const genes = geneInput.split(',').map(s => s.trim()).filter(Boolean);
        const outdir = String(args.outdir);
        const orgInput = String(args.organism);
        const library = String(args.library);
        const planOnly = Boolean(args.plan);
        if (genes.length === 0) {
            throw new CliError('ARGUMENT', 'At least one gene symbol is required', 'Example: biocli aggregate workflow-annotate TP53,BRCA1,EGFR --outdir ./results');
        }
        const org = resolveOrganism(orgInput);
        const sources = [];
        const warnings = [];
        const steps = [];
        // ── Plan mode ───────────────────────────────────────────────────────
        // Describe the steps only: no network requests, no files written.
        if (planOnly) {
            return wrapResult({
                plan: [
                    { step: 'gene-annotations', detail: `Query NCBI Gene + UniProt for ${genes.length} gene(s)` },
                    { step: 'pathways', detail: `Query KEGG pathways for each gene` },
                    { step: 'enrichment', detail: `Run Enrichr enrichment (${library}) for gene set` },
                    { step: 'output', detail: `Write genes.csv, pathways.csv, enrichment.csv, report.md, summary.json, manifest.json → ${outdir}` },
                ],
                genes,
                organism: org.name,
                outdir,
            }, { ids: {}, sources: [], warnings: [], query: genes.join(','), organism: org.name });
        }
        // Create output directory. With `recursive: true` an existing directory
        // is not an error, so no separate existence check is needed.
        mkdirSync(outdir, { recursive: true });
        const ncbiCtx = createHttpContextForDatabase('ncbi');
        const uniprotCtx = createHttpContextForDatabase('uniprot');
        const keggCtx = createHttpContextForDatabase('kegg');
        // One timestamp shared by report.md and manifest.json so they agree.
        const createdAt = new Date().toISOString();
        // ── Step 1: Gene annotations (NCBI + UniProt) ───────────────────────
        const geneAnnotations = [];
        for (const gene of genes) {
            const annot = {
                symbol: gene, ncbiGeneId: '', name: '', summary: '',
                chromosome: '', location: '', uniprotAccession: '',
                proteinFunction: '', subcellularLocation: '', goTerms: '',
            };
            // NCBI Gene: search by symbol + organism, then summarize the hits.
            try {
                const searchResult = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
                    db: 'gene', term: `${gene}[Gene Name] AND ${org.name}[Organism]`,
                    retmax: '5', retmode: 'json',
                }));
                const ids = searchResult?.esearchresult?.idlist ?? [];
                if (ids.length > 0) {
                    const summaryResult = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                        db: 'gene', id: ids.join(','), retmode: 'json',
                    }));
                    const parsed = parseGeneSummaries(summaryResult);
                    // Prefer an exact (case-insensitive) symbol match; otherwise take the first hit.
                    const best = parsed.find(g => g.symbol.toUpperCase() === gene.toUpperCase()) ?? parsed[0];
                    if (best) {
                        annot.ncbiGeneId = best.geneId;
                        annot.name = best.name;
                        annot.summary = best.summary;
                        annot.chromosome = best.chromosome;
                        annot.location = best.location;
                    }
                }
            }
            catch (err) {
                warnings.push(`NCBI Gene ${gene}: ${describeError(err)}`);
            }
            // UniProt: reviewed entries only, restricted to the organism's taxonomy id.
            try {
                const upResult = await uniprotCtx.fetchJson(buildUniprotUrl('/uniprotkb/search', {
                    query: `gene:${gene} AND organism_id:${org.taxId} AND reviewed:true`,
                    format: 'json', size: '5',
                }));
                const results = (upResult?.results ?? []);
                if (results.length > 0) {
                    const getGeneName = (e) => String(e.genes?.[0]?.geneName?.value ?? '');
                    const exact = results.find(e => getGeneName(e).toUpperCase() === gene.toUpperCase());
                    const entry = exact ?? results[0];
                    annot.uniprotAccession = String(entry.primaryAccession ?? '');
                    const comments = (entry.comments ?? []);
                    const funcComment = comments.find(c => c.commentType === 'FUNCTION');
                    const funcTexts = (funcComment?.texts ?? []);
                    annot.proteinFunction = funcTexts.map(t => String(t.value ?? '')).join(' ');
                    const locComment = comments.find(c => c.commentType === 'SUBCELLULAR LOCATION');
                    const locEntries = (locComment?.subcellularLocations ?? []);
                    annot.subcellularLocation = locEntries.map(l => String(l.location?.value ?? '')).filter(Boolean).join(', ');
                    const xrefs = (entry.uniProtKBCrossReferences ?? []);
                    const goTerms = xrefs
                        .filter(x => x.database === 'GO')
                        .map(x => String((x.properties ?? []).find(p => p.key === 'GoTerm')?.value ?? ''))
                        // Drop cross-references without a term name so they do not
                        // use up the 10-term budget or leave empty "; ;" gaps.
                        .filter(Boolean);
                    annot.goTerms = goTerms.slice(0, 10).join('; ');
                }
            }
            catch (err) {
                warnings.push(`UniProt ${gene}: ${describeError(err)}`);
            }
            geneAnnotations.push(annot);
        }
        if (geneAnnotations.some(a => a.ncbiGeneId))
            sources.push('NCBI Gene');
        if (geneAnnotations.some(a => a.uniprotAccession))
            sources.push('UniProt');
        writeFileSync(join(outdir, 'genes.csv'), toCsv(['symbol', 'ncbiGeneId', 'name', 'chromosome', 'location', 'uniprotAccession', 'proteinFunction', 'subcellularLocation', 'goTerms', 'summary'], geneAnnotations));
        steps.push({ step: 'gene-annotations', status: 'done', detail: `${geneAnnotations.length} gene(s) → genes.csv` });
        // ── Step 2: KEGG pathways ───────────────────────────────────────────
        const pathwayLinks = [];
        const pathIdSet = new Set();
        for (const annot of geneAnnotations) {
            // KEGG is queried by NCBI gene id, so genes NCBI could not resolve are skipped.
            if (!annot.ncbiGeneId)
                continue;
            try {
                const linkText = await keggCtx.fetchText(buildKeggUrl(`/link/pathway/${org.keggOrg}:${annot.ncbiGeneId}`));
                if (linkText && linkText.trim()) {
                    for (const l of parseKeggTsv(linkText)) {
                        const pid = l.value.replace(/^path:/, '');
                        pathIdSet.add(pid);
                        // Start with the id as the display name so rows stay
                        // distinguishable if the name lookup below fails.
                        pathwayLinks.push({ gene: annot.symbol, pathwayId: pid, pathwayName: pid });
                    }
                }
            }
            catch (err) {
                // Still non-fatal, but surfaced so a missing pathway list can be explained.
                warnings.push(`KEGG pathways ${annot.symbol}: ${describeError(err)}`);
            }
        }
        if (pathIdSet.size > 0) {
            // The links themselves came from KEGG, whether or not names resolve.
            sources.push('KEGG');
            // Resolve pathway names
            try {
                const listText = await keggCtx.fetchText(buildKeggUrl(`/list/pathway/${org.keggOrg}`));
                const allPaths = parseKeggTsv(listText);
                // Strip the trailing " - <organism>" suffix from each pathway title.
                const nameMap = new Map(allPaths.map(p => [p.key, p.value.replace(/ - .*$/, '')]));
                for (const link of pathwayLinks) {
                    link.pathwayName = nameMap.get(link.pathwayId) ?? link.pathwayId;
                }
            }
            catch (err) {
                warnings.push(`KEGG pathway names: ${describeError(err)}`);
            }
        }
        writeFileSync(join(outdir, 'pathways.csv'), toCsv(['gene', 'pathwayId', 'pathwayName'], pathwayLinks));
        steps.push({ step: 'pathways', status: 'done', detail: `${pathwayLinks.length} pathway links (${pathIdSet.size} unique) → pathways.csv` });
        // ── Step 3: Enrichment (Enrichr) ────────────────────────────────────
        const enrichmentRows = [];
        // Why enrichment produced no rows; only used when enrichmentRows is empty.
        let enrichmentSkipReason = '';
        if (genes.length >= 2) {
            try {
                const userListId = await submitGeneList(genes);
                const results = await getEnrichment(userListId, library);
                const limit = Math.min(results.length, 30);
                for (let i = 0; i < limit; i++) {
                    const r = results[i];
                    enrichmentRows.push({
                        rank: i + 1,
                        term: String(r.term),
                        library,
                        adjustedPValue: Number(r.adjustedPValue).toExponential(2),
                        combinedScore: Number(r.combinedScore).toFixed(1),
                        genes: String(r.genes),
                    });
                }
                sources.push('Enrichr');
                if (enrichmentRows.length === 0)
                    enrichmentSkipReason = `no enriched terms returned for ${library}`;
            }
            catch (err) {
                warnings.push(`Enrichr: ${describeError(err)}`);
                enrichmentSkipReason = 'Enrichr request failed, see warnings';
            }
        }
        else {
            warnings.push('Enrichment skipped: at least 2 genes required');
            enrichmentSkipReason = 'need ≥ 2 genes';
        }
        writeFileSync(join(outdir, 'enrichment.csv'), toCsv(['rank', 'term', 'library', 'adjustedPValue', 'combinedScore', 'genes'], enrichmentRows));
        steps.push({
            step: 'enrichment',
            status: enrichmentRows.length > 0 ? 'done' : 'skipped',
            detail: enrichmentRows.length > 0 ? `${enrichmentRows.length} terms → enrichment.csv` : `skipped (${enrichmentSkipReason})`,
        });
        // ── Step 4: report.md ───────────────────────────────────────────────
        const reportLines = [
            `# Gene Annotation Report`,
            ``,
            `**Generated by biocli** v${getVersion()} on ${createdAt}`,
            ``,
            `## Input`,
            ``,
            `- **Genes**: ${genes.join(', ')}`,
            `- **Organism**: ${org.name}`,
            `- **Sources**: ${sources.join(', ') || 'none'}`,
            // Only emit the warnings bullet when there is something to report.
            ...(warnings.length > 0 ? [`- **Warnings**: ${warnings.length}`] : []),
            ``,
            `## Gene Summary`,
            ``,
            `| Symbol | Name | Chromosome | UniProt | Function |`,
            `|--------|------|------------|---------|----------|`,
        ];
        for (const g of geneAnnotations) {
            // Keep the table readable: long function text is cut to 80 characters.
            const func = g.proteinFunction.length > 80 ? g.proteinFunction.slice(0, 80) + '...' : g.proteinFunction;
            reportLines.push(`| ${mdCell(g.symbol)} | ${mdCell(g.name)} | ${mdCell(g.chromosome)} | ${mdCell(g.uniprotAccession)} | ${mdCell(func)} |`);
        }
        if (pathwayLinks.length > 0) {
            reportLines.push('', `## KEGG Pathways (${pathIdSet.size} unique)`, '', '| Pathway | Genes |', '|---------|-------|');
            // Group by pathway id (not display name) so two pathways can never be merged.
            const pathwayGenes = new Map();
            for (const link of pathwayLinks) {
                const entry = pathwayGenes.get(link.pathwayId) ?? { name: link.pathwayName, genes: new Set() };
                entry.genes.add(link.gene);
                pathwayGenes.set(link.pathwayId, entry);
            }
            // The report shows at most 20 pathways; pathways.csv holds the full list.
            for (const entry of [...pathwayGenes.values()].slice(0, 20)) {
                reportLines.push(`| ${mdCell(entry.name)} | ${mdCell([...entry.genes].join(', '))} |`);
            }
        }
        if (enrichmentRows.length > 0) {
            reportLines.push('', `## Enrichment Analysis (${library})`, '', '| Rank | Term | Adj. P-value | Genes |', '|------|------|-------------|-------|');
            // The report shows at most 15 terms; enrichment.csv holds up to 30.
            for (const r of enrichmentRows.slice(0, 15)) {
                reportLines.push(`| ${r.rank} | ${mdCell(r.term)} | ${r.adjustedPValue} | ${mdCell(r.genes)} |`);
            }
        }
        if (warnings.length > 0) {
            reportLines.push('', '## Warnings', '');
            for (const w of warnings)
                reportLines.push(`- ${w}`);
        }
        reportLines.push('', '---', `*Report generated by [biocli](https://github.com/youngfly93/biocli)*`);
        writeFileSync(join(outdir, 'report.md'), reportLines.join('\n') + '\n');
        steps.push({ step: 'report', status: 'done', detail: `report.md → ${outdir}` });
        // ── Step 5: summary.json + manifest.json ────────────────────────────
        const summary = {
            geneCount: genes.length,
            annotatedCount: geneAnnotations.filter(a => a.ncbiGeneId).length,
            pathwayCount: pathIdSet.size,
            enrichmentTerms: enrichmentRows.length,
            sources,
            warnings,
        };
        writeFileSync(join(outdir, 'summary.json'), JSON.stringify(summary, null, 2));
        const manifest = {
            biocliVersion: getVersion(),
            createdAt,
            command: 'workflow-annotate',
            input: { genes, organism: org.name, library },
            output: {
                'genes.csv': `${geneAnnotations.length} genes`,
                'pathways.csv': `${pathwayLinks.length} pathway links`,
                'enrichment.csv': `${enrichmentRows.length} terms`,
                'report.md': 'Markdown report',
                'summary.json': 'Overview statistics',
            },
            sources,
            warnings,
        };
        steps.push({ step: 'manifest', status: 'done', detail: `summary.json + manifest.json → ${outdir}` });
        writeFileSync(join(outdir, 'manifest.json'), JSON.stringify(manifest, null, 2));
        return wrapResult({
            outdir,
            genes,
            steps,
            summary,
        }, {
            ids: {},
            sources,
            warnings,
            query: genes.join(','),
            organism: org.name,
        });
    },
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* aggregate/workflow-profile — Functional profile for a gene set.
|
|
3
|
+
*
|
|
4
|
+
* Unlike workflow-annotate (per-gene annotations), this command focuses on
|
|
5
|
+
* the SET-LEVEL view: shared pathways, interaction network, GO term
|
|
6
|
+
* distribution, and enrichment. Think "what does this gene set DO together?"
|
|
7
|
+
*
|
|
8
|
+
* Output directory:
|
|
9
|
+
* profiles.json — per-gene profile summaries (from gene-profile)
|
|
10
|
+
* interactions.csv — STRING protein-protein interaction network
|
|
11
|
+
* go_summary.csv — GO term frequency across the gene set
|
|
12
|
+
* shared_pathways.csv — KEGG pathways shared by 2+ input genes
|
|
13
|
+
* enrichment.csv — Enrichr enrichment results
|
|
14
|
+
* report.md — human-readable Markdown report
|
|
15
|
+
* manifest.json — provenance
|
|
16
|
+
*/
|
|
17
|
+
export {};
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* aggregate/workflow-profile — Functional profile for a gene set.
|
|
3
|
+
*
|
|
4
|
+
* Unlike workflow-annotate (per-gene annotations), this command focuses on
|
|
5
|
+
* the SET-LEVEL view: shared pathways, interaction network, GO term
|
|
6
|
+
* distribution, and enrichment. Think "what does this gene set DO together?"
|
|
7
|
+
*
|
|
8
|
+
* Output directory:
|
|
9
|
+
* profiles.json — per-gene profile summaries (from gene-profile)
|
|
10
|
+
* interactions.csv — STRING protein-protein interaction network
|
|
11
|
+
* go_summary.csv — GO term frequency across the gene set
|
|
12
|
+
* shared_pathways.csv — KEGG pathways shared by 2+ input genes
|
|
13
|
+
* enrichment.csv — Enrichr enrichment results
|
|
14
|
+
* report.md — human-readable Markdown report
|
|
15
|
+
* manifest.json — provenance
|
|
16
|
+
*/
|
|
17
|
+
import { cli, Strategy } from '../../registry.js';
|
|
18
|
+
import { CliError } from '../../errors.js';
|
|
19
|
+
import { wrapResult } from '../../types.js';
|
|
20
|
+
import { createHttpContextForDatabase } from '../../databases/index.js';
|
|
21
|
+
import { buildEutilsUrl } from '../../databases/ncbi.js';
|
|
22
|
+
import { buildUniprotUrl } from '../../databases/uniprot.js';
|
|
23
|
+
import { buildKeggUrl, parseKeggTsv } from '../../databases/kegg.js';
|
|
24
|
+
import { buildStringUrl } from '../../databases/string-db.js';
|
|
25
|
+
import { submitGeneList, getEnrichment } from '../../databases/enrichr.js';
|
|
26
|
+
import { parseGeneSummaries } from '../_shared/xml-helpers.js';
|
|
27
|
+
import { resolveOrganism } from '../_shared/organism-db.js';
|
|
28
|
+
import { mkdirSync, existsSync, writeFileSync } from 'node:fs';
|
|
29
|
+
import { join } from 'node:path';
|
|
30
|
+
import { getVersion } from '../../version.js';
|
|
31
|
+
// ── CSV helper ───────────────────────────────────────────────────────────────
|
|
32
|
+
/**
 * Render row objects as CSV text, one line per row under a header line.
 *
 * @param {string[]} headers - Column names, also used as the keys looked up on each row.
 * @param {Array<Record<string, unknown>>} rows - Records to write.
 * @returns {string} CSV text in which every line, including the last, ends with `\n`.
 *
 * Cells containing a comma, double quote, line feed or carriage return are
 * quoted, with embedded quotes doubled. Missing values become empty cells.
 */
function toCsv(headers, rows) {
    const quoteIfNeeded = (v) => {
        const s = String(v ?? '');
        // \r is included: an unquoted carriage return breaks a record in most CSV readers.
        const special = s.includes(',') || s.includes('"') || s.includes('\n') || s.includes('\r');
        return special ? `"${s.replace(/"/g, '""')}"` : s;
    };
    // The header line is escaped the same way as data cells.
    const headerLine = headers.map(quoteIfNeeded).join(',');
    const dataLines = rows.map((r) => headers.map((h) => quoteIfNeeded(r[h])).join(','));
    return [headerLine, ...dataLines].join('\n') + '\n';
}
|
|
40
|
+
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
41
|
+
cli({
|
|
42
|
+
site: 'aggregate',
|
|
43
|
+
name: 'workflow-profile',
|
|
44
|
+
description: 'Functional profile for a gene set (interactions, GO terms, shared pathways)',
|
|
45
|
+
database: 'aggregate',
|
|
46
|
+
strategy: Strategy.PUBLIC,
|
|
47
|
+
defaultFormat: 'json',
|
|
48
|
+
timeoutSeconds: 180,
|
|
49
|
+
args: [
|
|
50
|
+
{ name: 'genes', positional: true, required: true, help: 'Gene symbols: comma-separated (TP53,BRCA1,EGFR,MYC,CDK2)' },
|
|
51
|
+
{ name: 'outdir', required: true, help: 'Output directory' },
|
|
52
|
+
{ name: 'organism', default: 'human', help: 'Organism (human, mouse, rat, etc.)' },
|
|
53
|
+
{ name: 'library', default: 'KEGG_2021_Human', help: 'Enrichr library' },
|
|
54
|
+
{ name: 'plan', type: 'boolean', default: false, help: 'Preview steps without executing' },
|
|
55
|
+
],
|
|
56
|
+
columns: ['step', 'status', 'detail'],
|
|
57
|
+
func: async (_ctx, args) => {
|
|
58
|
+
const genes = String(args.genes).split(',').map(s => s.trim()).filter(Boolean);
|
|
59
|
+
const outdir = String(args.outdir);
|
|
60
|
+
const library = String(args.library);
|
|
61
|
+
const planOnly = Boolean(args.plan);
|
|
62
|
+
if (genes.length < 2) {
|
|
63
|
+
throw new CliError('ARGUMENT', 'At least 2 gene symbols required for profiling', 'Example: biocli aggregate workflow-profile TP53,BRCA1,EGFR,MYC,CDK2 --outdir ./profile');
|
|
64
|
+
}
|
|
65
|
+
const org = resolveOrganism(String(args.organism));
|
|
66
|
+
const sources = [];
|
|
67
|
+
const warnings = [];
|
|
68
|
+
const steps = [];
|
|
69
|
+
if (planOnly) {
|
|
70
|
+
return wrapResult({
|
|
71
|
+
plan: [
|
|
72
|
+
{ step: 'gene-profiles', detail: `Query NCBI Gene + UniProt for ${genes.length} gene(s)` },
|
|
73
|
+
{ step: 'interactions', detail: `Query STRING network for all ${genes.length} genes` },
|
|
74
|
+
{ step: 'pathways', detail: `Find KEGG pathways shared by 2+ genes` },
|
|
75
|
+
{ step: 'go-summary', detail: `Aggregate GO terms across gene set` },
|
|
76
|
+
{ step: 'enrichment', detail: `Run Enrichr (${library})` },
|
|
77
|
+
{ step: 'output', detail: `Write profiles.json, interactions.csv, go_summary.csv, shared_pathways.csv, enrichment.csv, report.md → ${outdir}` },
|
|
78
|
+
],
|
|
79
|
+
genes, organism: org.name, outdir,
|
|
80
|
+
}, { ids: {}, sources: [], warnings: [], query: genes.join(','), organism: org.name });
|
|
81
|
+
}
|
|
82
|
+
if (!existsSync(outdir))
|
|
83
|
+
mkdirSync(outdir, { recursive: true });
|
|
84
|
+
const ncbiCtx = createHttpContextForDatabase('ncbi');
|
|
85
|
+
const uniprotCtx = createHttpContextForDatabase('uniprot');
|
|
86
|
+
const keggCtx = createHttpContextForDatabase('kegg');
|
|
87
|
+
const stringCtx = createHttpContextForDatabase('string');
|
|
88
|
+
// ── Step 1: Per-gene profiles (NCBI + UniProt) ──────────────────────
|
|
89
|
+
const profiles = [];
|
|
90
|
+
const geneIds = {}; // symbol → ncbi gene id
|
|
91
|
+
const allGoTerms = [];
|
|
92
|
+
for (const gene of genes) {
|
|
93
|
+
const profile = { symbol: gene };
|
|
94
|
+
try {
|
|
95
|
+
const sr = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
|
|
96
|
+
db: 'gene', term: `${gene}[Gene Name] AND ${org.name}[Organism]`,
|
|
97
|
+
retmax: '5', retmode: 'json',
|
|
98
|
+
}));
|
|
99
|
+
const ids = sr?.esearchresult?.idlist ?? [];
|
|
100
|
+
if (ids.length > 0) {
|
|
101
|
+
const summ = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', { db: 'gene', id: ids.join(','), retmode: 'json' }));
|
|
102
|
+
const parsed = parseGeneSummaries(summ);
|
|
103
|
+
const best = parsed.find(g => g.symbol.toUpperCase() === gene.toUpperCase()) ?? parsed[0];
|
|
104
|
+
if (best) {
|
|
105
|
+
profile.ncbiGeneId = best.geneId;
|
|
106
|
+
profile.name = best.name;
|
|
107
|
+
profile.chromosome = best.chromosome;
|
|
108
|
+
geneIds[gene] = best.geneId;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
catch (err) {
|
|
113
|
+
warnings.push(`NCBI ${gene}: ${err instanceof Error ? err.message : String(err)}`);
|
|
114
|
+
}
|
|
115
|
+
try {
|
|
116
|
+
const upResult = await uniprotCtx.fetchJson(buildUniprotUrl('/uniprotkb/search', {
|
|
117
|
+
query: `gene:${gene} AND organism_id:${org.taxId} AND reviewed:true`, format: 'json', size: '5',
|
|
118
|
+
}));
|
|
119
|
+
const results = (upResult?.results ?? []);
|
|
120
|
+
if (results.length > 0) {
|
|
121
|
+
const getGN = (e) => String(e.genes?.[0]?.geneName?.value ?? '');
|
|
122
|
+
const entry = results.find(e => getGN(e).toUpperCase() === gene.toUpperCase()) ?? results[0];
|
|
123
|
+
profile.uniprotAccession = entry.primaryAccession;
|
|
124
|
+
const comments = (entry.comments ?? []);
|
|
125
|
+
const funcComment = comments.find(c => c.commentType === 'FUNCTION');
|
|
126
|
+
const funcTexts = (funcComment?.texts ?? []);
|
|
127
|
+
profile.function = funcTexts.map(t => String(t.value ?? '')).join(' ');
|
|
128
|
+
const xrefs = (entry.uniProtKBCrossReferences ?? []);
|
|
129
|
+
xrefs.filter(x => x.database === 'GO').forEach(x => {
|
|
130
|
+
const id = String(x.id ?? '');
|
|
131
|
+
const props = (x.properties ?? []);
|
|
132
|
+
const termProp = props.find(p => p.key === 'GoTerm');
|
|
133
|
+
const term = String(termProp?.value ?? '');
|
|
134
|
+
const aspectMap = { C: 'CC', F: 'MF', P: 'BP' };
|
|
135
|
+
const [aspect, ...nameParts] = term.split(':');
|
|
136
|
+
allGoTerms.push({ gene, id, name: nameParts.join(':'), aspect: aspectMap[aspect] ?? aspect });
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
catch (err) {
|
|
141
|
+
warnings.push(`UniProt ${gene}: ${err instanceof Error ? err.message : String(err)}`);
|
|
142
|
+
}
|
|
143
|
+
profiles.push(profile);
|
|
144
|
+
}
|
|
145
|
+
if (profiles.some(p => p.ncbiGeneId))
|
|
146
|
+
sources.push('NCBI Gene');
|
|
147
|
+
if (profiles.some(p => p.uniprotAccession))
|
|
148
|
+
sources.push('UniProt');
|
|
149
|
+
writeFileSync(join(outdir, 'profiles.json'), JSON.stringify(profiles, null, 2));
|
|
150
|
+
steps.push({ step: 'gene-profiles', status: 'done', detail: `${profiles.length} gene(s) → profiles.json` });
|
|
151
|
+
// ── Step 2: STRING interactions ──────────────────────────────────────
|
|
152
|
+
const interactions = [];
|
|
153
|
+
try {
|
|
154
|
+
const data = await stringCtx.fetchJson(buildStringUrl('network', {
|
|
155
|
+
identifiers: genes.join('%0d'),
|
|
156
|
+
species: String(org.taxId),
|
|
157
|
+
required_score: '400',
|
|
158
|
+
}));
|
|
159
|
+
if (Array.isArray(data)) {
|
|
160
|
+
for (const item of data) {
|
|
161
|
+
interactions.push({
|
|
162
|
+
geneA: String(item.preferredName_A ?? ''),
|
|
163
|
+
geneB: String(item.preferredName_B ?? ''),
|
|
164
|
+
score: Number(item.score ?? 0),
|
|
165
|
+
experimentalScore: Number(item.escore ?? 0),
|
|
166
|
+
databaseScore: Number(item.dscore ?? 0),
|
|
167
|
+
textminingScore: Number(item.tscore ?? 0),
|
|
168
|
+
});
|
|
169
|
+
}
|
|
170
|
+
sources.push('STRING');
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
catch (err) {
|
|
174
|
+
warnings.push(`STRING: ${err instanceof Error ? err.message : String(err)}`);
|
|
175
|
+
}
|
|
176
|
+
writeFileSync(join(outdir, 'interactions.csv'), toCsv(['geneA', 'geneB', 'score', 'experimentalScore', 'databaseScore', 'textminingScore'], interactions));
|
|
177
|
+
steps.push({ step: 'interactions', status: 'done', detail: `${interactions.length} interactions → interactions.csv` });
|
|
178
|
+
// ── Step 3: Shared KEGG pathways ────────────────────────────────────
|
|
179
|
+
const genePathways = {};
|
|
180
|
+
const pathwayGenes = {};
|
|
181
|
+
for (const gene of genes) {
|
|
182
|
+
const gid = geneIds[gene];
|
|
183
|
+
if (!gid)
|
|
184
|
+
continue;
|
|
185
|
+
try {
|
|
186
|
+
const linkText = await keggCtx.fetchText(buildKeggUrl(`/link/pathway/${org.keggOrg}:${gid}`));
|
|
187
|
+
if (linkText?.trim()) {
|
|
188
|
+
const links = parseKeggTsv(linkText);
|
|
189
|
+
genePathways[gene] = new Set(links.map(l => l.value.replace(/^path:/, '')));
|
|
190
|
+
for (const pid of genePathways[gene]) {
|
|
191
|
+
if (!pathwayGenes[pid])
|
|
192
|
+
pathwayGenes[pid] = new Set();
|
|
193
|
+
pathwayGenes[pid].add(gene);
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
catch { /* non-fatal */ }
|
|
198
|
+
}
|
|
199
|
+
// Resolve pathway names
|
|
200
|
+
let pathNameMap = new Map();
|
|
201
|
+
try {
|
|
202
|
+
const listText = await keggCtx.fetchText(buildKeggUrl(`/list/pathway/${org.keggOrg}`));
|
|
203
|
+
pathNameMap = new Map(parseKeggTsv(listText).map(p => [p.key, p.value.replace(/ - .*$/, '')]));
|
|
204
|
+
if (Object.keys(pathwayGenes).length > 0)
|
|
205
|
+
sources.push('KEGG');
|
|
206
|
+
}
|
|
207
|
+
catch { /* non-fatal */ }
|
|
208
|
+
// Only pathways shared by 2+ genes
|
|
209
|
+
const sharedPathways = Object.entries(pathwayGenes)
|
|
210
|
+
.filter(([, gSet]) => gSet.size >= 2)
|
|
211
|
+
.map(([pid, gSet]) => ({
|
|
212
|
+
pathwayId: pid,
|
|
213
|
+
pathwayName: pathNameMap.get(pid) ?? pid,
|
|
214
|
+
geneCount: gSet.size,
|
|
215
|
+
genes: [...gSet].join(', '),
|
|
216
|
+
}))
|
|
217
|
+
.sort((a, b) => b.geneCount - a.geneCount);
|
|
218
|
+
writeFileSync(join(outdir, 'shared_pathways.csv'), toCsv(['pathwayId', 'pathwayName', 'geneCount', 'genes'], sharedPathways));
|
|
219
|
+
steps.push({ step: 'shared-pathways', status: 'done', detail: `${sharedPathways.length} pathways shared by 2+ genes → shared_pathways.csv` });
|
|
220
|
+
// ── Step 4: GO term frequency ───────────────────────────────────────
|
|
221
|
+
const goFreq = {};
|
|
222
|
+
for (const gt of allGoTerms) {
|
|
223
|
+
if (!goFreq[gt.id])
|
|
224
|
+
goFreq[gt.id] = { id: gt.id, name: gt.name, aspect: gt.aspect, genes: new Set() };
|
|
225
|
+
goFreq[gt.id].genes.add(gt.gene);
|
|
226
|
+
}
|
|
227
|
+
const goSummary = Object.values(goFreq)
|
|
228
|
+
.map(g => ({ id: g.id, name: g.name, aspect: g.aspect, geneCount: g.genes.size, genes: [...g.genes].join(', ') }))
|
|
229
|
+
.sort((a, b) => b.geneCount - a.geneCount);
|
|
230
|
+
writeFileSync(join(outdir, 'go_summary.csv'), toCsv(['id', 'name', 'aspect', 'geneCount', 'genes'], goSummary));
|
|
231
|
+
steps.push({ step: 'go-summary', status: 'done', detail: `${goSummary.length} GO terms → go_summary.csv` });
|
|
232
|
+
// ── Step 5: Enrichment ──────────────────────────────────────────────
|
|
233
|
+
const enrichmentRows = [];
|
|
234
|
+
try {
|
|
235
|
+
const userListId = await submitGeneList(genes);
|
|
236
|
+
const results = await getEnrichment(userListId, library);
|
|
237
|
+
for (let i = 0; i < Math.min(results.length, 30); i++) {
|
|
238
|
+
const r = results[i];
|
|
239
|
+
enrichmentRows.push({
|
|
240
|
+
rank: i + 1, term: r.term, library,
|
|
241
|
+
adjustedPValue: Number(r.adjustedPValue).toExponential(2),
|
|
242
|
+
combinedScore: Number(r.combinedScore).toFixed(1),
|
|
243
|
+
genes: r.genes,
|
|
244
|
+
});
|
|
245
|
+
}
|
|
246
|
+
sources.push('Enrichr');
|
|
247
|
+
}
|
|
248
|
+
catch (err) {
|
|
249
|
+
warnings.push(`Enrichr: ${err instanceof Error ? err.message : String(err)}`);
|
|
250
|
+
}
|
|
251
|
+
writeFileSync(join(outdir, 'enrichment.csv'), toCsv(['rank', 'term', 'library', 'adjustedPValue', 'combinedScore', 'genes'], enrichmentRows));
|
|
252
|
+
steps.push({ step: 'enrichment', status: enrichmentRows.length > 0 ? 'done' : 'skipped',
|
|
253
|
+
detail: `${enrichmentRows.length} terms → enrichment.csv` });
|
|
254
|
+
// ── Step 6: report.md ───────────────────────────────────────────────
|
|
255
|
+
const lines = [
|
|
256
|
+
`# Gene Set Functional Profile`, '',
|
|
257
|
+
`**Generated by biocli** v${getVersion()} on ${new Date().toISOString()}`, '',
|
|
258
|
+
`## Input`, '',
|
|
259
|
+
`- **Genes**: ${genes.join(', ')} (${genes.length})`,
|
|
260
|
+
`- **Organism**: ${org.name}`,
|
|
261
|
+
`- **Sources**: ${sources.join(', ')}`,
|
|
262
|
+
warnings.length > 0 ? `- **Warnings**: ${warnings.length}` : '', '',
|
|
263
|
+
];
|
|
264
|
+
if (sharedPathways.length > 0) {
|
|
265
|
+
lines.push(`## Shared Pathways (${sharedPathways.length})`, '', '| Pathway | Genes | Count |', '|---------|-------|-------|');
|
|
266
|
+
for (const p of sharedPathways.slice(0, 20)) {
|
|
267
|
+
lines.push(`| ${p.pathwayName} | ${p.genes} | ${p.geneCount} |`);
|
|
268
|
+
}
|
|
269
|
+
lines.push('');
|
|
270
|
+
}
|
|
271
|
+
if (interactions.length > 0) {
|
|
272
|
+
lines.push(`## Protein Interactions (${interactions.length})`, '', '| Gene A | Gene B | Score |', '|--------|--------|-------|');
|
|
273
|
+
for (const i of interactions.slice(0, 20)) {
|
|
274
|
+
lines.push(`| ${i.geneA} | ${i.geneB} | ${i.score} |`);
|
|
275
|
+
}
|
|
276
|
+
lines.push('');
|
|
277
|
+
}
|
|
278
|
+
if (goSummary.length > 0) {
|
|
279
|
+
const topGo = goSummary.filter(g => g.geneCount >= 2).slice(0, 15);
|
|
280
|
+
if (topGo.length > 0) {
|
|
281
|
+
lines.push(`## GO Terms Shared by 2+ Genes (${topGo.length})`, '', '| GO Term | Aspect | Genes | Count |', '|---------|--------|-------|-------|');
|
|
282
|
+
for (const g of topGo) {
|
|
283
|
+
lines.push(`| ${g.name} | ${g.aspect} | ${g.genes} | ${g.geneCount} |`);
|
|
284
|
+
}
|
|
285
|
+
lines.push('');
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
if (enrichmentRows.length > 0) {
|
|
289
|
+
lines.push(`## Enrichment (${library})`, '', '| Rank | Term | Adj. P-value | Genes |', '|------|------|-------------|-------|');
|
|
290
|
+
for (const r of enrichmentRows.slice(0, 15)) {
|
|
291
|
+
lines.push(`| ${r.rank} | ${r.term} | ${r.adjustedPValue} | ${r.genes} |`);
|
|
292
|
+
}
|
|
293
|
+
lines.push('');
|
|
294
|
+
}
|
|
295
|
+
if (warnings.length > 0) {
|
|
296
|
+
lines.push('## Warnings', '');
|
|
297
|
+
for (const w of warnings)
|
|
298
|
+
lines.push(`- ${w}`);
|
|
299
|
+
lines.push('');
|
|
300
|
+
}
|
|
301
|
+
lines.push('---', `*Generated by [biocli](https://github.com/youngfly93/biocli)*`);
|
|
302
|
+
writeFileSync(join(outdir, 'report.md'), lines.filter(l => l !== undefined).join('\n') + '\n');
|
|
303
|
+
steps.push({ step: 'report', status: 'done', detail: `report.md → ${outdir}` });
|
|
304
|
+
// ── manifest.json ───────────────────────────────────────────────────
|
|
305
|
+
const manifest = {
|
|
306
|
+
biocliVersion: getVersion(), createdAt: new Date().toISOString(),
|
|
307
|
+
command: 'workflow-profile', input: { genes, organism: org.name, library },
|
|
308
|
+
output: {
|
|
309
|
+
'profiles.json': `${profiles.length} gene profiles`,
|
|
310
|
+
'interactions.csv': `${interactions.length} interactions`,
|
|
311
|
+
'shared_pathways.csv': `${sharedPathways.length} shared pathways`,
|
|
312
|
+
'go_summary.csv': `${goSummary.length} GO terms`,
|
|
313
|
+
'enrichment.csv': `${enrichmentRows.length} enrichment terms`,
|
|
314
|
+
'report.md': 'Markdown report',
|
|
315
|
+
},
|
|
316
|
+
sources, warnings,
|
|
317
|
+
};
|
|
318
|
+
steps.push({ step: 'manifest', status: 'done', detail: `manifest.json → ${outdir}` });
|
|
319
|
+
writeFileSync(join(outdir, 'manifest.json'), JSON.stringify(manifest, null, 2));
|
|
320
|
+
return wrapResult({ outdir, genes, steps, summary: {
|
|
321
|
+
geneCount: genes.length, interactionCount: interactions.length,
|
|
322
|
+
sharedPathwayCount: sharedPathways.length, goTermCount: goSummary.length,
|
|
323
|
+
enrichmentTerms: enrichmentRows.length, sources, warnings,
|
|
324
|
+
} }, { ids: {}, sources, warnings, query: genes.join(','), organism: org.name });
|
|
325
|
+
},
|
|
326
|
+
});
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*/
|
|
11
11
|
import { cli, Strategy } from '../../registry.js';
|
|
12
12
|
import { CliError } from '../../errors.js';
|
|
13
|
-
import { mkdirSync, existsSync, createWriteStream } from 'node:fs';
|
|
13
|
+
import { mkdirSync, existsSync, createWriteStream, statSync } from 'node:fs';
|
|
14
14
|
import { join } from 'node:path';
|
|
15
15
|
import { pipeline } from 'node:stream/promises';
|
|
16
16
|
import { Readable } from 'node:stream';
|
|
@@ -96,6 +96,24 @@ cli({
|
|
|
96
96
|
for (const file of files) {
|
|
97
97
|
const fileUrl = `${supplUrl}${file.name}`;
|
|
98
98
|
const destPath = join(outdir, file.name);
|
|
99
|
+
// Resume: skip only if local file matches expected remote size
|
|
100
|
+
if (existsSync(destPath) && statSync(destPath).size > 0) {
|
|
101
|
+
try {
|
|
102
|
+
const head = await fetch(fileUrl, { method: 'HEAD' });
|
|
103
|
+
if (head.ok) {
|
|
104
|
+
const expectedSize = Number(head.headers.get('content-length') ?? 0);
|
|
105
|
+
const localSize = statSync(destPath).size;
|
|
106
|
+
if (expectedSize > 0 && localSize === expectedSize) {
|
|
107
|
+
rows.push({ file: file.name, size: file.size, status: `skipped (complete)` });
|
|
108
|
+
continue;
|
|
109
|
+
}
|
|
110
|
+
// Incomplete or mismatched — will re-download below
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
catch {
|
|
114
|
+
// HEAD failed — proceed with download
|
|
115
|
+
}
|
|
116
|
+
}
|
|
99
117
|
try {
|
|
100
118
|
const response = await fetch(fileUrl);
|
|
101
119
|
if (!response.ok || !response.body) {
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
*/
|
|
14
14
|
import { cli, Strategy } from '../../registry.js';
|
|
15
15
|
import { CliError } from '../../errors.js';
|
|
16
|
-
import { mkdirSync, existsSync, createWriteStream } from 'node:fs';
|
|
16
|
+
import { mkdirSync, existsSync, createWriteStream, statSync } from 'node:fs';
|
|
17
17
|
import { join } from 'node:path';
|
|
18
18
|
import { pipeline } from 'node:stream/promises';
|
|
19
19
|
import { Readable } from 'node:stream';
|
|
@@ -141,6 +141,29 @@ cli({
|
|
|
141
141
|
catch { /* skip */ }
|
|
142
142
|
continue;
|
|
143
143
|
}
|
|
144
|
+
// Resume: skip if file already exists AND matches expected size (HEAD check)
|
|
145
|
+
if (existsSync(destPath) && statSync(destPath).size > 0) {
|
|
146
|
+
try {
|
|
147
|
+
const head = await fetch(url, { method: 'HEAD' });
|
|
148
|
+
if (head.ok) {
|
|
149
|
+
const expectedSize = Number(head.headers.get('content-length') ?? 0);
|
|
150
|
+
const localSize = statSync(destPath).size;
|
|
151
|
+
if (expectedSize > 0 && localSize === expectedSize) {
|
|
152
|
+
rows.push({ file: fileName, size: formatSize(localSize), status: `skipped (complete, ${formatSize(localSize)})` });
|
|
153
|
+
continue;
|
|
154
|
+
}
|
|
155
|
+
else if (expectedSize > 0 && localSize < expectedSize) {
|
|
156
|
+
// Incomplete file — delete and re-download
|
|
157
|
+
const { rmSync } = await import('node:fs');
|
|
158
|
+
rmSync(destPath);
|
|
159
|
+
}
|
|
160
|
+
// localSize > expectedSize or expectedSize unknown: re-download
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
catch {
|
|
164
|
+
// HEAD failed — proceed with download
|
|
165
|
+
}
|
|
166
|
+
}
|
|
144
167
|
try {
|
|
145
168
|
// Max-size check: HEAD request first to get size
|
|
146
169
|
if (maxSizeBytes < Infinity) {
|
|
@@ -68,7 +68,8 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
68
68
|
.option('-A, --all-columns', 'Show all available columns', false)
|
|
69
69
|
.option('-v, --verbose', 'Debug output', false)
|
|
70
70
|
.option('--input <file>', 'Batch input: file with one ID per line, or - for stdin')
|
|
71
|
-
.option('--no-cache', 'Skip cache and fetch fresh data')
|
|
71
|
+
.option('--no-cache', 'Skip cache and fetch fresh data')
|
|
72
|
+
.option('--retry <n>', 'Retry failed batch items N times (default: 0)', '0');
|
|
72
73
|
subCmd.action(async (...actionArgs) => {
|
|
73
74
|
const actionOpts = actionArgs[positionalArgs.length] ?? {};
|
|
74
75
|
const optionsRecord = typeof actionOpts === 'object' && actionOpts !== null ? actionOpts : {};
|
|
@@ -92,6 +93,18 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
92
93
|
}
|
|
93
94
|
const verbose = optionsRecord.verbose === true;
|
|
94
95
|
const inputFile = typeof optionsRecord.input === 'string' ? optionsRecord.input : undefined;
|
|
96
|
+
// If --input is provided, read file and inject into positional arg.
|
|
97
|
+
// Only for commands whose positional arg is named "genes" (multi-entity pattern).
|
|
98
|
+
// Single-entity commands (gene-dossier, variant-dossier, etc.) use batch mode instead.
|
|
99
|
+
const primaryArgName = positionalArgs[0]?.name;
|
|
100
|
+
const supportsInputInject = primaryArgName === 'genes';
|
|
101
|
+
if (inputFile && supportsInputInject && !kwargs[primaryArgName]) {
|
|
102
|
+
const { parseBatchInput: parseInput } = await import('./batch.js');
|
|
103
|
+
const items = parseInput(undefined, inputFile);
|
|
104
|
+
if (items && items.length > 0) {
|
|
105
|
+
kwargs[primaryArgName] = items.join(',');
|
|
106
|
+
}
|
|
107
|
+
}
|
|
95
108
|
// Validate required positional args (unless --input provides batch input)
|
|
96
109
|
if (!inputFile) {
|
|
97
110
|
for (const arg of positionalArgs) {
|
|
@@ -113,17 +126,21 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
113
126
|
// Commander's --no-cache sets optionsRecord.cache to false
|
|
114
127
|
const noCache = optionsRecord.cache === false;
|
|
115
128
|
// ── Batch mode: --input or comma-separated positional ────────────
|
|
129
|
+
// Skip batch for aggregate commands — they handle their own multi-input parsing
|
|
116
130
|
const primaryArg = positionalArgs[0]; // first positional = primary ID/query
|
|
117
|
-
const
|
|
131
|
+
const skipBatch = cmd.database === 'aggregate';
|
|
132
|
+
const batchItems = (primaryArg && !skipBatch)
|
|
118
133
|
? parseBatchInput(kwargs[primaryArg.name], inputFile)
|
|
119
134
|
: null;
|
|
135
|
+
const retryCount = Math.max(0, parseInt(String(optionsRecord.retry ?? '0'), 10) || 0);
|
|
120
136
|
let result;
|
|
121
137
|
if (batchItems && primaryArg) {
|
|
122
138
|
const spinnerLabel = `Batch ${fullName(cmd)} (${batchItems.length} items)…`;
|
|
123
139
|
const spinner = startSpinner(spinnerLabel);
|
|
124
140
|
const batchResults = [];
|
|
125
|
-
|
|
141
|
+
let failedItems = [];
|
|
126
142
|
try {
|
|
143
|
+
// First pass
|
|
127
144
|
for (const item of batchItems) {
|
|
128
145
|
try {
|
|
129
146
|
const batchKwargs = { ...kwargs, [primaryArg.name]: item };
|
|
@@ -132,19 +149,35 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
132
149
|
batchResults.push(r);
|
|
133
150
|
}
|
|
134
151
|
catch (err) {
|
|
135
|
-
|
|
152
|
+
failedItems.push(item);
|
|
136
153
|
if (verbose)
|
|
137
154
|
console.error(chalk.yellow(`[Batch] ${item} failed: ${err instanceof Error ? err.message : String(err)}`));
|
|
138
155
|
}
|
|
139
156
|
}
|
|
157
|
+
// Retry failed items
|
|
158
|
+
for (let attempt = 1; attempt <= retryCount && failedItems.length > 0; attempt++) {
|
|
159
|
+
if (verbose)
|
|
160
|
+
console.error(chalk.dim(`[Batch] Retry ${attempt}/${retryCount}: ${failedItems.length} item(s)…`));
|
|
161
|
+
const stillFailed = [];
|
|
162
|
+
for (const item of failedItems) {
|
|
163
|
+
try {
|
|
164
|
+
const batchKwargs = { ...kwargs, [primaryArg.name]: item };
|
|
165
|
+
const r = await executeCommand(cmd, batchKwargs, verbose, { noCache: true });
|
|
166
|
+
if (r !== null && r !== undefined)
|
|
167
|
+
batchResults.push(r);
|
|
168
|
+
}
|
|
169
|
+
catch {
|
|
170
|
+
stillFailed.push(item);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
failedItems = stillFailed;
|
|
174
|
+
}
|
|
140
175
|
}
|
|
141
176
|
finally {
|
|
142
177
|
spinner.stop();
|
|
143
178
|
}
|
|
144
|
-
if (
|
|
145
|
-
console.error(chalk.yellow(`[Batch] ${
|
|
146
|
-
if (verbose)
|
|
147
|
-
errors.forEach(e => console.error(chalk.dim(` ${e}`)));
|
|
179
|
+
if (failedItems.length > 0) {
|
|
180
|
+
console.error(chalk.yellow(`[Batch] ${failedItems.length}/${batchItems.length} failed${retryCount > 0 ? ` (after ${retryCount} retries)` : ''}: ${failedItems.join(', ')}`));
|
|
148
181
|
}
|
|
149
182
|
if (!batchResults.length) {
|
|
150
183
|
console.error(chalk.red(`All ${batchItems.length} batch items failed.`));
|
|
@@ -168,7 +201,8 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
168
201
|
if (result === null || result === undefined) {
|
|
169
202
|
return;
|
|
170
203
|
}
|
|
171
|
-
// Extract display metadata if the command returned ResultWithMeta
|
|
204
|
+
// Extract display metadata if the command returned ResultWithMeta or BiocliResult
|
|
205
|
+
let biocliResultColumns = false;
|
|
172
206
|
let renderData = result;
|
|
173
207
|
let totalCount;
|
|
174
208
|
let query;
|
|
@@ -177,6 +211,22 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
177
211
|
totalCount = result.meta.totalCount;
|
|
178
212
|
query = result.meta.query;
|
|
179
213
|
}
|
|
214
|
+
else if (typeof result === 'object' && result !== null && 'data' in result && 'sources' in result) {
|
|
215
|
+
// BiocliResult envelope — for report/table/csv, render the data payload
|
|
216
|
+
const biocliResult = result;
|
|
217
|
+
query = String(biocliResult.query ?? '');
|
|
218
|
+
if (format === 'json' || format === 'yaml' || format === 'yml') {
|
|
219
|
+
// JSON/YAML: render the full envelope (agent-friendly)
|
|
220
|
+
renderData = result;
|
|
221
|
+
}
|
|
222
|
+
else {
|
|
223
|
+
// table/csv/report/md: render the data payload with actual keys
|
|
224
|
+
renderData = biocliResult.data;
|
|
225
|
+
// Override columns to use data's actual keys (command-declared columns
|
|
226
|
+
// may not match the BiocliResult data payload field names)
|
|
227
|
+
biocliResultColumns = true;
|
|
228
|
+
}
|
|
229
|
+
}
|
|
180
230
|
const resolved = getRegistry().get(fullName(cmd)) ?? cmd;
|
|
181
231
|
if (format === 'table' && resolved.defaultFormat) {
|
|
182
232
|
format = resolved.defaultFormat;
|
|
@@ -192,8 +242,9 @@ export function registerCommandToProgram(siteCmd, cmd) {
|
|
|
192
242
|
// --columns pmid,title,abstract → user-specified subset
|
|
193
243
|
// --all-columns / -A → all keys from first row
|
|
194
244
|
// (default) → adapter-declared columns
|
|
195
|
-
|
|
196
|
-
|
|
245
|
+
// For BiocliResult data, use actual keys from the data payload
|
|
246
|
+
let displayColumns = biocliResultColumns ? undefined : resolved.columns;
|
|
247
|
+
const allColumns = optionsRecord.allColumns === true || biocliResultColumns;
|
|
197
248
|
const userColumns = typeof optionsRecord.columns === 'string' ? optionsRecord.columns : undefined;
|
|
198
249
|
if (userColumns) {
|
|
199
250
|
displayColumns = userColumns.split(',').map((s) => s.trim()).filter(Boolean);
|
package/dist/output.js
CHANGED
|
@@ -194,6 +194,9 @@ export function render(data, opts = {}) {
|
|
|
194
194
|
case 'markdown':
|
|
195
195
|
renderMarkdown(data, opts);
|
|
196
196
|
break;
|
|
197
|
+
case 'report':
|
|
198
|
+
renderReport(data, opts);
|
|
199
|
+
break;
|
|
197
200
|
case 'csv':
|
|
198
201
|
renderCsv(data, opts);
|
|
199
202
|
break;
|
|
@@ -386,6 +389,55 @@ function renderCsv(data, opts) {
|
|
|
386
389
|
}).join(','));
|
|
387
390
|
}
|
|
388
391
|
}
|
|
392
|
+
/**
 * Print `data` to stdout as a standalone Markdown report.
 *
 * Output order: title heading, generation timestamp, optional query and
 * total-count metadata, the resolved column list, then either a
 * "no results" note or a Markdown table with one row per record.
 *
 * @param {unknown} data - Raw result; converted to rows by normalizeRows()
 *   (helper referenced from this module, not shown here).
 * @param {object} [opts] - Render options. This function reads `title`,
 *   `source`, `query` and `totalCount`; the whole object is also forwarded
 *   to resolveColumns().
 * @returns {void}
 */
function renderReport(data, opts = {}) {
    // Longest cell text (before escaping) that is printed untruncated.
    const MAX_CELL_LENGTH = 80;
    // A literal "|" inside a cell would otherwise be read as a column delimiter.
    const escapePipes = (text) => text.replace(/\|/g, '\\|');
    // Turn one raw field value into single-line, table-safe cell text.
    const toCell = (raw) => {
        let text;
        if (Array.isArray(raw)) {
            // Arrays are summarised by length rather than dumped inline.
            text = `${raw.length} items`;
        }
        else if (raw !== null && typeof raw === 'object') {
            // JSON.stringify can yield undefined (e.g. toJSON returning undefined).
            text = JSON.stringify(raw) ?? '';
        }
        else {
            text = String(raw ?? '');
        }
        // A table row must stay on one line: a raw line break in a value
        // would terminate the row and corrupt the rest of the table.
        text = text.replace(/[\r\n]+/g, ' ');
        // Truncate BEFORE escaping so the limit counts visible characters
        // and the cut can never land inside a "\|" escape pair.
        if (text.length > MAX_CELL_LENGTH) {
            text = `${text.slice(0, MAX_CELL_LENGTH)}...`;
        }
        return escapePipes(text);
    };
    const rows = normalizeRows(data);
    const columns = resolveColumns(rows, opts);
    // Title
    const title = opts.title ?? opts.source ?? 'biocli Report';
    console.log(`# ${title}`);
    console.log();
    console.log(`*Generated on ${new Date().toISOString()}*`);
    console.log();
    // Metadata
    if (opts.query)
        console.log(`**Query**: ${opts.query}`);
    if (opts.totalCount !== undefined)
        console.log(`**Total results**: ${opts.totalCount} (showing ${rows.length})`);
    console.log(`**Columns**: ${columns.join(', ')}`);
    console.log();
    if (!rows.length) {
        console.log('*No results found.*');
        return;
    }
    // Data table
    console.log('## Results');
    console.log();
    console.log('| ' + columns.map(c => escapePipes(String(c))).join(' | ') + ' |');
    console.log('| ' + columns.map(() => '---').join(' | ') + ' |');
    for (const row of rows) {
        const cells = columns.map(c => toCell(row[c]));
        console.log('| ' + cells.join(' | ') + ' |');
    }
    console.log();
    console.log('---');
    console.log('*Generated by [biocli](https://github.com/youngfly93/biocli)*');
}
|
|
389
441
|
/**
 * Print `data` to stdout as YAML.
 *
 * Serialisation is delegated to `yaml.dump` (the `yaml` binding is imported
 * elsewhere in this module — presumably js-yaml; confirm against the import).
 *
 * @param {unknown} data - Value to serialise.
 * @returns {void}
 */
function renderYaml(data) {
    // Options passed to the dumper unchanged from the original call.
    const dumpOptions = { sortKeys: false, lineWidth: 120, noRefs: true };
    const serialized = yaml.dump(data, dumpOptions);
    console.log(serialized);
}
|