@yangfei_93sky/biocli 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. package/LICENSE +21 -0
  2. package/README.md +197 -0
  3. package/dist/batch.d.ts +20 -0
  4. package/dist/batch.js +69 -0
  5. package/dist/build-manifest.d.ts +38 -0
  6. package/dist/build-manifest.js +186 -0
  7. package/dist/cache.d.ts +28 -0
  8. package/dist/cache.js +126 -0
  9. package/dist/cli-manifest.json +1500 -0
  10. package/dist/cli.d.ts +7 -0
  11. package/dist/cli.js +336 -0
  12. package/dist/clis/_shared/common.d.ts +8 -0
  13. package/dist/clis/_shared/common.js +13 -0
  14. package/dist/clis/_shared/eutils.d.ts +9 -0
  15. package/dist/clis/_shared/eutils.js +9 -0
  16. package/dist/clis/_shared/organism-db.d.ts +23 -0
  17. package/dist/clis/_shared/organism-db.js +58 -0
  18. package/dist/clis/_shared/xml-helpers.d.ts +58 -0
  19. package/dist/clis/_shared/xml-helpers.js +266 -0
  20. package/dist/clis/aggregate/enrichment.d.ts +7 -0
  21. package/dist/clis/aggregate/enrichment.js +105 -0
  22. package/dist/clis/aggregate/gene-dossier.d.ts +13 -0
  23. package/dist/clis/aggregate/gene-dossier.js +248 -0
  24. package/dist/clis/aggregate/gene-profile.d.ts +16 -0
  25. package/dist/clis/aggregate/gene-profile.js +305 -0
  26. package/dist/clis/aggregate/literature-brief.d.ts +7 -0
  27. package/dist/clis/aggregate/literature-brief.js +79 -0
  28. package/dist/clis/aggregate/variant-dossier.d.ts +11 -0
  29. package/dist/clis/aggregate/variant-dossier.js +161 -0
  30. package/dist/clis/aggregate/variant-interpret.d.ts +10 -0
  31. package/dist/clis/aggregate/variant-interpret.js +210 -0
  32. package/dist/clis/aggregate/workflow-prepare.d.ts +12 -0
  33. package/dist/clis/aggregate/workflow-prepare.js +228 -0
  34. package/dist/clis/aggregate/workflow-scout.d.ts +13 -0
  35. package/dist/clis/aggregate/workflow-scout.js +175 -0
  36. package/dist/clis/clinvar/search.d.ts +8 -0
  37. package/dist/clis/clinvar/search.js +61 -0
  38. package/dist/clis/clinvar/variant.d.ts +7 -0
  39. package/dist/clis/clinvar/variant.js +53 -0
  40. package/dist/clis/enrichr/analyze.d.ts +7 -0
  41. package/dist/clis/enrichr/analyze.js +48 -0
  42. package/dist/clis/ensembl/lookup.d.ts +6 -0
  43. package/dist/clis/ensembl/lookup.js +38 -0
  44. package/dist/clis/ensembl/vep.d.ts +7 -0
  45. package/dist/clis/ensembl/vep.js +86 -0
  46. package/dist/clis/ensembl/xrefs.d.ts +6 -0
  47. package/dist/clis/ensembl/xrefs.js +36 -0
  48. package/dist/clis/gene/fetch.d.ts +10 -0
  49. package/dist/clis/gene/fetch.js +96 -0
  50. package/dist/clis/gene/info.d.ts +7 -0
  51. package/dist/clis/gene/info.js +37 -0
  52. package/dist/clis/gene/search.d.ts +7 -0
  53. package/dist/clis/gene/search.js +71 -0
  54. package/dist/clis/geo/dataset.d.ts +7 -0
  55. package/dist/clis/geo/dataset.js +55 -0
  56. package/dist/clis/geo/download.d.ts +17 -0
  57. package/dist/clis/geo/download.js +115 -0
  58. package/dist/clis/geo/samples.d.ts +7 -0
  59. package/dist/clis/geo/samples.js +57 -0
  60. package/dist/clis/geo/search.d.ts +8 -0
  61. package/dist/clis/geo/search.js +66 -0
  62. package/dist/clis/kegg/convert.d.ts +7 -0
  63. package/dist/clis/kegg/convert.js +37 -0
  64. package/dist/clis/kegg/disease.d.ts +6 -0
  65. package/dist/clis/kegg/disease.js +57 -0
  66. package/dist/clis/kegg/link.d.ts +7 -0
  67. package/dist/clis/kegg/link.js +36 -0
  68. package/dist/clis/kegg/pathway.d.ts +6 -0
  69. package/dist/clis/kegg/pathway.js +37 -0
  70. package/dist/clis/pubmed/abstract.d.ts +7 -0
  71. package/dist/clis/pubmed/abstract.js +42 -0
  72. package/dist/clis/pubmed/cited-by.d.ts +7 -0
  73. package/dist/clis/pubmed/cited-by.js +77 -0
  74. package/dist/clis/pubmed/fetch.d.ts +6 -0
  75. package/dist/clis/pubmed/fetch.js +36 -0
  76. package/dist/clis/pubmed/info.yaml +22 -0
  77. package/dist/clis/pubmed/related.d.ts +7 -0
  78. package/dist/clis/pubmed/related.js +81 -0
  79. package/dist/clis/pubmed/search.d.ts +8 -0
  80. package/dist/clis/pubmed/search.js +63 -0
  81. package/dist/clis/snp/lookup.d.ts +7 -0
  82. package/dist/clis/snp/lookup.js +57 -0
  83. package/dist/clis/sra/download.d.ts +18 -0
  84. package/dist/clis/sra/download.js +217 -0
  85. package/dist/clis/sra/run.d.ts +8 -0
  86. package/dist/clis/sra/run.js +77 -0
  87. package/dist/clis/sra/search.d.ts +8 -0
  88. package/dist/clis/sra/search.js +83 -0
  89. package/dist/clis/string/enrichment.d.ts +7 -0
  90. package/dist/clis/string/enrichment.js +50 -0
  91. package/dist/clis/string/network.d.ts +7 -0
  92. package/dist/clis/string/network.js +47 -0
  93. package/dist/clis/string/partners.d.ts +4 -0
  94. package/dist/clis/string/partners.js +44 -0
  95. package/dist/clis/taxonomy/lookup.d.ts +8 -0
  96. package/dist/clis/taxonomy/lookup.js +54 -0
  97. package/dist/clis/uniprot/fetch.d.ts +7 -0
  98. package/dist/clis/uniprot/fetch.js +82 -0
  99. package/dist/clis/uniprot/search.d.ts +6 -0
  100. package/dist/clis/uniprot/search.js +65 -0
  101. package/dist/clis/uniprot/sequence.d.ts +7 -0
  102. package/dist/clis/uniprot/sequence.js +51 -0
  103. package/dist/commander-adapter.d.ts +27 -0
  104. package/dist/commander-adapter.js +286 -0
  105. package/dist/completion.d.ts +19 -0
  106. package/dist/completion.js +117 -0
  107. package/dist/config.d.ts +57 -0
  108. package/dist/config.js +94 -0
  109. package/dist/databases/enrichr.d.ts +28 -0
  110. package/dist/databases/enrichr.js +131 -0
  111. package/dist/databases/ensembl.d.ts +14 -0
  112. package/dist/databases/ensembl.js +106 -0
  113. package/dist/databases/index.d.ts +45 -0
  114. package/dist/databases/index.js +49 -0
  115. package/dist/databases/kegg.d.ts +26 -0
  116. package/dist/databases/kegg.js +136 -0
  117. package/dist/databases/ncbi.d.ts +28 -0
  118. package/dist/databases/ncbi.js +144 -0
  119. package/dist/databases/string-db.d.ts +19 -0
  120. package/dist/databases/string-db.js +105 -0
  121. package/dist/databases/uniprot.d.ts +13 -0
  122. package/dist/databases/uniprot.js +110 -0
  123. package/dist/discovery.d.ts +32 -0
  124. package/dist/discovery.js +235 -0
  125. package/dist/doctor.d.ts +19 -0
  126. package/dist/doctor.js +151 -0
  127. package/dist/errors.d.ts +68 -0
  128. package/dist/errors.js +105 -0
  129. package/dist/execution.d.ts +15 -0
  130. package/dist/execution.js +178 -0
  131. package/dist/hooks.d.ts +48 -0
  132. package/dist/hooks.js +58 -0
  133. package/dist/main.d.ts +13 -0
  134. package/dist/main.js +31 -0
  135. package/dist/ncbi-fetch.d.ts +10 -0
  136. package/dist/ncbi-fetch.js +10 -0
  137. package/dist/output.d.ts +18 -0
  138. package/dist/output.js +394 -0
  139. package/dist/pipeline/executor.d.ts +22 -0
  140. package/dist/pipeline/executor.js +40 -0
  141. package/dist/pipeline/index.d.ts +6 -0
  142. package/dist/pipeline/index.js +6 -0
  143. package/dist/pipeline/registry.d.ts +16 -0
  144. package/dist/pipeline/registry.js +31 -0
  145. package/dist/pipeline/steps/fetch.d.ts +21 -0
  146. package/dist/pipeline/steps/fetch.js +160 -0
  147. package/dist/pipeline/steps/transform.d.ts +26 -0
  148. package/dist/pipeline/steps/transform.js +92 -0
  149. package/dist/pipeline/steps/xml-parse.d.ts +12 -0
  150. package/dist/pipeline/steps/xml-parse.js +27 -0
  151. package/dist/pipeline/template.d.ts +35 -0
  152. package/dist/pipeline/template.js +312 -0
  153. package/dist/rate-limiter.d.ts +56 -0
  154. package/dist/rate-limiter.js +120 -0
  155. package/dist/registry-api.d.ts +15 -0
  156. package/dist/registry-api.js +13 -0
  157. package/dist/registry.d.ts +90 -0
  158. package/dist/registry.js +100 -0
  159. package/dist/schema.d.ts +80 -0
  160. package/dist/schema.js +72 -0
  161. package/dist/spinner.d.ts +19 -0
  162. package/dist/spinner.js +37 -0
  163. package/dist/types.d.ts +101 -0
  164. package/dist/types.js +27 -0
  165. package/dist/utils.d.ts +16 -0
  166. package/dist/utils.js +40 -0
  167. package/dist/validate.d.ts +29 -0
  168. package/dist/validate.js +136 -0
  169. package/dist/verify.d.ts +20 -0
  170. package/dist/verify.js +131 -0
  171. package/dist/version.d.ts +13 -0
  172. package/dist/version.js +36 -0
  173. package/dist/xml-parser.d.ts +19 -0
  174. package/dist/xml-parser.js +119 -0
  175. package/dist/yaml-schema.d.ts +40 -0
  176. package/dist/yaml-schema.js +62 -0
  177. package/package.json +68 -0
@@ -0,0 +1,210 @@
1
+ /**
2
+ * aggregate/variant-interpret — Variant interpretation with clinical context.
3
+ *
4
+ * Builds on variant-dossier by adding:
5
+ * - UniProt protein function context for the affected gene
6
+ * - Structured interpretation summary (pathogenicity, impact, recommendation)
7
+ *
8
+ * Cross-queries: dbSNP + ClinVar + Ensembl VEP + UniProt
9
+ */
10
+ import { cli, Strategy } from '../../registry.js';
11
+ import { CliError } from '../../errors.js';
12
+ import { wrapResult } from '../../types.js';
13
+ import { createHttpContextForDatabase } from '../../databases/index.js';
14
+ import { buildEutilsUrl } from '../../databases/ncbi.js';
15
+ import { buildEnsemblUrl } from '../../databases/ensembl.js';
16
+ import { buildUniprotUrl } from '../../databases/uniprot.js';
// ── Impact severity mapping ──────────────────────────────────────────────────
/**
 * Numeric rank for each Ensembl VEP impact label; a larger number means a
 * more severe impact. Used to order transcript consequences so the most
 * severe one comes first. Labels missing from this table are ranked 0 by the
 * caller.
 *
 * Frozen because it is shared module-level state that must never be mutated.
 */
const IMPACT_SEVERITY = Object.freeze({
    HIGH: 4,
    MODERATE: 3,
    LOW: 2,
    MODIFIER: 1,
});
/** Human-readable explanation for each recognised VEP impact label. */
const IMPACT_DESCRIPTIONS = new Map([
    ['HIGH', 'Likely damaging — causes protein truncation, loss of function, or frameshift'],
    ['MODERATE', 'Possibly damaging — amino acid change that may affect protein function'],
    ['LOW', 'Likely benign — synonymous or non-coding change with minimal functional impact'],
    ['MODIFIER', 'Uncertain — regulatory or non-coding region variant'],
]);
/**
 * Translate a VEP impact label into a one-line plain-English explanation.
 *
 * Matching is exact (case-sensitive); anything that is not one of
 * HIGH / MODERATE / LOW / MODIFIER yields 'Unknown impact'.
 *
 * @param {string} impact - Impact label as reported by Ensembl VEP.
 * @returns {string} Description of the expected functional effect.
 */
function interpretImpact(impact) {
    const description = IMPACT_DESCRIPTIONS.get(impact);
    return description === undefined ? 'Unknown impact' : description;
}
/**
 * Registers the `aggregate variant-interpret` command.
 *
 * The handler:
 *   1. Queries dbSNP and ClinVar (rsIDs only) and Ensembl VEP in parallel.
 *   2. Looks up the affected gene in UniProt to obtain its function text.
 *   3. Condenses everything into an `interpretation` summary.
 *
 * A failure of any single source is recorded in `warnings` and does not
 * abort the command.
 *
 * Throws CliError('ARGUMENT') when the variant is empty and
 * CliError('NOT_FOUND') when dbSNP, ClinVar and VEP all return nothing.
 */
cli({
    site: 'aggregate',
    name: 'variant-interpret',
    description: 'Variant interpretation with clinical context (dbSNP + ClinVar + VEP + UniProt)',
    database: 'aggregate',
    strategy: Strategy.PUBLIC,
    defaultFormat: 'json',
    timeoutSeconds: 90,
    args: [
        { name: 'variant', positional: true, required: true, help: 'Variant ID: rsID (rs334), HGVS, or genomic coordinate' },
    ],
    columns: ['variant', 'gene', 'consequence', 'clinicalSignificance', 'interpretation'],
    func: async (_ctx, args) => {
        // `?? ''` keeps a missing argument from turning into the string "undefined".
        const variant = String(args.variant ?? '').trim();
        if (!variant)
            throw new CliError('ARGUMENT', 'Variant ID is required');
        const sources = [];
        const warnings = [];
        const ids = {};
        // One formatter for every warning so all sources report errors the same way.
        const describeError = (err) => (err instanceof Error ? err.message : String(err));
        const ncbiCtx = createHttpContextForDatabase('ncbi');
        const ensemblCtx = createHttpContextForDatabase('ensembl');
        const uniprotCtx = createHttpContextForDatabase('uniprot');
        const isRsId = /^rs\d+$/i.test(variant);
        // Numeric part of the rsID; the SNP esummary result is keyed by it.
        const snpId = variant.replace(/^rs/i, '');
        if (isRsId)
            ids.rsId = variant;

        // dbSNP summary for an rsID; resolves to null when the record is absent.
        const fetchDbSnp = async () => {
            const data = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                db: 'snp', id: snpId, retmode: 'json',
            }));
            const entry = data?.result?.[snpId];
            if (!entry)
                return null;
            const firstGene = Array.isArray(entry.genes) ? entry.genes[0] : undefined;
            return {
                rsid: `rs${snpId}`,
                gene: String(firstGene?.name ?? ''),
                chromosome: String(entry.chr ?? ''),
                position: String(entry.chrpos ?? ''),
                alleles: String(entry.docsum ?? ''),
                clinicalSignificance: Array.isArray(entry.clinical_significance)
                    ? entry.clinical_significance.join(', ')
                    : String(entry.clinical_significance ?? ''),
            };
        };

        // ClinVar records for an rsID (esearch → esummary); resolves to [] when none match.
        const fetchClinVar = async () => {
            const sr = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
                db: 'clinvar', term: `${variant}[Variant ID]`, retmax: '5', retmode: 'json',
            }));
            const cvIds = sr?.esearchresult?.idlist ?? [];
            if (!cvIds.length)
                return [];
            const summ = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                db: 'clinvar', id: cvIds.join(','), retmode: 'json',
            }));
            const resultObj = summ?.result;
            const uids = resultObj?.uids ?? [];
            return uids.map(uid => {
                const item = (resultObj?.[uid] ?? {});
                const sig = typeof item.clinical_significance === 'object'
                    ? String(item.clinical_significance?.description ?? '')
                    : String(item.clinical_significance ?? '');
                const traits = Array.isArray(item.trait_set)
                    ? item.trait_set.map(t => String(t.trait_name ?? '')).join('; ')
                    : '';
                return { significance: sig, condition: traits, accession: String(item.accession ?? '') };
            });
        };

        // Ensembl VEP: up to five transcript consequences, most severe impact first.
        const fetchVep = async () => {
            const vepPath = isRsId
                ? `/vep/human/id/${variant}`
                : `/vep/human/hgvs/${encodeURIComponent(variant)}`;
            const data = await ensemblCtx.fetchJson(buildEnsemblUrl(vepPath, { canonical: '1', hgvs: '1', protein: '1' }));
            if (!Array.isArray(data) || !data.length)
                return [];
            const tc = (data[0].transcript_consequences ?? []);
            const severityOf = (t) => IMPACT_SEVERITY[String(t.impact ?? '')] ?? 0;
            const sorted = [...tc].sort((a, b) => severityOf(b) - severityOf(a));
            return sorted.slice(0, 5).map(t => ({
                gene: String(t.gene_symbol ?? ''),
                transcript: String(t.transcript_id ?? ''),
                consequence: (t.consequence_terms ?? []).join(', '),
                impact: String(t.impact ?? ''),
                aminoAcids: String(t.amino_acids ?? ''),
                biotype: String(t.biotype ?? ''),
                canonical: Boolean(t.canonical),
            }));
        };

        // Phase 1: Parallel queries (dbSNP + ClinVar + VEP)
        const [snpResult, clinvarResult, vepResult] = await Promise.allSettled([
            isRsId ? fetchDbSnp() : Promise.resolve(null),
            isRsId ? fetchClinVar() : Promise.resolve([]),
            fetchVep(),
        ]);

        // Extract results
        const snpData = snpResult.status === 'fulfilled' ? snpResult.value : null;
        if (snpData)
            sources.push('dbSNP');
        else if (snpResult.status === 'rejected')
            warnings.push(`dbSNP: ${describeError(snpResult.reason)}`);
        const clinvar = clinvarResult.status === 'fulfilled' ? clinvarResult.value : [];
        if (clinvar.length)
            sources.push('ClinVar');
        else if (clinvarResult.status === 'rejected')
            warnings.push(`ClinVar: ${describeError(clinvarResult.reason)}`);
        const vep = vepResult.status === 'fulfilled' ? vepResult.value : [];
        if (vep.length)
            sources.push('Ensembl VEP');
        else if (vepResult.status === 'rejected')
            warnings.push(`Ensembl VEP: ${describeError(vepResult.reason)}`);

        if (!snpData && !clinvar.length && !vep.length) {
            throw new CliError('NOT_FOUND', `No data found for variant "${variant}"`, 'Check the variant ID format (e.g. rs334, NM_000518.5:c.20A>T)');
        }

        // The source fields are already coerced with String(), so "missing" shows
        // up as '' rather than null/undefined. Use `||` so an empty dbSNP gene
        // still falls back to the first VEP consequence that names a gene.
        const geneName = snpData?.gene || vep.find(t => t.gene)?.gene || '';
        if (geneName)
            ids.gene = geneName;

        // Phase 2: UniProt lookup for gene context (if we have a gene name)
        let proteinFunction = '';
        if (geneName) {
            try {
                const searchData = await uniprotCtx.fetchJson(buildUniprotUrl('/uniprotkb/search', {
                    query: `gene_exact:${geneName} AND organism_id:9606`,
                    fields: 'accession,cc_function',
                    format: 'json',
                    size: '1',
                }));
                const results = (searchData?.results ?? []);
                if (results.length > 0) {
                    sources.push('UniProt');
                    const entry = results[0];
                    ids.uniprotAccession = String(entry.primaryAccession ?? '');
                    const comments = (entry.comments ?? []);
                    const funcComment = comments.find(c => c.commentType === 'FUNCTION');
                    const texts = (funcComment?.texts ?? []);
                    proteinFunction = texts.map(t => String(t.value ?? '')).join(' ');
                }
            }
            catch (err) {
                warnings.push(`UniProt: ${describeError(err)}`);
            }
        }

        // Build interpretation
        const topVep = vep[0];
        const highestImpact = topVep?.impact || 'Unknown';
        // Same empty-string concern as geneName: prefer the first ClinVar record
        // that actually carries a significance, then dbSNP, then the default.
        const clinSig = clinvar.find(c => c.significance)?.significance
            || snpData?.clinicalSignificance
            || 'Not reported';
        const sigLower = clinSig.toLowerCase();
        // "Conflicting interpretations of pathogenicity" contains "pathogenic" but
        // is not a pathogenic call, so it must be routed to the uncertain branch.
        const isConflicting = sigLower.includes('conflicting');
        let recommendation;
        if (!isConflicting && sigLower.includes('pathogenic')) {
            recommendation = 'This variant has clinical significance. Consider genetic counseling and further clinical evaluation.';
        }
        else if (!isConflicting && sigLower.includes('benign')) {
            recommendation = 'This variant is classified as benign. No clinical action typically required.';
        }
        else {
            recommendation = 'Clinical significance is uncertain. Consider functional studies or additional evidence.';
        }
        const interpretation = {
            clinicalSignificance: clinSig,
            functionalImpact: interpretImpact(highestImpact),
            consequence: topVep?.consequence || 'Unknown',
            affectedGene: geneName,
            proteinFunction: proteinFunction || 'No function annotation available',
            conditions: clinvar.map(c => c.condition).filter(Boolean),
            evidenceSources: sources,
            recommendation,
        };
        return wrapResult({
            variant,
            gene: geneName,
            chromosome: snpData?.chromosome ?? '',
            position: snpData?.position ?? '',
            interpretation,
            vepConsequences: vep,
            clinicalVariants: clinvar,
            dbsnp: snpData ? { alleles: snpData.alleles } : null,
        }, {
            ids,
            sources,
            warnings,
            query: variant,
        });
    },
});
@@ -0,0 +1,12 @@
1
+ /**
2
+ * aggregate/workflow-prepare — Prepare a research-ready working directory.
3
+ *
4
+ * Takes a user-selected dataset (from workflow-scout) and:
5
+ * 1. Downloads GEO supplementary files or SRA metadata
6
+ * 2. Fetches gene annotations (NCBI Gene + UniProt)
7
+ * 3. Fetches pathway context (KEGG)
8
+ * 4. Generates a structured manifest.json
9
+ *
10
+ * Output: a self-contained directory with data + annotations + manifest.
11
+ */
12
+ export {};
@@ -0,0 +1,228 @@
1
+ /**
2
+ * aggregate/workflow-prepare — Prepare a research-ready working directory.
3
+ *
4
+ * Takes a user-selected dataset (from workflow-scout) and:
5
+ * 1. Downloads GEO supplementary files or SRA metadata
6
+ * 2. Fetches gene annotations (NCBI Gene + UniProt)
7
+ * 3. Fetches pathway context (KEGG)
8
+ * 4. Generates a structured manifest.json
9
+ *
10
+ * Output: a self-contained directory with data + annotations + manifest.
11
+ */
12
+ import { cli, Strategy } from '../../registry.js';
13
+ import { CliError } from '../../errors.js';
14
+ import { wrapResult } from '../../types.js';
15
+ import { createHttpContextForDatabase } from '../../databases/index.js';
16
+ import { buildEutilsUrl } from '../../databases/ncbi.js';
17
+ import { buildUniprotUrl } from '../../databases/uniprot.js';
18
+ import { buildKeggUrl, parseKeggTsv } from '../../databases/kegg.js';
19
+ import { mkdirSync, existsSync, writeFileSync, createWriteStream } from 'node:fs';
20
+ import { join } from 'node:path';
21
+ import { pipeline } from 'node:stream/promises';
22
+ import { Readable } from 'node:stream';
/**
 * Registers the `aggregate workflow-prepare` command.
 *
 * The handler creates `<outdir>/data` and `<outdir>/annotations`, then:
 *   1. Downloads GEO supplementary files for GSE accessions (SRA accessions
 *      are only noted as skipped; nothing is downloaded for them here).
 *   2. For every `--gene`, fetches the NCBI Gene summary, a reviewed UniProt
 *      accession and KEGG pathway links, writing them under `annotations/`.
 *   3. Writes `manifest.json` describing what was done.
 *
 * Per-file and per-gene failures are collected in `warnings`; only invalid
 * arguments abort the command with CliError('ARGUMENT').
 */
cli({
    site: 'aggregate',
    name: 'workflow-prepare',
    description: 'Prepare a research-ready directory from a selected dataset',
    database: 'aggregate',
    strategy: Strategy.PUBLIC,
    defaultFormat: 'json',
    timeoutSeconds: 300,
    args: [
        { name: 'dataset', positional: true, required: true, help: 'GEO accession (GSE*) or SRA accession (SRR*)' },
        { name: 'gene', help: 'Focus gene symbol(s), comma-separated (e.g. TP53,BRCA1)' },
        { name: 'outdir', required: true, help: 'Output directory for the prepared workspace' },
        { name: 'skip-download', type: 'boolean', default: false, help: 'Skip data download, only fetch annotations' },
    ],
    columns: ['step', 'status', 'detail'],
    func: async (_ctx, args) => {
        // `?? ''` keeps a missing argument from turning into the string "undefined".
        const dataset = String(args.dataset ?? '').trim().toUpperCase();
        const geneInput = args.gene ? String(args.gene) : undefined;
        const genes = geneInput ? geneInput.split(',').map(s => s.trim()).filter(Boolean) : [];
        const outdir = String(args.outdir ?? '');
        const skipDownload = Boolean(args['skip-download']);
        if (!dataset)
            throw new CliError('ARGUMENT', 'Dataset accession is required');
        if (!outdir)
            throw new CliError('ARGUMENT', 'Output directory is required');
        const isGEO = /^GSE\d+$/.test(dataset);
        const isSRA = /^[SDE]RR\d+$/i.test(dataset);
        if (!isGEO && !isSRA) {
            throw new CliError('ARGUMENT', `Unsupported dataset type: "${dataset}"`, 'Use a GSE accession (GEO) or SRR/ERR/DRR accession (SRA)');
        }
        const describeError = (err) => (err instanceof Error ? err.message : String(err));
        // Create output directory structure. `recursive: true` already tolerates
        // directories that exist, so no separate existence check is needed.
        const dataDir = join(outdir, 'data');
        const annotDir = join(outdir, 'annotations');
        for (const dir of [outdir, dataDir, annotDir]) {
            mkdirSync(dir, { recursive: true });
        }
        const sources = [];
        const warnings = [];
        const steps = [];
        const ncbiCtx = createHttpContextForDatabase('ncbi');
        // ── Step 1: Download dataset ────────────────────────────────────────
        if (skipDownload) {
            steps.push({ step: 'download', status: 'skipped', detail: '--skip-download flag' });
        }
        else if (isGEO) {
            try {
                // GEO groups series by replacing the last three digits with "nnn".
                // Accessions with three or fewer digits all live under "GSEnnn".
                const stem = dataset.length > 6 ? dataset.slice(0, -3) : 'GSE';
                const supplUrl = `https://ftp.ncbi.nlm.nih.gov/geo/series/${stem}nnn/${dataset}/suppl/`;
                const html = await ncbiCtx.fetchText(supplUrl);
                // Parse file list
                const linkRegex = /<a\s+href="([^"]+)">[^<]+<\/a>\s+[\d-]+\s+[\d:]+\s+([\d.]+[KMG]?)/gi;
                const files = [];
                for (const match of String(html ?? '').matchAll(linkRegex)) {
                    const [, name, size] = match;
                    if (name === '../' || name.endsWith('/'))
                        continue;
                    // The name comes from a remote listing and is joined into a local
                    // path, so refuse anything that could escape the data directory.
                    if (name === '..' || name.includes('/') || name.includes('\\')) {
                        warnings.push(`Download ${name}: skipped unsafe file name`);
                        continue;
                    }
                    files.push({ name, size });
                }
                if (files.length === 0)
                    warnings.push(`GEO download: no supplementary files listed for ${dataset}`);
                let downloaded = 0;
                for (const file of files) {
                    try {
                        const resp = await fetch(`${supplUrl}${file.name}`);
                        if (!resp.ok || !resp.body) {
                            // Surface the failure instead of silently dropping the file.
                            warnings.push(`Download ${file.name}: HTTP ${resp.status}`);
                            continue;
                        }
                        const ws = createWriteStream(join(dataDir, file.name));
                        await pipeline(Readable.fromWeb(resp.body), ws);
                        downloaded++;
                    }
                    catch (err) {
                        warnings.push(`Download ${file.name}: ${describeError(err)}`);
                    }
                }
                steps.push({ step: 'download', status: 'done', detail: `${downloaded}/${files.length} GEO files → ${dataDir}` });
                sources.push('GEO');
            }
            catch (err) {
                steps.push({ step: 'download', status: 'failed', detail: describeError(err) });
                warnings.push(`GEO download: ${describeError(err)}`);
            }
        }
        else {
            // SRA: actual FASTQ download is too large for prepare
            steps.push({ step: 'download', status: 'skipped', detail: 'SRA FASTQ download too large for prepare — use `biocli sra download` separately' });
        }
        // ── Step 2: Gene annotations ────────────────────────────────────────
        if (genes.length > 0) {
            const geneAnnotations = [];
            const uniprotCtx = createHttpContextForDatabase('uniprot');
            // Gene symbol of a UniProt search hit ('' when absent).
            const getGeneName = (e) => String(e.genes?.[0]?.geneName?.value ?? '');
            for (const gene of genes) {
                try {
                    // NCBI Gene search
                    const searchResult = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
                        db: 'gene', term: `${gene}[Gene Name] AND Homo sapiens[Organism]`, retmax: '1', retmode: 'json',
                    }));
                    const geneIds = searchResult?.esearchresult?.idlist ?? [];
                    if (geneIds.length === 0) {
                        warnings.push(`Gene ${gene}: not found in NCBI Gene`);
                        continue;
                    }
                    const summaryResult = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                        db: 'gene', id: geneIds[0], retmode: 'json',
                    }));
                    const entry = summaryResult?.result?.[geneIds[0]];
                    const annotation = {
                        symbol: gene,
                        ncbiGeneId: geneIds[0],
                        name: entry?.description ?? '',
                        chromosome: entry?.chromosome ?? '',
                        summary: entry?.summary ?? '',
                    };
                    // UniProt protein info — reviewed entries only, preferring an exact
                    // symbol match. A failure here must not discard the NCBI annotation.
                    try {
                        const upResult = await uniprotCtx.fetchJson(buildUniprotUrl('/uniprotkb/search', {
                            query: `gene:${gene} AND organism_id:9606 AND reviewed:true`,
                            format: 'json',
                            size: '5',
                        }));
                        const upEntries = (upResult?.results ?? []);
                        if (upEntries.length > 0) {
                            const exactMatch = upEntries.find(e => getGeneName(e).toUpperCase() === gene.toUpperCase());
                            const best = exactMatch ?? upEntries[0];
                            annotation.uniprotAccession = best.primaryAccession ?? '';
                            sources.push('UniProt');
                        }
                    }
                    catch (err) {
                        warnings.push(`UniProt ${gene}: ${describeError(err)}`);
                    }
                    geneAnnotations.push(annotation);
                    sources.push('NCBI Gene');
                }
                catch (err) {
                    warnings.push(`Gene ${gene}: ${describeError(err)}`);
                }
            }
            if (geneAnnotations.length > 0) {
                writeFileSync(join(annotDir, 'genes.json'), JSON.stringify(geneAnnotations, null, 2));
                steps.push({ step: 'gene-annotations', status: 'done', detail: `${geneAnnotations.length} gene(s) → annotations/genes.json` });
            }
            else {
                // Genes were requested but none could be annotated; record that outcome.
                steps.push({ step: 'gene-annotations', status: 'failed', detail: 'No annotations retrieved for the requested gene(s)' });
            }
            // KEGG pathways for genes — use NCBI Gene IDs (stable) instead of symbols
            try {
                const keggCtx = createHttpContextForDatabase('kegg');
                const allPathways = [];
                for (const annot of geneAnnotations) {
                    const geneId = annot.ncbiGeneId;
                    const symbol = annot.symbol;
                    if (!geneId)
                        continue;
                    try {
                        const linkText = await keggCtx.fetchText(buildKeggUrl(`/link/pathway/hsa:${geneId}`));
                        if (linkText && linkText.trim()) {
                            const links = parseKeggTsv(linkText);
                            allPathways.push(...links.map(l => ({ gene: symbol, ncbiGeneId: geneId, pathway: l.value })));
                        }
                    }
                    catch (err) {
                        // Report the real cause rather than presenting every error as "no results".
                        warnings.push(`KEGG pathway for ${symbol} (hsa:${geneId}): ${describeError(err)}`);
                    }
                }
                if (allPathways.length > 0) {
                    writeFileSync(join(annotDir, 'pathways.json'), JSON.stringify(allPathways, null, 2));
                    steps.push({ step: 'pathways', status: 'done', detail: `${allPathways.length} pathway links → annotations/pathways.json` });
                    sources.push('KEGG');
                }
            }
            catch (err) {
                warnings.push(`KEGG pathways: ${describeError(err)}`);
            }
        }
        else {
            steps.push({ step: 'gene-annotations', status: 'skipped', detail: 'No --gene specified' });
        }
        // ── Step 3: Generate manifest ───────────────────────────────────────
        const uniqueSources = [...new Set(sources)];
        const manifest = {
            biocliVersion: '0.2.0',
            createdAt: new Date().toISOString(),
            dataset,
            genes,
            organism: 'Homo sapiens',
            sources: uniqueSources,
            warnings,
            directories: {
                data: 'data/',
                annotations: 'annotations/',
            },
            steps,
        };
        // `manifest.steps` is the same array, so the manifest step recorded here is
        // included in the file written on the next line.
        steps.push({ step: 'manifest', status: 'done', detail: `manifest.json → ${outdir}` });
        writeFileSync(join(outdir, 'manifest.json'), JSON.stringify(manifest, null, 2));
        return wrapResult({
            outdir,
            dataset,
            steps,
        }, {
            ids: { dataset, ...(genes.length === 1 ? { gene: genes[0] } : {}) },
            sources: [...uniqueSources],
            warnings,
            query: dataset,
        });
    },
});
@@ -0,0 +1,13 @@
1
+ /**
2
+ * aggregate/workflow-scout — Scout datasets for a research question.
3
+ *
4
+ * Searches GEO and SRA for relevant datasets based on gene + disease/topic,
5
+ * ranks candidates, and provides structured recommendations for the user
6
+ * to select before downloading with `workflow prepare`.
7
+ *
8
+ * Cross-queries:
9
+ * - GEO (datasets with expression data)
10
+ * - SRA (sequencing runs)
11
+ * - NCBI Gene (gene context for query refinement)
12
+ */
13
+ export {};
@@ -0,0 +1,175 @@
1
+ /**
2
+ * aggregate/workflow-scout — Scout datasets for a research question.
3
+ *
4
+ * Searches GEO and SRA for relevant datasets based on gene + disease/topic,
5
+ * ranks candidates, and provides structured recommendations for the user
6
+ * to select before downloading with `workflow prepare`.
7
+ *
8
+ * Cross-queries:
9
+ * - GEO (datasets with expression data)
10
+ * - SRA (sequencing runs)
11
+ * - NCBI Gene (gene context for query refinement)
12
+ */
13
+ import { cli, Strategy } from '../../registry.js';
14
+ import { CliError } from '../../errors.js';
15
+ import { wrapResult } from '../../types.js';
16
+ import { createHttpContextForDatabase } from '../../databases/index.js';
17
+ import { buildEutilsUrl } from '../../databases/ncbi.js';
/**
 * Registers the `aggregate workflow-scout` command.
 *
 * The handler searches GEO (db `gds`) and SRA through NCBI E-utilities with
 * the same gene/organism-refined term, merges the hits, ranks them, and
 * suggests follow-up `biocli` commands for the top three.
 *
 * Ranking order: gene symbol in the title first, then more samples, then the
 * newer date. A failure of either search is recorded in `warnings`.
 *
 * Throws CliError('ARGUMENT') when the query is empty and
 * CliError('NOT_FOUND') when neither source returns a candidate.
 */
cli({
    site: 'aggregate',
    name: 'workflow-scout',
    description: 'Scout GEO/SRA datasets for a research question',
    database: 'aggregate',
    strategy: Strategy.PUBLIC,
    defaultFormat: 'json',
    timeoutSeconds: 60,
    args: [
        { name: 'query', positional: true, required: true, help: 'Research topic (e.g. "TP53 breast cancer RNA-seq")' },
        { name: 'gene', help: 'Focus gene symbol (refines search)' },
        { name: 'organism', default: 'Homo sapiens', help: 'Organism filter' },
        { name: 'limit', type: 'int', default: 10, help: 'Max candidates per source' },
        { name: 'type', default: 'gse', choices: ['gse', 'gds', 'all'], help: 'GEO entry type filter' },
    ],
    columns: ['rank', 'accession', 'title', 'type', 'samples', 'date', 'source'],
    func: async (_ctx, args) => {
        // `?? …` keeps missing arguments from turning into the string "undefined".
        const query = String(args.query ?? '').trim();
        const gene = args.gene ? String(args.gene).trim() : undefined;
        const organism = String(args.organism ?? '');
        // Clamp to 1..50; a non-numeric value falls back to the declared default
        // instead of sending `retmax=NaN` to NCBI.
        const requestedLimit = Number(args.limit);
        const limit = Number.isFinite(requestedLimit)
            ? Math.max(1, Math.min(Math.trunc(requestedLimit), 50))
            : 10;
        const typeFilter = String(args.type ?? 'gse').toUpperCase();
        if (!query)
            throw new CliError('ARGUMENT', 'Search query is required');
        const sources = [];
        const warnings = [];
        const describeError = (err) => (err instanceof Error ? err.message : String(err));
        // Numeric field with a fallback, so a malformed value can never reach the
        // sort comparator as NaN.
        const toCount = (value, fallback) => {
            const n = Number(value ?? fallback);
            return Number.isFinite(n) ? n : fallback;
        };
        const ncbiCtx = createHttpContextForDatabase('ncbi');
        // Build refined search terms
        const geneClause = gene ? `${gene}[Gene Name] AND ` : '';
        const orgClause = organism ? `${organism}[Organism] AND ` : '';
        const baseTerm = `${geneClause}${orgClause}${query}`;
        // ── GEO search ──────────────────────────────────────────────────────
        const geoCandidates = [];
        try {
            const geoTerm = typeFilter === 'ALL'
                ? baseTerm
                : `${baseTerm} AND ${typeFilter}[Entry Type]`;
            const searchResult = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
                db: 'gds',
                term: geoTerm,
                retmax: String(limit),
                retmode: 'json',
                sort: 'relevance',
            }));
            const ids = searchResult?.esearchresult?.idlist ?? [];
            if (ids.length > 0) {
                const summaryResult = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                    db: 'gds',
                    id: ids.join(','),
                    retmode: 'json',
                }));
                const resultObj = summaryResult?.result;
                const uids = resultObj?.uids ?? [];
                uids.forEach((uid, i) => {
                    const item = (resultObj?.[uid] ?? {});
                    const title = String(item.title ?? '');
                    const geneInTitle = gene !== undefined && title.toLowerCase().includes(gene.toLowerCase());
                    geoCandidates.push({
                        rank: i + 1,
                        accession: String(item.accession ?? `GDS${uid}`),
                        title,
                        organism: String(item.taxon ?? ''),
                        type: String(item.entrytype ?? ''),
                        samples: toCount(item.n_samples, 0),
                        date: String(item.pdat ?? ''),
                        relevance: geneInTitle ? 'gene in title' : 'keyword match',
                        source: 'GEO',
                    });
                });
                sources.push('GEO');
            }
        }
        catch (err) {
            warnings.push(`GEO search failed: ${describeError(err)}`);
        }
        // ── SRA search ──────────────────────────────────────────────────────
        const sraCandidates = [];
        try {
            const searchResult = await ncbiCtx.fetchJson(buildEutilsUrl('esearch.fcgi', {
                db: 'sra',
                term: baseTerm,
                retmax: String(limit),
                retmode: 'json',
                sort: 'relevance',
            }));
            const ids = searchResult?.esearchresult?.idlist ?? [];
            if (ids.length > 0) {
                const summaryResult = await ncbiCtx.fetchJson(buildEutilsUrl('esummary.fcgi', {
                    db: 'sra',
                    id: ids.join(','),
                    retmode: 'json',
                }));
                const resultObj = summaryResult?.result;
                const uids = resultObj?.uids ?? [];
                uids.forEach((uid, i) => {
                    const item = (resultObj?.[uid] ?? {});
                    const expXml = String(item.expxml ?? '');
                    const runsXml = String(item.runs ?? '');
                    // Extract from embedded XML
                    const titleMatch = expXml.match(/<Title>([^<]*)<\/Title>/);
                    const orgMatch = expXml.match(/taxname="([^"]*)"/);
                    const accMatch = runsXml.match(/acc="([^"]*)"/);
                    const strategyMatch = expXml.match(/<Library_strategy>([^<]*)<\/Library_strategy>/);
                    sraCandidates.push({
                        rank: i + 1,
                        accession: accMatch?.[1] ?? `SRA${uid}`,
                        title: (titleMatch?.[1] ?? '').slice(0, 100),
                        organism: orgMatch?.[1] ?? '',
                        type: strategyMatch?.[1] ?? 'SRA',
                        samples: toCount(item.total_runs, 1),
                        date: String(item.createdate ?? ''),
                        relevance: 'keyword match',
                        source: 'SRA',
                    });
                });
                sources.push('SRA');
            }
        }
        catch (err) {
            warnings.push(`SRA search failed: ${describeError(err)}`);
        }
        // ── Merge and rank ──────────────────────────────────────────────────
        const allCandidates = [...geoCandidates, ...sraCandidates];
        if (!allCandidates.length) {
            throw new CliError('NOT_FOUND', `No datasets found for "${query}"`, 'Try broader search terms, or remove --gene/--organism filters');
        }
        // Re-rank. The keys are compared one after another rather than summed, so
        // a very large sample count can never outrank a gene-in-title hit.
        const titleBoost = (c) => (c.relevance === 'gene in title' ? 1 : 0);
        allCandidates.sort((a, b) => {
            // Gene in title gets priority
            const boostDiff = titleBoost(b) - titleBoost(a);
            if (boostDiff !== 0)
                return boostDiff;
            // More samples = better
            if (b.samples !== a.samples)
                return b.samples - a.samples;
            // Tie-break by date (newer first)
            return b.date.localeCompare(a.date);
        });
        // Re-assign ranks
        allCandidates.forEach((c, i) => { c.rank = i + 1; });
        const nextSteps = allCandidates.slice(0, 3).map(c => c.source === 'GEO'
            ? `biocli geo download ${c.accession} --list-only`
            : `biocli sra download ${c.accession} --dry-run`);
        return wrapResult({
            candidates: allCandidates,
            summary: `Found ${geoCandidates.length} GEO + ${sraCandidates.length} SRA candidates for "${query}"`,
            nextSteps,
        }, {
            ids: gene ? { gene } : {},
            sources,
            warnings,
            query,
            organism,
        });
    },
});
@@ -0,0 +1,8 @@
1
+ /**
2
+ * clinvar/search — Search ClinVar clinical variants.
3
+ *
4
+ * Uses the two-step esearch + esummary pattern:
5
+ * 1. esearch to retrieve matching ClinVar IDs
6
+ * 2. esummary (JSON) to get variant metadata
7
+ */
8
+ export {};