@sjcrh/proteinpaint-server 2.190.2-0 → 2.191.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,456 +0,0 @@
1
- import path from "path";
2
- import { run_R } from "@sjcrh/proteinpaint-r";
3
- import { run_rust } from "@sjcrh/proteinpaint-rust";
4
- import * as utils from "#src/utils.js";
5
- import serverconfig from "#src/serverconfig.js";
6
- import { gdc_validate_query_geneExpression } from "#src/mds3.gdc.js";
7
- import { mayLimitSamples } from "#src/mds3.filter.js";
8
- import { clusterMethodLst, distanceMethodLst } from "#shared/clustering.js";
9
- import { getData } from "#src/termdb.matrix.js";
10
- import {
11
- GENE_EXPRESSION,
12
- METABOLITE_INTENSITY,
13
- NUMERIC_DICTIONARY_TERM,
14
- termType2label,
15
- ISOFORM_EXPRESSION,
16
- SSGSEA,
17
- PROTEOME_ABUNDANCE
18
- } from "#shared/terms.js";
19
- import { formatElapsedTime } from "#shared/time.js";
20
/**
 * Reject a clustering request whose term list cannot be clustered.
 *
 * Shared by every supported data type so the checks and their messages
 * stay identical no matter which branch of the handler is taken.
 *
 * @param {*} terms - value of `q.terms` from the request
 * @throws {string} when the list is missing, not an array, or has fewer than 3 entries
 */
function assertClusterableTerms(terms) {
  if (!terms) throw `missing gene list`;
  if (!Array.isArray(terms)) throw `gene list is not an array`;
  if (terms.length < 3)
    throw `A minimum of three genes is required for clustering. Please refresh this page to clear this error.`;
}

/**
 * Build the `(req, res)` handler for the clustering route.
 *
 * The handler resolves the genome and dataset named in `req.query`,
 * validates the requested data type and term list, delegates to
 * `getResult`, and always answers through `res.send`. Any thrown value
 * (string or Error) is converted to `{ status, error }`.
 *
 * @param {{ genomes: Object }} arg - `genomes` is keyed by genome name; each entry exposes `.datasets`
 * @returns {(req: Object, res: Object) => Promise<void>}
 */
function init({ genomes }) {
  return async (req, res) => {
    const q = req.query;
    let result;
    try {
      const g = genomes[q.genome];
      if (!g) throw "invalid genome name";
      const ds = g.datasets[q.dslabel];
      if (!ds) throw "invalid dataset name";
      if (ds.label === "GDC" && !ds.__gdc?.doneCaching)
        throw "The server has not finished caching the case IDs: try again in about 2 minutes.";

      // Data types served through ds.queries[dataType] (numeric dictionary
      // terms are the exception: they need no ds.queries entry).
      const queryBackedTypes = [
        GENE_EXPRESSION,
        SSGSEA,
        ISOFORM_EXPRESSION,
        METABOLITE_INTENSITY,
        NUMERIC_DICTIONARY_TERM
      ];

      if (queryBackedTypes.includes(q.dataType)) {
        if (!ds.queries?.[q.dataType] && q.dataType !== NUMERIC_DICTIONARY_TERM)
          throw `no ${q.dataType} data on this dataset`;
      } else if (PROTEOME_ABUNDANCE == q.dataType) {
        const proteomeQuery = ds.queries?.proteome;
        if (!proteomeQuery?.get) throw `no ${PROTEOME_ABUNDANCE} data getter on this dataset`;
      } else {
        throw "unknown q.dataType " + q.dataType;
      }

      // Data availability is checked before the term list, matching the
      // order in which errors were reported previously.
      assertClusterableTerms(q.terms);
      result = await getResult(q, ds);
    } catch (e) {
      // Only real Error objects carry a stack; thrown strings are user-facing messages.
      if (e.stack) console.log(e.stack);
      result = {
        status: e.status || 400,
        error: e.message || e
      };
    }
    res.send(result);
  };
}
62
/**
 * Load per-term sample values for the requested data type, drop terms
 * that came back empty, and cluster what remains.
 *
 * @param {Object} q - request query; reads `dataType`, `terms`, `proteomeDetails`, `__abortSignal`
 * @param {Object} ds - dataset; reads `ds.queries[...]` getters
 * @returns {Promise<Object>} `{ clustering, byTermId, bySampleId, removedHierClusterTerms? }`,
 *   or `{ term, data }` when exactly one term has data
 * @throws {string} "no data" when no term has any value
 */
async function getResult(q, ds) {
  const isGeneExpression = q.dataType == GENE_EXPRESSION;

  // Gene expression getters receive a JSON round-tripped copy flagged for
  // clustering; the abort signal is re-attached by reference afterwards.
  let request = q;
  if (isGeneExpression) {
    request = JSON.parse(JSON.stringify(q));
    request.forClusteringAnalysis = true;
    request.__abortSignal = q.__abortSignal;
  }

  let term2sample2value;
  let byTermId;
  let bySampleId;
  let skippedSexChrGenes;
  if (q.dataType == NUMERIC_DICTIONARY_TERM) {
    const annotated = await getNumericDictTermAnnotation(q, ds);
    term2sample2value = annotated.term2sample2value;
    byTermId = annotated.byTermId;
    bySampleId = annotated.bySampleId;
  } else {
    const fetched =
      q.dataType == PROTEOME_ABUNDANCE
        ? await ds.queries.proteome.get({ ...request, dataTypeDetails: request.proteomeDetails })
        : await ds.queries[q.dataType].get(request, ds);
    term2sample2value = fetched.term2sample2value;
    byTermId = fetched.byTermId;
    bySampleId = fetched.bySampleId;
    skippedSexChrGenes = fetched.skippedSexChrGenes;
  }

  // Pick the label reported for a term that had no values.
  const describeTerm = (tw, fallback) => {
    if (!tw) return fallback;
    if (tw.term.type == "geneExpression") return tw.term.gene;
    if (tw.term.type == "isoformExpression") return tw.term.isoform;
    return tw.term.name;
  };

  // Iterate a snapshot so entries can be removed from the live map.
  const namesWithoutData = [];
  for (const [termId, sampleValues] of [...term2sample2value]) {
    if (Object.keys(sampleValues).length !== 0) continue;
    const tw = q.terms.find((candidate) => candidate.$id == termId);
    namesWithoutData.push(describeTerm(tw, termId));
    term2sample2value.delete(termId);
    delete byTermId[termId];
  }

  const removedHierClusterTerms = [];
  if (namesWithoutData.length) {
    const noun = isGeneExpression ? "genes" : "items";
    removedHierClusterTerms.push({ text: `Skipped ${noun} with no data`, lst: namesWithoutData });
  }
  if (skippedSexChrGenes?.length) {
    removedHierClusterTerms.push({ text: "Skipped sex chromosome genes", lst: skippedSexChrGenes });
  }

  if (term2sample2value.size == 0) throw "no data";

  // A single remaining term cannot be clustered; hand its values back as-is.
  if (term2sample2value.size == 1) {
    const [onlyTerm] = term2sample2value.keys();
    return { term: { gene: onlyTerm, type: GENE_EXPRESSION }, data: term2sample2value.get(onlyTerm) };
  }

  const startedAt = Date.now();
  const clustering = await doClustering(term2sample2value, q, Object.keys(bySampleId).length);
  console.log("clustering done:", formatElapsedTime(Date.now() - startedAt));

  const response = { clustering, byTermId, bySampleId };
  if (removedHierClusterTerms.length) response.removedHierClusterTerms = removedHierClusterTerms;
  return response;
}
115
/**
 * Fetch annotations for numeric dictionary terms through `getData` and
 * pivot them from sample-major to term-major form.
 *
 * @param {Object} q - request query; reads `terms`, `filter`, `filter0`, `__protected__`
 * @param {Object} ds - dataset, passed through to `getData`
 * @returns {Promise<{term2sample2value: Map, byTermId: Object, bySampleId: Object}>}
 * @throws whatever `getData` reported in `data.error`
 */
async function getNumericDictTermAnnotation(q, ds) {
  // TODO: figure out when term is not a termwrapper
  // A bare term (no `.term` property) is wrapped as a continuous-mode termwrapper.
  const wrappedTerms = q.terms.map((tw) => {
    if (tw.term) return tw;
    return { term: tw, q: { mode: "continuous" } };
  });

  const data = await getData(
    {
      terms: wrappedTerms,
      filter: q.filter,
      filter0: q.filter0,
      __protected__: q.__protected__
    },
    ds
  );
  if (data.error) throw data.error;

  const term2sample2value = /* @__PURE__ */ new Map();
  for (const sampleId of Object.keys(data.samples)) {
    const annotations = data.samples[sampleId];
    for (const termId of Object.keys(annotations)) {
      // "sample" is a key of the per-sample record itself, not a term.
      if (termId === "sample") continue;
      let valuesBySample = term2sample2value.get(termId);
      if (!valuesBySample) {
        valuesBySample = {};
        term2sample2value.set(termId, valuesBySample);
      }
      valuesBySample[sampleId] = annotations[termId].value;
    }
  }

  const { byTermId, bySampleId } = data.refs;
  return { term2sample2value, byTermId, bySampleId };
}
138
/**
 * Run hierarchical clustering (via `hclust.R`) on a term-by-sample matrix.
 *
 * Only samples that have a value for every term are used. The returned
 * matrix is reordered to follow the row and column order reported by R.
 *
 * @param {Map<string, Object>} data - term id -> { sampleId: value }
 * @param {Object} q - request query; reads `dataType`, `clusterMethod`,
 *   `distanceMethod`, `zScoreTransformation`
 * @param {number} [numCases=1e3] - maximum number of shared samples sent to R
 * @returns {Promise<{row: Object, col: Object, matrix: Array<Array>}>}
 * @throws {string} when no samples are shared, a method is invalid, the
 *   matrix is empty, or R reports a row/column name that was not sent to it
 */
async function doClustering(data, q, numCases = 1e3) {
  // Intersect the sample ids of all terms, keeping the first term's key order.
  let sharedSampleIds = null;
  for (const sample2value of data.values()) {
    const currentIds = new Set(Object.keys(sample2value));
    if (sharedSampleIds === null) {
      sharedSampleIds = currentIds;
      continue;
    }
    for (const id of sharedSampleIds) {
      if (!currentIds.has(id)) sharedSampleIds.delete(id);
    }
  }
  if (!sharedSampleIds || sharedSampleIds.size == 0)
    throw `termdb.cluster: There are no overlapping tested samples shared across the selected ${termType2label(
      q.dataType
    )}`;

  if (!clusterMethodLst.find((i) => i.value == q.clusterMethod)) throw "Invalid cluster method";
  if (!distanceMethodLst.find((i) => i.value == q.distanceMethod)) throw "Invalid distance method";

  const inputData = {
    matrix: [],
    row_names: [],
    // genes
    col_names: [...sharedSampleIds].slice(0, numCases),
    // samples
    cluster_method: q.clusterMethod,
    distance_method: q.distanceMethod,
    plot_image: false
    // When true causes cluster.rs to plot the image into a png file (EXPERIMENTAL)
  };
  for (const [gene, sample2value] of data) {
    inputData.row_names.push(gene);
    const row = inputData.col_names.map((sampleId) => sample2value[sampleId]);
    inputData.matrix.push(q.zScoreTransformation ? getZscore(row) : row);
  }
  if (inputData.matrix.length == 0) throw "Clustering matrix is empty";

  const Routput = JSON.parse(await run_R("hclust.R", JSON.stringify(inputData)));

  // Name -> first position index, built once per axis so reordering does not
  // rescan the name list for every entry R returns.
  const buildIndex = (names) => {
    const index = /* @__PURE__ */ new Map();
    names.forEach((name, i) => {
      if (!index.has(name)) index.set(name, i);
    });
    return index;
  };
  // Fail loudly when R reports a name that was never sent, instead of
  // emitting undefined cells or an opaque TypeError.
  const positionOf = (index, name, axis) => {
    const position = index.get(name);
    if (position === undefined) throw `termdb.cluster: hclust.R returned unknown ${axis} name: ${name}`;
    return position;
  };

  const rowIndex = buildIndex(inputData.row_names);
  const colIndex = buildIndex(inputData.col_names);
  const rowPositions = Routput.RowOrder.map((row) => positionOf(rowIndex, row.name, "row"));
  const colPositions = Routput.ColOrder.map((col) => positionOf(colIndex, col.name, "column"));

  const output_matrix = rowPositions.map((rowI) => colPositions.map((colI) => inputData.matrix[rowI][colI]));

  return {
    row: {
      merge: Routput.RowMerge,
      height: Routput.RowHeight,
      order: Routput.RowOrder,
      inputOrder: inputData.row_names
    },
    col: {
      merge: Routput.ColumnMerge,
      height: Routput.ColumnHeight,
      order: Routput.ColOrder,
      inputOrder: inputData.col_names
    },
    matrix: output_matrix
  };
}
207
/**
 * Standardize a list of numbers to z-scores using the population
 * standard deviation (divisor is the list length).
 *
 * @param {number[]} l - values to standardize
 * @returns {number[]} a new array of z-scores, or `l` itself (same
 *   reference, unchanged) when the standard deviation is zero
 */
function getZscore(l) {
  const count = l.length;

  let total = 0;
  for (const v of l) total = total + v;
  const mean = total / count;

  let squaredDeviation = 0;
  for (const v of l) squaredDeviation = squaredDeviation + Math.pow(v - mean, 2);
  const sd = Math.sqrt(squaredDeviation / count);

  // With no spread there is nothing to scale by; the input is handed back untouched.
  if (sd == 0) return l;

  const scaled = [];
  for (const v of l) scaled.push((v - mean) / sd);
  return scaled;
}
215
/**
 * Prepare `ds.queries.geneExpression` for use, dispatching on its source.
 *
 * Does nothing when the dataset has no geneExpression query. Otherwise
 * resets `geneExpression2bins` and, unless a `.get()` function is already
 * present, hands off to the validator matching `.src`.
 *
 * @param {Object} ds - dataset; reads and mutates `ds.queries.geneExpression`
 * @param {Object} genome - passed through to the GDC validator
 * @returns {Promise<void>}
 * @throws {string} when `.src` is neither "gdcapi" nor "native"
 */
async function validate_query_geneExpression(ds, genome) {
  const geQuery = ds.queries.geneExpression;
  if (!geQuery) return;

  geQuery.geneExpression2bins = {};

  // A dataset-supplied getter needs no further setup.
  const hasOwnGetter = typeof geQuery.get == "function";
  if (hasOwnGetter) return;

  if (geQuery.src == "gdcapi") {
    gdc_validate_query_geneExpression(ds, genome);
  } else if (geQuery.src == "native") {
    await validateNative(geQuery, ds);
  } else {
    throw "unknown queries.geneExpression.src";
  }
}
230
/**
 * Send a query to the `readH5` rust program and return its raw output.
 *
 * @param {string} hdf5_file - path of the HDF5 file to read
 * @param {*} query - query payload forwarded as-is (callers in this file pass a list of names)
 * @param {string|null} [read_mode] - forwarded as `read_mode` only when truthy
 * @returns {Promise<string>} the non-empty response from `run_rust`
 * @throws {Error} when the response is empty or missing; errors from `run_rust` are rethrown
 */
async function queryHDF5(hdf5_file, query, read_mode) {
  const payload = read_mode ? { hdf5_file, query, read_mode } : { hdf5_file, query };
  try {
    const response = await run_rust("readH5", JSON.stringify(payload));
    const isEmpty = !response || response.length === 0;
    if (isEmpty) {
      throw new Error("Failed to retrieve expression data: Empty or missing response");
    }
    return response;
  } catch (error) {
    // Log which query failed, then let the caller deal with the error itself.
    console.error(`Error querying HDF5 for ${query}`);
    throw error;
  }
}
247
/**
 * Validate a native geneExpression HDF5 file and attach a `.get()` getter to `q`.
 *
 * Resolves `q.file` against `serverconfig.tpmasterdir`, asks `readH5` to
 * validate the file, and fills `q.samples` with the ids of the samples it
 * lists. A sample name with no id is an error when `ds.cohort.db` is set and
 * is otherwise logged and skipped.
 *
 * @param {Object} q - the geneExpression query object; mutated (`file`, `samples`, `get`)
 * @param {Object} ds - dataset; reads `label`, `cohort.db`, `cohort.termdb.q`
 * @returns {Promise<void>}
 * @throws {string} `${ds.label}: Failed to validate geneExpression HDF5 file: ...` on any validation failure
 */
async function validateNative(q, ds) {
  q.file = path.join(serverconfig.tpmasterdir, q.file);
  q.samples = [];
  try {
    await utils.file_is_readable(q.file);
    const tmp = await run_rust("readH5", JSON.stringify({ hdf5_file: q.file, validate: true }));
    const vr = JSON.parse(tmp);
    if (vr.status !== "success") throw vr.message;
    if (!vr.samples?.length) throw "HDF5 file has no samples, please check file.";
    const unknownSamples = /* @__PURE__ */ new Set();
    for (const sn of vr.samples) {
      const si = ds.cohort.termdb.q.sampleName2id(sn);
      if (si === void 0) {
        if (ds.cohort.db) {
          throw `unknown sample ${sn} from HDF5 ${q.file}`;
        } else {
          unknownSamples.add(sn);
          continue;
        }
      }
      q.samples.push(si);
    }
    console.log(`${ds.label}: geneExpression HDF5 file validated. Format: ${vr.format}, Samples:`, q.samples.length);
    if (unknownSamples.size) {
      const arr = [...unknownSamples];
      console.log(`unknown samples from geneExpression HDF5 file (${arr.length}): ${arr.join(", ")}`);
    }
  } catch (error) {
    throw `${ds.label}: Failed to validate geneExpression HDF5 file: ${error}`;
  }

  // Reference record for one sample id: the dataset's own refs when it
  // provides `id2sampleRefs`, otherwise just the sample name as label.
  const sampleRefsFor = (sid) => {
    if (ds.cohort?.termdb?.q?.id2sampleRefs) return ds.cohort.termdb.q.id2sampleRefs(sid);
    return { label: ds.cohort.termdb.q.id2sampleName(sid) };
  };

  /**
   * Fetch expression values for the genes in `param.terms`.
   *
   * @param {Object} param - reads `terms` (termwrappers with `$id` and `term.gene`) and `dslabel`
   * @returns {Promise<{term2sample2value: Map, byTermId: Object, bySampleId: Object}>}
   * @throws {string} when none of the requested genes has any usable value
   */
  q.get = async (param) => {
    const limitSamples = await mayLimitSamples(param, q.samples, ds);
    if (limitSamples?.size == 0) {
      return { term2sample2value: /* @__PURE__ */ new Map(), byTermId: {}, bySampleId: {} };
    }

    // Describe either the restricted sample set or every validated sample.
    const bySampleId = {};
    for (const sid of limitSamples || q.samples || []) {
      bySampleId[sid] = sampleRefsFor(sid);
    }

    const term2sample2value = /* @__PURE__ */ new Map();
    const byTermId = {};
    const geneNames = param.terms.filter((tw) => tw.term.gene).map((tw) => tw.term.gene);
    if (geneNames.length === 0) {
      console.log("No genes to query");
      // Same shape as every other return of this getter.
      return { term2sample2value, byTermId, bySampleId };
    }

    const time1 = Date.now();
    // NOTE(review): bulk read mode is selected by a hard-coded dataset label — confirm this is intended.
    const readMode = param.dslabel == "MMRF" ? "bulk" : null;
    const geneData = JSON.parse(await queryHDF5(q.file, geneNames, readMode));
    console.log("Time taken to run gene query:", formatElapsedTime(Date.now() - time1));

    // A response without query_output is treated as "no gene returned data";
    // the final check below then reports the failure.
    const genesData = geneData.query_output || {};
    for (const tw of param.terms) {
      if (!tw.term.gene) continue;
      const geneResult = genesData[tw.term.gene];
      if (!geneResult) {
        console.warn(`No data found for gene ${tw.term.gene} in the response`);
        continue;
      }
      const samplesData = geneResult.samples || {};
      const s2v = {};
      for (const sampleName in samplesData) {
        const sampleId = ds.cohort.termdb.q.sampleName2id(sampleName);
        // Only a missing id means "unknown sample"; 0 is kept, matching the
        // `=== undefined` test used during validation above.
        if (sampleId == null) continue;
        if (limitSamples && !limitSamples.has(sampleId)) continue;
        if (!Number.isFinite(samplesData[sampleName])) continue;
        s2v[sampleId] = samplesData[sampleName];
      }
      if (Object.keys(s2v).length) {
        term2sample2value.set(tw.$id, s2v);
      }
    }
    if (term2sample2value.size == 0) {
      throw "No data available for the input " + param.terms?.map((tw) => tw.term.gene).join(", ");
    }
    return { term2sample2value, byTermId, bySampleId };
  };
}
345
/**
 * Prepare `ds.queries.isoformExpression` for use.
 *
 * Does nothing when the dataset has no isoformExpression query. Otherwise
 * resets `geneExpression2bins` on it and, unless a `.get()` function is
 * already present, validates the native HDF5 file named by `.file`.
 *
 * @param {Object} ds - dataset; reads and mutates `ds.queries.isoformExpression`
 * @param {Object} _genome - unused
 * @returns {Promise<void>}
 * @throws {string} when the query has neither `.get()` nor `.file`
 */
async function validateQueryIsoformExpression(ds, _genome) {
  const isoQuery = ds.queries.isoformExpression;
  if (!isoQuery) return;

  isoQuery.geneExpression2bins = {};

  // A dataset-supplied getter needs no further setup.
  const hasOwnGetter = typeof isoQuery.get == "function";
  if (hasOwnGetter) return;

  if (!isoQuery.file) throw "isoformExpression requires either .get() or .file";
  await validateNativeIsoform(isoQuery, ds);
}
356
/**
 * Validate a native isoformExpression HDF5 file and attach a `.get()` getter to `q`.
 *
 * Resolves `q.file` against `serverconfig.tpmasterdir`, asks `readH5` to
 * validate the file (including its item list), fills `q.samples` with the
 * ids of the samples it lists and `q.availableItems` with the reported
 * items. A sample name with no id is an error when `ds.cohort.db` is set
 * and is otherwise skipped.
 *
 * @param {Object} q - the isoformExpression query object; mutated (`file`, `samples`, `availableItems`, `get`)
 * @param {Object} ds - dataset; reads `label`, `cohort.db`, `cohort.termdb.q`
 * @returns {Promise<void>}
 * @throws {string} `${ds.label}: Failed to validate isoformExpression HDF5 file: ...` on any validation failure
 */
async function validateNativeIsoform(q, ds) {
  q.file = path.join(serverconfig.tpmasterdir, q.file);
  q.samples = [];
  try {
    await utils.file_is_readable(q.file);
    const tmp = await run_rust("readH5", JSON.stringify({ hdf5_file: q.file, validate: true, include_items: true }));
    const vr = JSON.parse(tmp);
    if (vr.status !== "success") throw vr.message;
    if (!vr.samples?.length) throw "HDF5 file has no samples, please check file.";
    for (const sn of vr.samples) {
      const si = ds.cohort.termdb.q.sampleName2id(sn);
      if (si == void 0) {
        if (ds.cohort.db) {
          throw `unknown sample ${sn} from HDF5 ${q.file}`;
        } else {
          continue;
        }
      }
      q.samples.push(si);
    }
    q.availableItems = vr.items || [];
    console.log(
      `${ds.label}: isoformExpression HDF5 file validated. Format: ${vr.format}, Samples:`,
      q.samples.length,
      "Items:",
      q.availableItems.length
    );
  } catch (error) {
    throw `${ds.label}: Failed to validate isoformExpression HDF5 file: ${error}`;
  }

  // Reference record for one sample id: the dataset's own refs when it
  // provides `id2sampleRefs`, otherwise just the sample name as label.
  const sampleRefsFor = (sid) => {
    if (ds.cohort?.termdb?.q?.id2sampleRefs) return ds.cohort.termdb.q.id2sampleRefs(sid);
    return { label: ds.cohort.termdb.q.id2sampleName(sid) };
  };

  /**
   * Fetch expression values for the isoforms in `param.terms`.
   *
   * @param {Object} param - reads `terms` (termwrappers with `$id` and `term.isoform`)
   * @returns {Promise<{term2sample2value: Map, byTermId: Object, bySampleId: Object}>}
   * @throws {string} when none of the requested isoforms has any value
   */
  q.get = async (param) => {
    const limitSamples = await mayLimitSamples(param, q.samples, ds);
    if (limitSamples?.size == 0) {
      return { term2sample2value: /* @__PURE__ */ new Map(), byTermId: {}, bySampleId: {} };
    }

    // Describe either the restricted sample set or every validated sample.
    const bySampleId = {};
    for (const sid of limitSamples || q.samples || []) {
      bySampleId[sid] = sampleRefsFor(sid);
    }

    const term2sample2value = /* @__PURE__ */ new Map();
    const byTermId = {};
    const isoformIds = param.terms.filter((tw) => tw.term.isoform).map((tw) => tw.term.isoform);
    if (isoformIds.length === 0) {
      console.log("No isoforms to query");
      // Same shape as every other return of this getter.
      return { term2sample2value, byTermId, bySampleId };
    }

    const time1 = Date.now();
    const isoformData = JSON.parse(await queryHDF5(q.file, isoformIds, null));
    console.log("Time taken to run isoform query:", formatElapsedTime(Date.now() - time1));

    // A response without query_output is treated as "no isoform returned
    // data"; the final check below then reports the failure.
    const isoformsData = isoformData.query_output || {};
    for (const tw of param.terms) {
      if (!tw.term.isoform) continue;
      const isoformResult = isoformsData[tw.term.isoform];
      if (!isoformResult) {
        console.warn(`No data found for isoform ${tw.term.isoform} in the response`);
        continue;
      }
      const samplesData = isoformResult.samples || {};
      const s2v = {};
      for (const sampleName in samplesData) {
        const sampleId = ds.cohort.termdb.q.sampleName2id(sampleName);
        // Only a missing id means "unknown sample"; 0 is kept, matching the
        // nullish test used during validation above.
        if (sampleId == null) continue;
        if (limitSamples && !limitSamples.has(sampleId)) continue;
        // NOTE(review): unlike the geneExpression getter, non-finite values are
        // not filtered out here — confirm whether that difference is intended.
        s2v[sampleId] = samplesData[sampleName];
      }
      if (Object.keys(s2v).length) {
        term2sample2value.set(tw.$id, s2v);
      }
    }
    if (term2sample2value.size == 0) {
      throw "No data available for the input " + param.terms?.map((tw) => tw.term.isoform).join(", ");
    }
    return { term2sample2value, byTermId, bySampleId };
  };
}
452
- export {
453
- init,
454
- validateQueryIsoformExpression,
455
- validate_query_geneExpression
456
- };