@sjcrh/proteinpaint-server 2.173.0 → 2.174.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -5
- package/routes/grin2.js +173 -95
- package/routes/termdb.chat.js +410 -52
- package/routes/termdb.cluster.js +2 -1
- package/routes/termdb.config.js +1 -0
- package/routes/termdb.runChart.js +34 -28
- package/src/app.js +860 -333
- package/src/mds3.gdc.filter.js +58 -64
package/routes/termdb.chat.js
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import fs from "fs";
|
|
2
2
|
import { ezFetch } from "#shared";
|
|
3
|
+
import { get_samples } from "#src/termdb.sql.js";
|
|
3
4
|
import { ChatPayload } from "#types/checkers";
|
|
4
5
|
import serverconfig from "../src/serverconfig.js";
|
|
5
6
|
import { mayLog } from "#src/helpers.ts";
|
|
6
7
|
import Database from "better-sqlite3";
|
|
7
8
|
import { formatElapsedTime } from "#shared";
|
|
9
|
+
const num_filter_cutoff = 3;
|
|
8
10
|
const api = {
|
|
9
11
|
endpoint: "termdb/chat",
|
|
10
12
|
methods: {
|
|
@@ -74,7 +76,19 @@ function init({ genomes }) {
|
|
|
74
76
|
);
|
|
75
77
|
mayLog("Time taken for summary agent:", formatElapsedTime(Date.now() - time12));
|
|
76
78
|
} else if (classResult == "dge") {
|
|
77
|
-
|
|
79
|
+
const time12 = (/* @__PURE__ */ new Date()).valueOf();
|
|
80
|
+
ai_output_json = await extract_DE_search_terms_from_query(
|
|
81
|
+
q.prompt,
|
|
82
|
+
serverconfig.llm_backend,
|
|
83
|
+
comp_model_name,
|
|
84
|
+
apilink,
|
|
85
|
+
dataset_db,
|
|
86
|
+
dataset_json,
|
|
87
|
+
ds
|
|
88
|
+
);
|
|
89
|
+
mayLog("Time taken for DE agent:", formatElapsedTime(Date.now() - time12));
|
|
90
|
+
} else if (classResult == "survival") {
|
|
91
|
+
ai_output_json = { type: "html", html: "survival agent has not been implemented yet" };
|
|
78
92
|
} else {
|
|
79
93
|
ai_output_json = { type: "html", html: "Unknown classification value" };
|
|
80
94
|
}
|
|
@@ -163,6 +177,10 @@ async function call_sj_llm(prompt, model_name, apilink) {
|
|
|
163
177
|
throw "SJ API request failed:" + error;
|
|
164
178
|
}
|
|
165
179
|
}
|
|
180
|
+
/**
 * Normalizes an optional prompt fragment so it can be concatenated safely.
 *
 * @param {*} sentence - Optional text taken from the dataset JSON.
 * @returns {*} `sentence` itself when it is truthy, otherwise an empty string.
 */
function checkField(sentence) {
  return sentence ? sentence : "";
}
|
|
166
184
|
async function readJSONFile(file) {
|
|
167
185
|
const json_file = await fs.promises.readFile(file);
|
|
168
186
|
return JSON.parse(json_file.toString());
|
|
@@ -176,10 +194,12 @@ async function classify_query_by_dataset_type(user_prompt, comp_model_name, llm_
|
|
|
176
194
|
}
|
|
177
195
|
}
|
|
178
196
|
const classification_ds = dataset_json.charts.filter((chart) => chart.type == "Classification");
|
|
197
|
+
if (classification_ds.length == 0) throw "Classification information is not present in the dataset file.";
|
|
198
|
+
if (classification_ds[0].TrainingData.length == 0) throw "No training data is provided for the classification agent.";
|
|
179
199
|
let train_iter = 0;
|
|
180
200
|
let training_data = "";
|
|
181
201
|
if (classification_ds.length > 0 && classification_ds[0].TrainingData.length > 0) {
|
|
182
|
-
contents += classification_ds.SystemPrompt;
|
|
202
|
+
contents += checkField(dataset_json.DatasetPrompt) + checkField(classification_ds[0].SystemPrompt);
|
|
183
203
|
for (const train_data of classification_ds[0].TrainingData) {
|
|
184
204
|
train_iter += 1;
|
|
185
205
|
training_data += "Example question" + train_iter.toString() + ": " + train_data.question + " Example answer" + train_iter.toString() + ":" + JSON.stringify(train_data.answer) + " ";
|
|
@@ -194,25 +214,307 @@ async function classify_query_by_dataset_type(user_prompt, comp_model_name, llm_
|
|
|
194
214
|
} else {
|
|
195
215
|
throw "Unknown LLM backend";
|
|
196
216
|
}
|
|
197
|
-
mayLog("response:", response);
|
|
198
217
|
return JSON.parse(response);
|
|
199
218
|
}
|
|
219
|
+
/**
 * Asks the configured LLM to turn a free-text prompt into the two sample
 * groups (and optional method) of a differential gene expression analysis,
 * then validates the answer against the dataset.
 *
 * @param {string} prompt - The user's question.
 * @param {string} llm_backend_type - Either "SJ" or "ollama".
 * @param {string} comp_model_name - Name of the completion model.
 * @param {string} apilink - URL of the LLM service.
 * @param {string} dataset_db - Path of the dataset sqlite db.
 * @param {object} dataset_json - Dataset agent configuration (reads `hasDE`,
 *   `charts`, `DatasetPrompt`).
 * @param {object} ds - Server-side dataset object, passed on to validation.
 * @returns {Promise<object>} Result of `validate_DE_response`, or an
 *   `{ type: "html", html }` notice when the dataset has no DE support.
 * @throws {string} When the DE chart entry or its training data is missing,
 *   or when the LLM backend is unknown.
 */
async function extract_DE_search_terms_from_query(prompt, llm_backend_type, comp_model_name, apilink, dataset_db, dataset_json, ds) {
  if (!dataset_json.hasDE) {
    return { type: "html", html: "Differential gene expression not supported for this dataset" };
  }

  // Validate the dataset configuration before opening the sqlite db, so a
  // misconfigured dataset fails fast with the intended message instead of a
  // TypeError on a missing "charts" or "TrainingData" field.
  const charts = Array.isArray(dataset_json.charts) ? dataset_json.charts : [];
  const DE_ds = charts.filter((chart) => chart.type == "DE");
  if (DE_ds.length == 0) throw "DE information is not present in the dataset file.";
  const training_examples = Array.isArray(DE_ds[0].TrainingData) ? DE_ds[0].TrainingData : [];
  if (training_examples.length == 0) throw "No training data is provided for the DE agent.";

  const dataset_db_output = await parse_dataset_db(dataset_db);

  // JSON schema the LLM answer must follow; it is serialized into the prompt.
  const Schema = {
    $schema: "http://json-schema.org/draft-07/schema#",
    $ref: "#/definitions/DEType",
    definitions: {
      DEType: {
        type: "object",
        properties: {
          group1: {
            type: "array",
            items: { $ref: "#/definitions/FilterTerm" },
            description: "Name of group1 which is an array of filter terms"
          },
          group2: {
            type: "array",
            items: { $ref: "#/definitions/FilterTerm" },
            description: "Name of group2 which is an array of filter terms"
          },
          method: {
            type: "string",
            enum: ["edgeR", "limma", "wilcoxon"],
            description: "Method used for carrying out differential gene expression analysis"
          }
        },
        required: ["group1", "group2"],
        additionalProperties: false
      },
      FilterTerm: {
        anyOf: [{ $ref: "#/definitions/CategoricalFilterTerm" }, { $ref: "#/definitions/NumericFilterTerm" }]
      },
      CategoricalFilterTerm: {
        type: "object",
        properties: {
          term: { type: "string", description: "Name of categorical term" },
          category: { type: "string", description: "The category of the term" },
          join: {
            type: "string",
            enum: ["and", "or"],
            description: "join term to be used only when there is more than one filter term and should be placed from the 2nd filter term onwards describing how it connects to the previous term"
          }
        },
        required: ["term", "category"],
        additionalProperties: false
      },
      NumericFilterTerm: {
        type: "object",
        properties: {
          term: { type: "string", description: "Name of numeric term" },
          start: { type: "number", description: "start position (or lower limit) of numeric term" },
          stop: { type: "number", description: "stop position (or upper limit) of numeric term" },
          join: {
            type: "string",
            enum: ["and", "or"],
            description: "join term to be used only when there is more than one filter term and should be placed from the 2nd filter term onwards describing how it connects to the previous term"
          }
        },
        required: ["term"],
        additionalProperties: false
      }
    }
  };

  // Few-shot examples, numbered from 1.
  let train_iter = 0;
  let training_data = "";
  for (const train_data of training_examples) {
    train_iter += 1;
    training_data += "Example question" + train_iter.toString() + ": " + train_data.question + " Example answer" + train_iter.toString() + ":" + JSON.stringify(train_data.answer) + " ";
  }

  const system_prompt = "I am an assistant that extracts the groups from the user prompt to carry out differential gene expression. The final output must be in the following JSON with NO extra comments. The schema is as follows: " + JSON.stringify(Schema) + ' . "group1" and "group2" fields are compulsory. Both "group1" and "group2" consist of an array of filter variables. There are two kinds of filter variables: "Categorical" and "Numeric". "Categorical" variables are those variables which can have a fixed set of values e.g. gender, race. They are defined by the "CategoricalFilterTerm" which consists of "term" (a field from the sqlite3 db) and "category" (a value of the field from the sqlite db). "Numeric" variables are those which can have any numeric value. They are defined by "NumericFilterTerm" and contain the subfields "term" (a field from the sqlite3 db), "start" an optional filter which is defined when a lower cutoff is defined in the user input for the numeric variable and "stop" an optional filter which is defined when a higher cutoff is defined in the user input for the numeric variable. ' + // May consider deprecating this natural language description after unit tests are implemented
  checkField(dataset_json.DatasetPrompt) + checkField(DE_ds[0].SystemPrompt) + "The sqlite db in plain language is as follows:\n" + dataset_db_output.rag_docs.join(",") + " training data is as follows:" + training_data + " Question: {" + prompt + "} answer:";

  let response;
  if (llm_backend_type == "SJ") {
    response = await call_sj_llm(system_prompt, comp_model_name, apilink);
  } else if (llm_backend_type == "ollama") {
    response = await call_ollama(system_prompt, comp_model_name, apilink);
  } else {
    throw "Unknown LLM backend";
  }
  return await validate_DE_response(response, ds, dataset_db_output.db_rows);
}
|
|
306
|
+
/**
 * Validates the LLM's differential-expression answer and converts it into
 * either a volcano plot configuration or an HTML error message.
 *
 * @param {string} response - Raw JSON text returned by the LLM.
 * @param {object} ds - Server-side dataset object, used for filter validation
 *   and sample lookup.
 * @param {Array<object>} db_rows - Parsed term rows, used to label groups.
 * @returns {Promise<object>} `{ type: "plot", plot }` when both groups are
 *   valid, otherwise `{ type: "html", html }` listing the problems.
 * @throws When `response` is not valid JSON, or (from `validate_filter`) when
 *   a group is present but is not an array.
 */
async function validate_DE_response(response, ds, db_rows) {
  const response_type = JSON.parse(response);
  let html = "";

  // Resolve both groups the same way. They are processed one after the other
  // so the order of error messages and sample queries is unchanged.
  const resolved = [];
  for (const key of ["group1", "group2"]) {
    const entry = { name: "", group: undefined, samples: undefined };
    resolved.push(entry);
    const filters = response_type[key];
    if (!filters) {
      html += key + " not present in DE output";
      continue;
    }
    // Only derive a name from a real array: a missing or malformed group must
    // reach the checks here and in validate_filter instead of crashing with
    // "not iterable" inside generate_group_name.
    if (Array.isArray(filters)) entry.name = generate_group_name(filters, db_rows);
    const validated_filters = validate_filter(filters, ds, entry.name);
    if (validated_filters.html.length > 0) {
      html += validated_filters.html;
      continue;
    }
    const samples = await get_samples({ filter: validated_filters.simplefilter }, ds, true);
    entry.samples = samples.map((item) => ({
      sampleId: item.id,
      sample: item.name
    }));
    entry.group = {
      name: entry.name,
      in: true,
      values: entry.samples
    };
  }
  const [first, second] = resolved;
  const name1 = first.name;
  const name2 = second.name;

  // The method is optional; when given it must be one of the supported ones.
  let settings;
  if (response_type.method) {
    if (response_type.method == "edgeR" || response_type.method == "limma" || response_type.method == "wilcoxon") {
      settings = { volcano: { method: response_type.method } };
    } else {
      html += "Unknown DE method: " + response_type.method;
    }
  }

  if (html.length > 0) {
    // Remove duplicated statements in error message
    html = removeLastOccurrence(
      html,
      "For now, the maximum number of filter terms supported through the chatbot is " + num_filter_cutoff
    );
    return { type: "html", html };
  }

  const pp_plot_json = { childType: "volcano", termType: "geneExpression", chartType: "differentialAnalysis" };
  const groups = [first.group, second.group];
  const tw = {
    q: {
      groups
    },
    term: {
      name: name1 + " vs " + name2,
      type: "samplelst",
      values: {
        [name1]: {
          color: "purple",
          key: name1,
          label: name1,
          list: first.samples
        },
        [name2]: {
          color: "blue",
          key: name2,
          label: name2,
          list: second.samples
        }
      }
    }
  };
  pp_plot_json.state = {
    customTerms: [
      {
        name: name1 + " vs " + name2,
        tw
      }
    ],
    groups
  };
  pp_plot_json.samplelst = { groups };
  pp_plot_json.tw = tw;
  pp_plot_json.settings = settings;
  return { type: "plot", plot: pp_plot_json };
}
|
|
409
|
+
/**
 * Builds a short display name for a sample group from its filter terms,
 * e.g. "Male&age>=10&age<=20" or "Male|Female".
 *
 * @param {Array<object>} filters - Filter terms from the LLM answer; each may
 *   carry `term`, `category`, `start`, `stop` and `join` ("and" | "or").
 * @param {Array<object>} db_rows - Parsed term rows used to look up category
 *   labels.
 * @returns {string} The group name; empty when `filters` is not an array.
 */
function generate_group_name(filters, db_rows) {
  // A missing or malformed group is reported by the caller; do not crash here.
  if (!Array.isArray(filters)) return "";

  const isGiven = (bound) => bound !== undefined && bound !== null && bound !== "";
  let name = "";
  let iter = 0;
  for (const filter of filters) {
    if (!filter || typeof filter != "object") continue;

    // A joiner only makes sense between two terms, so it is never emitted
    // before the first one even if the LLM put a "join" on it.
    if (iter > 0) {
      name += filter.join == "or" ? "|" : "&";
    }

    const fragments = [];
    if (isGiven(filter.category)) {
      fragments.push(find_label(filter, db_rows));
    }
    // Explicit presence checks: 0 is a legitimate numeric cutoff.
    if (isGiven(filter.start)) {
      fragments.push(filter.term + ">=" + filter.start.toString());
    }
    if (isGiven(filter.stop)) {
      fragments.push(filter.term + "<=" + filter.stop.toString());
    }
    // A closed range contributes two conditions that both apply.
    name += fragments.join("&");
    iter += 1;
  }
  return name;
}
|
|
435
|
+
/**
 * Looks up the human-readable label of a categorical filter's category.
 *
 * The category may be given either as the stored key or as the label itself
 * (generate_filter_term accepts both), so both are tried; key matches win.
 *
 * @param {object} filter - Filter term with `term` and `category`.
 * @param {Array<object>} db_rows - Term rows shaped like
 *   `{ name, values: [{ key, value: { label } }] }`.
 * @returns {string} The matching label, or the category itself when no label
 *   is found, so the derived group name is never empty.
 */
function find_label(filter, db_rows) {
  const wanted = filter.category === undefined || filter.category === null ? "" : String(filter.category);
  const rows = Array.isArray(db_rows) ? db_rows : [];
  const row = rows.find((candidate) => candidate.name == filter.term);
  const labelled = row && Array.isArray(row.values)
    ? row.values.filter((entry) => entry.value && entry.value.label)
    : [];

  const byKey = labelled.find((entry) => String(entry.key) === wanted);
  if (byKey) return byKey.value.label;

  const byLabel = labelled.find((entry) => String(entry.value.label) === wanted);
  if (byLabel) return byLabel.value.label;

  return wanted;
}
|
|
200
450
|
async function extract_summary_terms(prompt, llm_backend_type, comp_model_name, apilink, dataset_db, dataset_json, genedb, ds) {
|
|
201
|
-
const
|
|
451
|
+
const dataset_db_output = await parse_dataset_db(dataset_db);
|
|
202
452
|
const genes_list = await parse_geneset_db(genedb);
|
|
203
|
-
const
|
|
453
|
+
const Schema = {
|
|
454
|
+
$schema: "http://json-schema.org/draft-07/schema#",
|
|
455
|
+
$ref: "#/definitions/SummaryType",
|
|
456
|
+
definitions: {
|
|
457
|
+
SummaryType: {
|
|
458
|
+
type: "object",
|
|
459
|
+
properties: {
|
|
460
|
+
term: { type: "string", description: "Name of 1st term" },
|
|
461
|
+
term2: { type: "string", description: "Name of 2nd term" },
|
|
462
|
+
simpleFilter: {
|
|
463
|
+
type: "array",
|
|
464
|
+
items: { $ref: "#/definitions/FilterTerm" },
|
|
465
|
+
description: "Optional simple filter terms"
|
|
466
|
+
}
|
|
467
|
+
},
|
|
468
|
+
required: ["term", "simpleFilter"],
|
|
469
|
+
additionalProperties: false
|
|
470
|
+
},
|
|
471
|
+
FilterTerm: {
|
|
472
|
+
anyOf: [{ $ref: "#/definitions/CategoricalFilterTerm" }, { $ref: "#/definitions/NumericFilterTerm" }]
|
|
473
|
+
},
|
|
474
|
+
CategoricalFilterTerm: {
|
|
475
|
+
type: "object",
|
|
476
|
+
properties: {
|
|
477
|
+
term: { type: "string", description: "Name of categorical term" },
|
|
478
|
+
category: { type: "string", description: "The category of the term" },
|
|
479
|
+
join: {
|
|
480
|
+
type: "string",
|
|
481
|
+
enum: ["and", "or"],
|
|
482
|
+
description: "join term to be used only when there there is more than one filter term and should be placed in the 2nd filter term describing how it connects to the 1st term"
|
|
483
|
+
}
|
|
484
|
+
},
|
|
485
|
+
required: ["term", "category"],
|
|
486
|
+
additionalProperties: false
|
|
487
|
+
},
|
|
488
|
+
NumericFilterTerm: {
|
|
489
|
+
type: "object",
|
|
490
|
+
properties: {
|
|
491
|
+
term: { type: "string", description: "Name of numeric term" },
|
|
492
|
+
start: { type: "number", description: "start position (or lower limit) of numeric term" },
|
|
493
|
+
stop: { type: "number", description: "stop position (or upper limit) of numeric term" },
|
|
494
|
+
join: {
|
|
495
|
+
type: "string",
|
|
496
|
+
enum: ["and", "or"],
|
|
497
|
+
description: "join term to be used only when there there is more than one filter term and should be placed in the 2nd filter term describing how it connects to the 1st term"
|
|
498
|
+
}
|
|
499
|
+
},
|
|
500
|
+
required: ["term"],
|
|
501
|
+
additionalProperties: false
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
};
|
|
204
505
|
const words = prompt.replace(/[^a-zA-Z0-9\s]/g, "").split(/\s+/).map((str) => str.toLowerCase());
|
|
205
506
|
const common_genes = words.filter((item) => genes_list.includes(item));
|
|
206
507
|
const summary_ds = dataset_json.charts.filter((chart) => chart.type == "Summary");
|
|
207
|
-
if (summary_ds.length == 0) throw "
|
|
208
|
-
if (summary_ds[0].TrainingData.length == 0) throw "
|
|
508
|
+
if (summary_ds.length == 0) throw "Summary information is not present in the dataset file.";
|
|
509
|
+
if (summary_ds[0].TrainingData.length == 0) throw "No training data is provided for the summary agent.";
|
|
209
510
|
let train_iter = 0;
|
|
210
511
|
let training_data = "";
|
|
211
512
|
for (const train_data of summary_ds[0].TrainingData) {
|
|
212
513
|
train_iter += 1;
|
|
213
514
|
training_data += "Example question" + train_iter.toString() + ": " + train_data.question + " Example answer" + train_iter.toString() + ":" + JSON.stringify(train_data.answer) + " ";
|
|
214
515
|
}
|
|
215
|
-
let system_prompt = "I am an assistant that extracts the summary terms from user query. The final output must be in the following JSON format with NO extra comments. The JSON schema is as follows: " +
|
|
516
|
+
let system_prompt = "I am an assistant that extracts the summary terms from user query. The final output must be in the following JSON format with NO extra comments. The JSON schema is as follows: " + JSON.stringify(Schema) + ' term and term2 (if present) should ONLY contain names of the fields from the sqlite db. The "simpleFilter" field is optional and should contain an array of JSON terms with which the dataset will be filtered. A variable simultaneously CANNOT be part of both "term"/"term2" and "simpleFilter". There are two kinds of filter variables: "Categorical" and "Numeric". "Categorical" variables are those variables which can have a fixed set of values e.g. gender, race. They are defined by the "CategoricalFilterTerm" which consists of "term" (a field from the sqlite3 db) and "category" (a value of the field from the sqlite db). "Numeric" variables are those which can have any numeric value. They are defined by "NumericFilterTerm" and contain the subfields "term" (a field from the sqlite3 db), "start" an optional filter which is defined when a lower cutoff is defined in the user input for the numeric variable and "stop" an optional filter which is defined when a higher cutoff is defined in the user input for the numeric variable. ' + // May consider deprecating this natural language description after unit tests are implemented
|
|
517
|
+
checkField(dataset_json.DatasetPrompt) + checkField(summary_ds[0].SystemPrompt) + "\n The DB content is as follows: " + dataset_db_output.rag_docs.join(",") + " training data is as follows:" + training_data;
|
|
216
518
|
if (dataset_json.hasGeneExpression) {
|
|
217
519
|
if (common_genes.length > 0) {
|
|
218
520
|
system_prompt += "\n List of relevant genes are as follows (separated by comma(,)):" + common_genes.join(",");
|
|
@@ -250,7 +552,7 @@ function validate_summary_response(response, common_genes, dataset_json, ds) {
|
|
|
250
552
|
}
|
|
251
553
|
}
|
|
252
554
|
if (response_type.simpleFilter && response_type.simpleFilter.length > 0) {
|
|
253
|
-
const validated_filters = validate_filter(response_type.simpleFilter, ds);
|
|
555
|
+
const validated_filters = validate_filter(response_type.simpleFilter, ds, "");
|
|
254
556
|
if (validated_filters.html.length > 0) {
|
|
255
557
|
html += validated_filters.html;
|
|
256
558
|
} else {
|
|
@@ -283,56 +585,112 @@ function validate_term(response_term, common_genes, dataset_json, ds) {
|
|
|
283
585
|
}
|
|
284
586
|
return { term_type, html };
|
|
285
587
|
}
|
|
286
|
-
function
|
|
588
|
+
/**
 * Counts the non-overlapping occurrences of `word` in `str`, scanning from
 * left to right.
 *
 * @param {string} str - Text to search.
 * @param {string} word - Substring to look for.
 * @returns {number} Number of matches; 0 when `word` is empty.
 */
function countOccurrences(str, word) {
  if (word === "") return 0;
  let total = 0;
  // Each search resumes right after the previous match, so matches never overlap.
  for (let at = str.indexOf(word, 0); at !== -1; at = str.indexOf(word, at + word.length)) {
    total += 1;
  }
  return total;
}
|
|
598
|
+
/**
 * Drops the final occurrence of `word` from `str`, but only when `word` does
 * not occur exactly once (by non-overlapping count); a lone occurrence is kept.
 *
 * @param {string} str - Text to clean up.
 * @param {string} word - Phrase whose last occurrence should be removed.
 * @returns {string} `str` without its last `word`, or `str` unchanged when
 *   `word` is absent or present exactly once.
 */
function removeLastOccurrence(str, word) {
  const lastAt = str.lastIndexOf(word);
  // Nothing to remove, or the only occurrence must stay.
  if (lastAt === -1 || countOccurrences(str, word) === 1) return str;
  const head = str.slice(0, lastAt);
  const tail = str.slice(lastAt + word.length);
  return head + tail;
}
|
|
608
|
+
/**
 * Validates the filter terms produced by the LLM and folds them into a single
 * (possibly nested) "tvslst" filter.
 *
 * Up to two terms are combined directly. More terms, up to `num_filter_cutoff`,
 * are folded left to right: the filter built so far is nested as the first
 * entry of the next pair. Larger inputs are rejected with a message.
 *
 * @param {Array<object>} filters - Filter terms from the LLM answer.
 * @param {object} ds - Server-side dataset object used to resolve term ids.
 * @param {string} [group_name=""] - Group the filters belong to; only used to
 *   word the "too many filters" message.
 * @returns {{simplefilter: (object|undefined), html: string}} The combined
 *   filter and the accumulated validation errors (empty when valid).
 * @throws {string} "filter is not array" when `filters` is not an array.
 */
function validate_filter(filters, ds, group_name = "") {
  if (!Array.isArray(filters)) throw "filter is not array";

  if (filters.length <= 2) {
    const direct = generate_filter_term(filters, ds);
    return { simplefilter: direct.simplefilter, html: direct.html };
  }

  if (filters.length > num_filter_cutoff) {
    // The leading sentence must stay exactly as is: validate_DE_response
    // de-duplicates it when both groups exceed the limit.
    let html = "For now, the maximum number of filter terms supported through the chatbot is " + num_filter_cutoff;
    if (group_name && group_name.length > 0) {
      html += " . The number of filter terms for group " + group_name + " is " + filters.length + "\n";
    } else {
      html += ". The number of filter terms for this query is " + filters.length;
    }
    return { simplefilter: undefined, html };
  }

  // Fold pairwise and keep the errors of every step; previously only the
  // errors of the last pair survived, so an invalid early term was dropped
  // silently and the plot was built from an incomplete filter.
  let html = "";
  let combined = filters[0];
  for (let i = 1; i < filters.length; i++) {
    const step = generate_filter_term([combined, filters[i]], ds);
    html += step.html;
    combined = step.simplefilter;
  }
  return { simplefilter: combined, html };
}
|
|
636
|
+
/**
 * Converts a flat list of LLM filter terms (or already-built "tvslst"
 * filters) into one "tvslst" filter object.
 *
 * @param {Array<object>} filters - Entries are either a nested filter
 *   (`type == "tvslst"`, passed through unchanged) or a raw term with `term`,
 *   optional `join`, and either `category` or `start`/`stop`.
 * @param {object} ds - Dataset object; `ds.cohort.termdb.q.termjsonByOneid`
 *   resolves a term id to its definition.
 * @returns {{simplefilter: object, html: string}} The assembled filter and
 *   the concatenated validation errors (empty when every term is valid).
 */
function generate_filter_term(filters, ds) {
  // A bound counts as provided when it is present and numeric; 0 is valid.
  const hasBound = (bound) => bound !== undefined && bound !== null && bound !== "" && Number.isFinite(Number(bound));

  let invalid_html = "";
  const localfilter = { type: "tvslst", in: true, lst: [] };
  for (const f of filters) {
    if (f.type == "tvslst") {
      // Filter assembled by a previous pass; nest it unchanged.
      localfilter.lst.push(f);
      continue;
    }
    const term = ds.cohort.termdb.q.termjsonByOneid(f.term);
    if (!term) {
      invalid_html += "invalid filter id:" + f.term;
      continue;
    }
    if (f.join) {
      // The last term carrying a join decides how this list is combined.
      localfilter.join = f.join;
    }
    if (term.type == "categorical") {
      // The category may be given as the stored key or as its label.
      let cat;
      for (const ck in term.values) {
        if (ck == f.category) cat = ck;
        else if (term.values[ck].label == f.category) cat = ck;
      }
      if (!cat) invalid_html += "invalid category from " + JSON.stringify(f);
      localfilter.lst.push({
        type: "tvs",
        tvs: {
          term,
          values: [{ key: cat }]
        }
      });
    } else if (term.type == "float" || term.type == "integer") {
      const numeric = {
        type: "tvs",
        tvs: {
          term,
          ranges: []
        }
      };
      const range = {};
      const hasStart = hasBound(f.start);
      const hasStop = hasBound(f.stop);
      if (hasStart && !hasStop) {
        range.start = Number(f.start);
        range.stopunbounded = true;
      } else if (hasStop && !hasStart) {
        range.stop = Number(f.stop);
        range.startunbounded = true;
      } else if (hasStart && hasStop) {
        range.start = Number(f.start);
        range.stop = Number(f.stop);
      } else {
        invalid_html += "Neither greater or lesser defined";
      }
      numeric.tvs.ranges.push(range);
      localfilter.lst.push(numeric);
    } else {
      // Report term types this function cannot filter on, rather than
      // ignoring the user's filter without notice.
      invalid_html += "unsupported term type " + term.type + " for filter id:" + f.term;
    }
  }
  if (filters.length > 1 && !localfilter.join) {
    localfilter.join = "and";
  }
  return { simplefilter: localfilter, html: invalid_html };
}
|
|
338
696
|
async function parse_geneset_db(genedb) {
|
|
@@ -354,6 +712,7 @@ async function parse_geneset_db(genedb) {
|
|
|
354
712
|
async function parse_dataset_db(dataset_db) {
|
|
355
713
|
const db = new Database(dataset_db);
|
|
356
714
|
const rag_docs = [];
|
|
715
|
+
const db_rows = [];
|
|
357
716
|
try {
|
|
358
717
|
const desc_rows = db.prepare("SELECT * from termhtmldef").all();
|
|
359
718
|
const description_map = [];
|
|
@@ -364,7 +723,6 @@ async function parse_dataset_db(dataset_db) {
|
|
|
364
723
|
description_map.push({ name, description });
|
|
365
724
|
});
|
|
366
725
|
const term_db_rows = db.prepare("SELECT * from terms").all();
|
|
367
|
-
const db_rows = [];
|
|
368
726
|
term_db_rows.forEach((row) => {
|
|
369
727
|
const found = description_map.find((item) => item.name === row.id);
|
|
370
728
|
if (found) {
|
|
@@ -395,15 +753,15 @@ async function parse_dataset_db(dataset_db) {
|
|
|
395
753
|
} finally {
|
|
396
754
|
db.close();
|
|
397
755
|
}
|
|
398
|
-
return rag_docs;
|
|
756
|
+
return { db_rows, rag_docs };
|
|
399
757
|
}
|
|
400
758
|
function parse_db_rows(db_row) {
|
|
401
|
-
let output_string =
|
|
759
|
+
let output_string = 'Name of the field is:"' + db_row.name + '". This field is of the type:' + db_row.term_type + ". Description: " + db_row.description;
|
|
402
760
|
if (db_row.values.length > 0) {
|
|
403
761
|
output_string += "This field contains the following possible values.";
|
|
404
762
|
for (const value of db_row.values) {
|
|
405
763
|
if (value.value && value.value.label) {
|
|
406
|
-
output_string +=
|
|
764
|
+
output_string += 'The key is "' + value.key + '" and the label is "' + value.value.label + '".';
|
|
407
765
|
}
|
|
408
766
|
}
|
|
409
767
|
}
|
package/routes/termdb.cluster.js
CHANGED
|
@@ -69,7 +69,7 @@ async function getResult(q, ds) {
|
|
|
69
69
|
({ term2sample2value, byTermId, bySampleId } = await getNumericDictTermAnnotation(q, ds));
|
|
70
70
|
} else {
|
|
71
71
|
;
|
|
72
|
-
({ term2sample2value, byTermId, bySampleId, skippedSexChrGenes } = await ds.queries[q.dataType].get(_q));
|
|
72
|
+
({ term2sample2value, byTermId, bySampleId, skippedSexChrGenes } = await ds.queries[q.dataType].get(_q, ds));
|
|
73
73
|
}
|
|
74
74
|
const noValueTerms = [];
|
|
75
75
|
for (const [term, obj] of term2sample2value) {
|
|
@@ -206,6 +206,7 @@ async function validate_query_geneExpression(ds, genome) {
|
|
|
206
206
|
const q = ds.queries.geneExpression;
|
|
207
207
|
if (!q) return;
|
|
208
208
|
q.geneExpression2bins = {};
|
|
209
|
+
if (typeof q.get == "function") return;
|
|
209
210
|
if (q.src == "gdcapi") {
|
|
210
211
|
gdc_validate_query_geneExpression(ds, genome);
|
|
211
212
|
return;
|
package/routes/termdb.config.js
CHANGED
|
@@ -78,6 +78,7 @@ function make(q, req, res, ds, genome) {
|
|
|
78
78
|
if (ds.assayAvailability) c.assayAvailability = ds.assayAvailability;
|
|
79
79
|
if (ds.cohort.correlationVolcano) c.correlationVolcano = ds.cohort.correlationVolcano;
|
|
80
80
|
if (ds.cohort.boxplots) c.boxplots = ds.cohort.boxplots;
|
|
81
|
+
if (tdb.maxGeneVariantGeneSetSize) c.maxGeneVariantGeneSetSize = tdb.maxGeneVariantGeneSetSize;
|
|
81
82
|
addRestrictAncestries(c, tdb);
|
|
82
83
|
addScatterplots(c, ds);
|
|
83
84
|
addMatrixplots(c, ds);
|