@sjcrh/proteinpaint-server 2.172.0 → 2.173.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/package.json +2 -2
  2. package/routes/termdb.chat.js +361 -108
  3. package/src/app.js +377 -124
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sjcrh/proteinpaint-server",
3
- "version": "2.172.0",
3
+ "version": "2.173.0",
4
4
  "type": "module",
5
5
  "description": "a genomics visualization tool for exploring a cohort's genotype and phenotype data",
6
6
  "main": "src/app.js",
@@ -66,7 +66,7 @@
66
66
  "@sjcrh/proteinpaint-r": "2.152.1-0",
67
67
  "@sjcrh/proteinpaint-rust": "2.171.0",
68
68
  "@sjcrh/proteinpaint-shared": "2.171.0-0",
69
- "@sjcrh/proteinpaint-types": "2.172.0",
69
+ "@sjcrh/proteinpaint-types": "2.173.0",
70
70
  "@types/express": "^5.0.0",
71
71
  "@types/express-session": "^1.18.1",
72
72
  "better-sqlite3": "^12.4.1",
@@ -1,8 +1,10 @@
1
+ import fs from "fs";
2
+ import { ezFetch } from "#shared";
1
3
  import { ChatPayload } from "#types/checkers";
2
- import { run_rust } from "@sjcrh/proteinpaint-rust";
3
4
  import serverconfig from "../src/serverconfig.js";
4
5
  import { mayLog } from "#src/helpers.ts";
5
- import { run_python } from "@sjcrh/proteinpaint-python";
6
+ import Database from "better-sqlite3";
7
+ import { formatElapsedTime } from "#shared";
6
8
  const api = {
7
9
  endpoint: "termdb/chat",
8
10
  methods: {
@@ -24,138 +26,389 @@ function init({ genomes }) {
24
26
  if (!g) throw "invalid genome";
25
27
  const ds = g.datasets?.[q.dslabel];
26
28
  if (!ds) throw "invalid dslabel";
27
- if (serverconfig.features.pythonChatBot) {
28
- const chatbot_input2 = {
29
- prompt: q.prompt,
30
- genome: q.genome,
31
- dslabel: q.dslabel
32
- //terms_tsv_path: df
33
- };
34
- try {
35
- const ai_output_data2 = await run_python("chatBot.py", JSON.stringify(chatbot_input2));
36
- res.send(ai_output_data2);
37
- } catch (error) {
38
- const errmsg = "Error running chatBot Python script:" + error;
39
- throw new Error(errmsg);
40
- }
41
- return;
42
- }
43
29
  const serverconfig_ds_entries = serverconfig.genomes.find((genome) => genome.name == q.genome).datasets.find((dslabel) => dslabel.name == ds.label);
44
30
  if (!serverconfig_ds_entries.aifiles) {
45
31
  throw "aifiles are missing for chatbot to work";
46
32
  }
47
33
  let apilink;
48
34
  let comp_model_name;
49
- let embedding_model_name;
50
35
  if (serverconfig.llm_backend == "SJ") {
51
36
  apilink = serverconfig.sj_apilink;
52
37
  comp_model_name = serverconfig.sj_comp_model_name;
53
- embedding_model_name = serverconfig.sj_embedding_model_name;
54
38
  } else if (serverconfig.llm_backend == "ollama") {
55
39
  apilink = serverconfig.ollama_apilink;
56
40
  comp_model_name = serverconfig.ollama_comp_model_name;
57
- embedding_model_name = serverconfig.ollama_embedding_model_name;
58
41
  } else {
59
42
  throw "llm_backend either needs to be 'SJ' or 'ollama'";
60
43
  }
61
- const chatbot_input = {
62
- user_input: q.prompt,
63
- apilink,
64
- tpmasterdir: serverconfig.tpmasterdir,
65
- comp_model_name,
66
- embedding_model_name,
67
- dataset_db: ds.cohort.db.file,
68
- genedb: g.genedb.dbfile,
69
- aiRoute: serverconfig.aiRoute,
70
- // Route file for classifying chat request into various routes
71
- llm_backend_name: serverconfig.llm_backend,
72
- // The type of backend (engine) used for running the embedding and completion model. Currently "SJ" and "Ollama" are supported
73
- aifiles: serverconfig_ds_entries.aifiles,
74
- // Dataset specific data containing data-specific routes, system prompts for agents and few-shot examples
75
- binpath: serverconfig.binpath
76
- };
44
+ const dataset_db = serverconfig.tpmasterdir + "/" + ds.cohort.db.file;
45
+ const genedb = serverconfig.tpmasterdir + "/" + g.genedb.dbfile;
46
+ const dataset_json = await readJSONFile(serverconfig_ds_entries.aifiles);
77
47
  const time1 = (/* @__PURE__ */ new Date()).valueOf();
78
- const classResult = JSON.parse(await run_rust("query_classification", JSON.stringify(chatbot_input)));
79
- const time2 = (/* @__PURE__ */ new Date()).valueOf();
80
- mayLog("Time taken for classification:", time2 - time1, "ms");
81
- let ai_output_data;
48
+ const class_response = await classify_query_by_dataset_type(
49
+ q.prompt,
50
+ comp_model_name,
51
+ serverconfig.llm_backend,
52
+ apilink,
53
+ serverconfig.aiRoute,
54
+ dataset_json
55
+ );
82
56
  let ai_output_json;
83
- if (classResult.route == "summary") {
84
- const time12 = (/* @__PURE__ */ new Date()).valueOf();
85
- ai_output_data = await run_rust("summary_agent", JSON.stringify(chatbot_input));
86
- const time22 = (/* @__PURE__ */ new Date()).valueOf();
87
- mayLog("Time taken for running summary agent:", time22 - time12, "ms");
88
- for (const line of ai_output_data.split("\n")) {
89
- if (line.startsWith("final_output:") == true) {
90
- ai_output_json = JSON.parse(JSON.parse(line.replace("final_output:", "")));
91
- } else {
92
- mayLog(line);
93
- }
57
+ mayLog("Time taken for classification:", formatElapsedTime(Date.now() - time1));
58
+ if (class_response.type == "html") {
59
+ ai_output_json = class_response;
60
+ } else if (class_response.type == "plot") {
61
+ const classResult = class_response.plot;
62
+ mayLog("classResult:", classResult);
63
+ if (classResult == "summary") {
64
+ const time12 = (/* @__PURE__ */ new Date()).valueOf();
65
+ ai_output_json = await extract_summary_terms(
66
+ q.prompt,
67
+ serverconfig.llm_backend,
68
+ comp_model_name,
69
+ apilink,
70
+ dataset_db,
71
+ dataset_json,
72
+ genedb,
73
+ ds
74
+ );
75
+ mayLog("Time taken for summary agent:", formatElapsedTime(Date.now() - time12));
76
+ } else if (classResult == "dge") {
77
+ ai_output_json = { type: "html", html: "DE agent not implemented yet" };
78
+ } else {
79
+ ai_output_json = { type: "html", html: "Unknown classification value" };
94
80
  }
95
- } else if (classResult.route == "dge") {
96
- ai_output_json = { type: "html", html: "DE agent not implemented yet" };
97
81
  } else {
98
- ai_output_json = { type: "html", html: "Unknown classification value" };
99
- }
100
- if (ai_output_json.type == "plot") {
101
- if (typeof ai_output_json.plot != "object") throw ".plot{} missing when .type=plot";
102
- if (ai_output_json.plot.simpleFilter) {
103
- if (!Array.isArray(ai_output_json.plot.simpleFilter)) throw "ai_output_json.plot.simpleFilter is not array";
104
- const localfilter = { type: "tvslst", in: true, join: "", lst: [] };
105
- if (ai_output_json.plot.simpleFilter.length > 1) localfilter.join = "and";
106
- for (const f of ai_output_json.plot.simpleFilter) {
107
- const term = ds.cohort.termdb.q.termjsonByOneid(f.term);
108
- if (!term) throw "invalid term id from simpleFilter[].term";
109
- if (term.type == "categorical") {
110
- let cat;
111
- for (const ck in term.values) {
112
- if (ck == f.category) cat = ck;
113
- else if (term.values[ck].label == f.category) cat = ck;
114
- }
115
- if (!cat) throw "invalid category from " + JSON.stringify(f);
116
- localfilter.lst.push({
117
- type: "tvs",
118
- tvs: {
119
- term,
120
- values: [{ key: cat }]
121
- }
122
- });
123
- } else if (term.type == "float" || term.type == "integer") {
124
- const numeric = {
125
- type: "tvs",
126
- tvs: {
127
- term,
128
- ranges: []
129
- }
130
- };
131
- const range = {};
132
- if (f.gt && !f.lt) {
133
- range.start = Number(f.gt);
134
- range.stopunbounded = true;
135
- } else if (f.lt && !f.gt) {
136
- range.stop = Number(f.lt);
137
- range.startunbounded = true;
138
- } else if (f.gt && f.lt) {
139
- range.start = Number(f.gt);
140
- range.stop = Number(f.lt);
141
- } else {
142
- throw "Neither greater or lesser defined";
143
- }
144
- numeric.tvs.ranges.push(range);
145
- localfilter.lst.push(numeric);
146
- }
147
- }
148
- delete ai_output_json.plot.simpleFilter;
149
- ai_output_json.plot.filter = localfilter;
150
- }
82
+ ai_output_json = {
83
+ type: "html",
84
+ html: "Unknown classification type"
85
+ };
151
86
  }
152
87
  res.send(ai_output_json);
153
88
  } catch (e) {
154
- if (e.stack) console.log(e.stack);
89
+ if (e.stack) mayLog(e.stack);
155
90
  res.send({ error: e?.message || e });
156
91
  }
157
92
  };
158
93
  }
94
/**
 * Send a single-turn chat prompt to an Ollama server and return the model's reply text.
 * @param {string} prompt - user prompt, forwarded as a single "user" message
 * @param {string} model_name - Ollama model identifier
 * @param {string} apilink - base URL of the Ollama server (the /api/chat path is appended here)
 * @returns {Promise<string>} the completion text from result.message.content
 * @throws {Error} on request failure, timeout, or an unexpected response shape
 */
async function call_ollama(prompt, model_name, apilink) {
	const temperature = 0.01;
	const top_p = 0.95;
	const timeout = 2e5; // request timeout in ms (200s)
	const payload = {
		model: model_name,
		messages: [{ role: "user", content: prompt }],
		raw: false,
		stream: false,
		keep_alive: 15,
		// NOTE(review): intent is to keep the LLM loaded for 15 mins, but Ollama
		// interprets a bare number as seconds — confirm; "15m" is the explicit form
		options: {
			top_p,
			temperature,
			num_ctx: 1e4
		}
	};
	try {
		const result = await ezFetch(apilink + "/api/chat", {
			method: "POST",
			body: payload,
			// ezfetch automatically stringifies objects
			headers: { "Content-Type": "application/json" },
			timeout: { request: timeout }
			// ezfetch accepts milliseconds directly
		});
		if (result && result.message && result.message.content && result.message.content.length > 0)
			return result.message.content;
		else {
			// throw an Error (not a string) so callers get a stack trace; stringify the
			// response so the message is not the useless "[object Object]"
			throw new Error("Error: Received an unexpected response format:" + JSON.stringify(result));
		}
	} catch (error) {
		throw new Error("Ollama API request failed:" + error, { cause: error });
	}
}
129
/**
 * Send a completion request to the St. Jude LLM gateway and return the generated text.
 * @param {string} prompt - full prompt text
 * @param {string} model_name - completion model identifier
 * @param {string} apilink - full URL of the SJ inference endpoint
 * @returns {Promise<string>} response.outputs[0].generated_text
 * @throws {Error} on request failure, timeout, or an unexpected response shape
 */
async function call_sj_llm(prompt, model_name, apilink) {
	const temperature = 0.01;
	const top_p = 0.95;
	const timeout = 2e5; // request timeout in ms (200s)
	const max_new_tokens = 512;
	const payload = {
		inputs: [
			{
				model_name,
				inputs: {
					text: prompt,
					max_new_tokens,
					temperature,
					top_p
				}
			}
		]
	};
	try {
		const response = await ezFetch(apilink, {
			method: "POST",
			body: payload,
			// ezfetch automatically stringifies objects
			headers: { "Content-Type": "application/json" },
			timeout: { request: timeout }
			// ezfetch accepts milliseconds directly
		});
		if (response.outputs && response.outputs[0] && response.outputs[0].generated_text) {
			return response.outputs[0].generated_text;
		} else {
			// throw an Error (not a string) so callers get a stack trace; stringify the
			// response so the message is not the useless "[object Object]"
			throw new Error("Error: Received an unexpected response format:" + JSON.stringify(response));
		}
	} catch (error) {
		throw new Error("SJ API request failed:" + error, { cause: error });
	}
}
166
/**
 * Read a file from disk and return its contents parsed as JSON.
 * @param {string} file - path to a JSON file
 * @returns {Promise<any>} the parsed JSON value
 */
async function readJSONFile(file) {
	const raw = await fs.promises.readFile(file, "utf8");
	return JSON.parse(raw);
}
170
/**
 * Classify a user query into a chart route by asking the LLM.
 * Builds a prompt from the general route file (aiRoute) plus the dataset's
 * "Classification" chart entry (system prompt and few-shot examples).
 * @param {string} user_prompt - raw user question
 * @param {string} comp_model_name - completion model identifier
 * @param {string} llm_backend_type - "SJ" or "ollama"
 * @param {string} apilink - LLM endpoint URL
 * @param {string} aiRoute - path to the JSON route-definition file
 * @param {object} dataset_json - parsed dataset ai-file with a charts[] array
 * @returns {Promise<object>} the LLM's answer parsed as JSON
 */
async function classify_query_by_dataset_type(user_prompt, comp_model_name, llm_backend_type, apilink, aiRoute, dataset_json) {
	const data = await readJSONFile(aiRoute);
	// start from the "general" section, then append every other section
	let contents = data["general"];
	for (const key of Object.keys(data)) {
		if (key != "general") {
			contents += data[key];
		}
	}
	const classification_ds = dataset_json.charts.filter((chart) => chart.type == "Classification");
	let train_iter = 0;
	let training_data = "";
	if (classification_ds.length > 0 && classification_ds[0].TrainingData.length > 0) {
		// bugfix: was classification_ds.SystemPrompt — reading a property off the
		// array itself is always undefined and appended the literal "undefined"
		if (classification_ds[0].SystemPrompt) contents += classification_ds[0].SystemPrompt;
		for (const train_data of classification_ds[0].TrainingData) {
			train_iter += 1;
			training_data += "Example question" + train_iter.toString() + ": " + train_data.question + " Example answer" + train_iter.toString() + ":" + JSON.stringify(train_data.answer) + " ";
		}
	}
	const template = contents + " training data is as follows:" + training_data + " Question: {" + user_prompt + "} Answer: {answer}";
	let response;
	if (llm_backend_type == "SJ") {
		response = await call_sj_llm(template, comp_model_name, apilink);
	} else if (llm_backend_type == "ollama") {
		response = await call_ollama(template, comp_model_name, apilink);
	} else {
		throw "Unknown LLM backend";
	}
	mayLog("response:", response);
	return JSON.parse(response);
}
200
/**
 * Summary agent: ask the LLM to extract summary-plot terms (term/term2/simpleFilter)
 * from the user's prompt, grounded by RAG docs from the dataset db, the gene db,
 * and the dataset's "Summary" training examples; then validate the answer.
 * @param {string} prompt - raw user question
 * @param {string} llm_backend_type - "SJ" or "ollama"
 * @param {string} comp_model_name - completion model identifier
 * @param {string} apilink - LLM endpoint URL
 * @param {string} dataset_db - absolute path to the dataset sqlite3 db
 * @param {object} dataset_json - parsed dataset ai-file with a charts[] array
 * @param {string} genedb - absolute path to the gene sqlite3 db
 * @param {object} ds - server-side dataset object (termdb lookups)
 * @returns {Promise<object>} validated plot payload or an html error message
 */
async function extract_summary_terms(prompt, llm_backend_type, comp_model_name, apilink, dataset_db, dataset_json, genedb, ds) {
	const rag_docs = await parse_dataset_db(dataset_db);
	const genes_list = await parse_geneset_db(genedb);
	const StringifiedSchema = '{"$schema":"http://json-schema.org/draft-07/schema#","$ref":"#/definitions/SummaryType","definitions":{"SummaryType":{"type":"object","properties":{"term":{"type":"string"},"term2":{"type":"string"},"simpleFilter":{"type":"array","items":{"$ref":"#/definitions/FilterTerm"}}},"required":["term","simpleFilter"],"additionalProperties":false},"FilterTerm":{"anyOf":[{"$ref":"#/definitions/CategoricalFilterTerm"},{"$ref":"#/definitions/NumericFilterTerm"}]},"CategoricalFilterTerm":{"type":"object","properties":{"term":{"type":"string"},"category":{"type":"string"}},"required":["term","category"],"additionalProperties":false},"NumericFilterTerm":{"type":"object","properties":{"term":{"type":"string"},"start":{"type":"number"},"stop":{"type":"number"}},"required":["term"],"additionalProperties":false}}}';
	// lowercase alphanumeric words of the prompt, used to spot gene symbols
	const words = prompt.replace(/[^a-zA-Z0-9\s]/g, "").split(/\s+/).map((str) => str.toLowerCase());
	const common_genes = words.filter((item) => genes_list.includes(item));
	const summary_ds = dataset_json.charts.filter((chart) => chart.type == "Summary");
	if (summary_ds.length == 0) throw "summary information not present in dataset file";
	if (summary_ds[0].TrainingData.length == 0) throw "no training data provided for summary agent";
	let train_iter = 0;
	let training_data = "";
	for (const train_data of summary_ds[0].TrainingData) {
		train_iter += 1;
		training_data += "Example question" + train_iter.toString() + ": " + train_data.question + " Example answer" + train_iter.toString() + ":" + JSON.stringify(train_data.answer) + " ";
	}
	// bugfix: was summary_ds.SystemPrompt — reading a property off the array itself
	// is always undefined and spliced the literal "undefined" into the prompt
	let system_prompt = "I am an assistant that extracts the summary terms from user query. The final output must be in the following JSON format with NO extra comments. The JSON schema is as follows: " + StringifiedSchema + ' term and term2 (if present) should ONLY contain names of the fields from the sqlite db. The "simpleFilter" field is optional and should contain an array of JSON terms with which the dataset will be filtered. A variable simultaneously CANNOT be part of both "term"/"term2" and "simpleFilter". There are two kinds of filter variables: "Categorical" and "Numeric". "Categorical" variables are those variables which can have a fixed set of values e.g. gender, race. They are defined by the "CategoricalFilterTerm" which consists of "term" (a field from the sqlite3 db) and "category" (a value of the field from the sqlite db). "Numeric" variables are those which can have any numeric value. They are defined by "NumericFilterTerm" and contain the subfields "term" (a field from the sqlite3 db), "start" an optional filter which is defined when a lower cutoff is defined in the user input for the numeric variable and "stop" an optional filter which is defined when a higher cutoff is defined in the user input for the numeric variable. ' + (summary_ds[0].SystemPrompt ?? "") + rag_docs.join(",") + " training data is as follows:" + training_data;
	if (dataset_json.hasGeneExpression) {
		if (common_genes.length > 0) {
			system_prompt += "\n List of relevant genes are as follows (separated by comma(,)):" + common_genes.join(",");
		}
	}
	system_prompt += " Question: {" + prompt + "} answer:";
	let response;
	if (llm_backend_type == "SJ") {
		response = await call_sj_llm(system_prompt, comp_model_name, apilink);
	} else if (llm_backend_type == "ollama") {
		response = await call_ollama(system_prompt, comp_model_name, apilink);
	} else {
		throw "Unknown LLM backend";
	}
	return validate_summary_response(response, common_genes, dataset_json, ds);
}
232
/**
 * Validate the summary-agent LLM output and convert it into either a
 * ProteinPaint summary-plot payload ({ type:"plot", plot }) or, when anything
 * fails validation, an html error payload ({ type:"html", html }) carrying the
 * accumulated error text.
 * @param {string} response - raw LLM reply (JSON text)
 * @param {string[]} common_genes - lowercased gene symbols found in the prompt
 * @param {object} dataset_json - parsed dataset ai-file
 * @param {object} ds - server-side dataset object (termdb lookups)
 */
function validate_summary_response(response, common_genes, dataset_json, ds) {
	const parsed = JSON.parse(response);
	const plot = { chartType: "summary" };
	// accumulate every validation problem; any html at the end wins over the plot
	let html = parsed.html ? parsed.html : "";
	if (!parsed.term) html += "term type is not present in summary output";
	const t1 = validate_term(parsed.term, common_genes, dataset_json, ds);
	if (t1.html.length > 0) {
		html += t1.html;
	} else {
		plot.term = t1.term_type;
	}
	if (parsed.term2) {
		const t2 = validate_term(parsed.term2, common_genes, dataset_json, ds);
		if (t2.html.length > 0) {
			html += t2.html;
		} else {
			plot.term2 = t2.term_type;
		}
	}
	if (parsed.simpleFilter && parsed.simpleFilter.length > 0) {
		const fv = validate_filter(parsed.simpleFilter, ds);
		if (fv.html.length > 0) {
			html += fv.html;
		} else {
			plot.filter = fv.simplefilter;
		}
	}
	return html.length > 0 ? { type: "html", html } : { type: "plot", plot };
}
266
/**
 * Resolve one term name from the LLM output. A dictionary term found in the
 * termdb wins; otherwise the name is checked against the gene symbols spotted
 * in the prompt and turned into a geneExpression term (if the dataset has
 * gene expression data).
 * @returns {{term_type: (object|undefined), html: string}} term_type is set on
 *   success; html carries the error text and is empty on success
 */
function validate_term(response_term, common_genes, dataset_json, ds) {
	let html = "";
	let term_type;
	const dictTerm = ds.cohort.termdb.q.termjsonByOneid(response_term);
	if (dictTerm) {
		term_type = { id: dictTerm.id };
	} else if (!common_genes.some((gene) => gene === response_term.toLowerCase())) {
		html += "invalid term id:" + response_term;
	} else if (dataset_json.hasGeneExpression) {
		term_type = { term: { gene: response_term.toUpperCase(), type: "geneExpression" } };
	} else {
		html += "Dataset does not support gene expression";
	}
	return { term_type, html };
}
286
/**
 * Convert the LLM "simpleFilter" array into a ProteinPaint tvslst filter.
 * Each entry references a termdb term by id; categorical entries match a
 * category key or label, numeric entries define an optional start/stop range.
 * @param {Array<object>} filters - simpleFilter entries from the LLM
 * @param {object} ds - server-side dataset object (termdb lookups)
 * @returns {{simplefilter: object, html: string}} html is non-empty when any
 *   entry failed validation (the caller then discards the filter)
 */
function validate_filter(filters, ds) {
	if (!Array.isArray(filters)) throw "filter is not array";
	let invalid_html = "";
	const localfilter = { type: "tvslst", in: true, join: "", lst: [] };
	if (filters.length > 1) localfilter.join = "and";
	for (const f of filters) {
		const term = ds.cohort.termdb.q.termjsonByOneid(f.term);
		if (!term) {
			invalid_html += "invalid filter id:" + f.term;
			continue;
		}
		if (term.type == "categorical") {
			// accept either the category key or its display label
			let cat;
			for (const ck in term.values) {
				if (ck == f.category) cat = ck;
				else if (term.values[ck].label == f.category) cat = ck;
			}
			if (!cat) invalid_html += "invalid category from " + JSON.stringify(f);
			// the tvs is still pushed on an invalid category; the caller discards the
			// whole filter whenever invalid_html is non-empty
			localfilter.lst.push({
				type: "tvs",
				tvs: {
					term,
					values: [{ key: cat }]
				}
			});
		} else if (term.type == "float" || term.type == "integer") {
			const numeric = {
				type: "tvs",
				tvs: {
					term,
					ranges: []
				}
			};
			const range = {};
			// bugfix: use explicit null checks so a legitimate cutoff of 0 is not
			// treated as "absent" (0 is falsy, so the old `f.start && ...` dropped it)
			const hasStart = f.start != null;
			const hasStop = f.stop != null;
			if (hasStart && !hasStop) {
				range.start = Number(f.start);
				range.stopunbounded = true;
			} else if (hasStop && !hasStart) {
				range.stop = Number(f.stop);
				range.startunbounded = true;
			} else if (hasStart && hasStop) {
				range.start = Number(f.start);
				range.stop = Number(f.stop);
			} else {
				invalid_html += "Neither greater or lesser defined";
			}
			numeric.tvs.ranges.push(range);
			localfilter.lst.push(numeric);
		}
	}
	return { simplefilter: localfilter, html: invalid_html };
}
338
/**
 * Load all coding-gene symbols from the gene database, lowercased.
 * @param {string} genedb - path to the sqlite3 gene database
 * @returns {Promise<string[]>} lowercased gene names from the codingGenes table
 */
async function parse_geneset_db(genedb) {
	const db = new Database(genedb);
	try {
		const rows = db.prepare("SELECT name from codingGenes").all();
		return rows.map((row) => row.name.toLowerCase());
	} catch (error) {
		throw "Could not parse geneDB" + error;
	} finally {
		// always release the sqlite handle, even on error
		db.close();
	}
}
354
/**
 * Build RAG documents describing each annotated term in the dataset db.
 * Only terms that have an entry in termhtmldef (i.e. a human-readable
 * description) are included; each is flattened into a prompt-ready string
 * via parse_db_rows().
 * @param {string} dataset_db - path to the dataset sqlite3 database
 * @returns {Promise<string[]>} one stringified doc per described term
 */
async function parse_dataset_db(dataset_db) {
	const db = new Database(dataset_db);
	const rag_docs = [];
	try {
		// map term id -> description text taken from termhtmldef.jsonhtml
		const desc_rows = db.prepare("SELECT * from termhtmldef").all();
		const description_map = [];
		desc_rows.forEach((row) => {
			const jsonhtml = JSON.parse(row.jsonhtml);
			description_map.push({ name: row.id, description: jsonhtml.description[0].value });
		});
		const term_db_rows = db.prepare("SELECT * from terms").all();
		term_db_rows.forEach((row) => {
			const found = description_map.find((item) => item.name === row.id);
			if (!found) return; // skip terms without a description
			const jsondata = JSON.parse(row.jsondata);
			// collect categorical key/value pairs, if any
			const values = [];
			if (jsondata.values && Object.keys(jsondata.values).length > 0) {
				for (const key of Object.keys(jsondata.values)) {
					values.push({ key, value: jsondata.values[key] });
				}
			}
			// was: re-filtering description_map for the same id; reuse `found`
			// (also dropped the unused db_rows accumulator)
			const db_row = {
				name: row.id,
				description: found.description,
				values,
				term_type: row.type
			};
			rag_docs.push(parse_db_rows(db_row));
		});
	} catch (error) {
		throw "Error in parsing dataset DB:" + error;
	} finally {
		// always release the sqlite handle, even on error
		db.close();
	}
	return rag_docs;
}
400
+ function parse_db_rows(db_row) {
401
+ let output_string = "Name of the field is:" + db_row.name + ". This field is of the type:" + db_row.term_type + ". Description: " + db_row.description;
402
+ if (db_row.values.length > 0) {
403
+ output_string += "This field contains the following possible values.";
404
+ for (const value of db_row.values) {
405
+ if (value.value && value.value.label) {
406
+ output_string += "The key is " + value.key + " and the label is " + value.value.label + ".";
407
+ }
408
+ }
409
+ }
410
+ return output_string;
411
+ }
159
412
  export {
160
413
  api
161
414
  };