npm - @sjcrh/proteinpaint-rust - Versions diffs - 2.166.0 → 2.169.0 - Mend

@sjcrh/proteinpaint-rust 2.166.0 → 2.169.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json CHANGED

@@ -1,5 +1,5 @@
 {
-	"version": "2.166.0",
+	"version": "2.169.0",
 	"name": "@sjcrh/proteinpaint-rust",
 	"type": "module",
 	"description": "Rust-based utilities for proteinpaint",

package/src/aichatbot.rs CHANGED

@@ -30,21 +30,51 @@ pub struct AiJsonFormat {
 #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
 enum Charts {
     // More chart types will be added here later
-    Summary(TrainTestData),
-    DE(TrainTestData),
+    Summary(TrainTestDataSummary),
+    DE(TrainTestDataDE),
 }
 #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
-struct TrainTestData {
+struct TrainTestDataSummary {
     SystemPrompt: String,
-    TrainingData: Vec<QuestionAnswer>,
-    TestData: Vec<QuestionAnswer>,
+    TrainingData: Vec<QuestionAnswerSummary>,
+    TestData: Vec<QuestionAnswerSummary>,
 }
 #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
-struct QuestionAnswer {
+struct QuestionAnswerSummary {
     question: String,
-    answer: String,
+    answer: SummaryType,
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct TrainTestDataDE {
+    SystemPrompt: String,
+    TrainingData: Vec<QuestionAnswerDE>,
+    TestData: Vec<QuestionAnswerDE>,
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct QuestionAnswerDE {
+    question: String,
+    answer: DEType,
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct DEType {
+    action: String,
+    DE_output: DETerms,
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct DETerms {
+    group1: GroupType,
+    group2: GroupType,
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct GroupType {
+    name: String,
 }
 #[allow(non_camel_case_types)]
@@ -77,6 +107,27 @@ async fn main() -> Result<()> {
                         None => panic!("user_input field is missing in input json"),
                     }
+                    let dataset_db_json: &JsonValue = &json_string["dataset_db"];
+                    let dataset_db_str: &str;
+                    match dataset_db_json.as_str() {
+                        Some(inp) => dataset_db_str = inp,
+                        None => panic!("dataset_db field is missing in input json"),
+                    }
+                    let genedb_json: &JsonValue = &json_string["genedb"];
+                    let genedb_str: &str;
+                    match genedb_json.as_str() {
+                        Some(inp) => genedb_str = inp,
+                        None => panic!("genedb field is missing in input json"),
+                    }
+                    let aiRoute_json: &JsonValue = &json_string["aiRoute"];
+                    let aiRoute_str: &str;
+                    match aiRoute_json.as_str() {
+                        Some(inp) => aiRoute_str = inp,
+                        None => panic!("aiRoute field is missing in input json"),
+                    }
                     if user_input.len() == 0 {
                         panic!("The user input is empty");
                     }
@@ -124,8 +175,9 @@ async fn main() -> Result<()> {
                     let ai_json: AiJsonFormat =
                         serde_json::from_str(&ai_data).expect("AI JSON file does not have the correct format");
-                    let genedb = String::from(tpmasterdir) + &"/" + &ai_json.genedb;
-                    let dataset_db = String::from(tpmasterdir) + &"/" + &ai_json.db;
+                    let genedb = String::from(tpmasterdir) + &"/" + &genedb_str;
+                    let dataset_db = String::from(tpmasterdir) + &"/" + &dataset_db_str;
+                    let airoute = String::from(binpath) + &"/../../" + &aiRoute_str;
                     let apilink_json: &JsonValue = &json_string["apilink"];
                     let apilink: &str;
@@ -160,7 +212,7 @@ async fn main() -> Result<()> {
                     let temperature: f64 = 0.01;
                     let max_new_tokens: usize = 512;
                     let top_p: f32 = 0.95;
+                    let testing = false; // This variable is always false in production, this is true in test_ai.rs for testing code
                     if llm_backend_name != "ollama" && llm_backend_name != "SJ" {
                         panic!(
                             "This code currently supports only Ollama and SJ provider. llm_backend_name must be \"ollama\" or \"SJ\""
@@ -185,6 +237,8 @@ async fn main() -> Result<()> {
                             &dataset_db,
                             &genedb,
                             &ai_json,
+                            &airoute,
+                            testing,
                         )
                         .await;
                     } else if llm_backend_name == "SJ".to_string() {
@@ -207,6 +261,8 @@ async fn main() -> Result<()> {
                             &dataset_db,
                             &genedb,
                             &ai_json,
+                            &airoute,
+                            testing,
                         )
                         .await;
                     }
@@ -239,6 +295,8 @@ pub async fn run_pipeline(
     dataset_db: &str,
     genedb: &str,
     ai_json: &AiJsonFormat,
+    ai_route: &str,
+    testing: bool,
 ) -> Option<String> {
     let mut classification: String = classify_query_by_dataset_type(
         user_input,
@@ -248,6 +306,7 @@ pub async fn run_pipeline(
         temperature,
         max_new_tokens,
         top_p,
+        ai_route,
     )
     .await;
     classification = classification.replace("\"", "");
@@ -263,13 +322,20 @@ pub async fn run_pipeline(
             top_p,
         )
         .await;
-        final_output = format!(
-            "{{\"{}\":\"{}\",\"{}\":[{}}}",
-            "action",
-            "dge",
-            "DE_output",
-            de_result + &"]"
-        );
+        if testing == true {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":[{}}}",
+                "action",
+                "dge",
+                "DE_output",
+                de_result + &"]"
+            );
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "DE agent not implemented yet"
+            );
+        }
     } else if classification == "summary".to_string() {
         final_output = extract_summary_information(
             user_input,
@@ -282,30 +348,83 @@ pub async fn run_pipeline(
             dataset_db,
             genedb,
             ai_json,
+            testing,
         )
         .await;
     } else if classification == "hierarchical".to_string() {
         // Not implemented yet
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "hierarchical");
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "hierarchical");
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "hierarchical clustering agent not implemented yet"
+            );
+        }
     } else if classification == "snv_indel".to_string() {
         // Not implemented yet
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "snv_indel");
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "snv_indel");
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "snv_indel agent not implemented yet"
+            );
+        }
     } else if classification == "cnv".to_string() {
         // Not implemented yet
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "cnv");
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "cnv");
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "cnv agent not implemented yet"
+            );
+        }
     } else if classification == "variant_calling".to_string() {
         // Not implemented yet and will never be supported. Need a separate messages for this
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "variant_calling");
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "variant_calling");
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "variant_calling agent not implemented yet"
+            );
+        }
     } else if classification == "survival".to_string() {
         // Not implemented yet
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "surivial");
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "surivial");
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "survival agent not implemented yet"
+            );
+        }
     } else if classification == "none".to_string() {
-        final_output = format!(
-            "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
-            "action", "none", "message", "The input query did not match any known features in Proteinpaint"
-        );
+        if testing == true {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "action", "none", "message", "The input query did not match any known features in Proteinpaint"
+            );
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type", "html", "html", "The input query did not match any known features in Proteinpaint"
+            );
+        }
     } else {
-        final_output = format!("{{\"{}\":\"{}\"}}", "action", "unknown:".to_string() + &classification);
+        if testing == true {
+            final_output = format!("{{\"{}\":\"{}\"}}", "action", "unknown:".to_string() + &classification);
+        } else {
+            final_output = format!(
+                "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
+                "type",
+                "html",
+                "html",
+                "unknown:".to_string() + &classification
+            );
+        }
     }
     Some(final_output)
 }
@@ -313,101 +432,33 @@ pub async fn run_pipeline(
 async fn classify_query_by_dataset_type(
     user_input: &str,
     comp_model: impl rig::completion::CompletionModel + 'static,
-    embedding_model: impl rig::embeddings::EmbeddingModel + 'static,
+    _embedding_model: impl rig::embeddings::EmbeddingModel + 'static,
     llm_backend_type: &llm_backend,
     temperature: f64,
     max_new_tokens: usize,
     top_p: f32,
+    ai_route: &str,
 ) -> String {
-    // Create a string to hold the file contents
-    let contents = String::from("SNV/SNP or point mutations nucleotide mutations are very common forms of mutations which can often give rise to genetic diseases such as cancer, Alzheimer's disease etc. They can be duw to substitution of nucleotide, or insertion or deletion of a nucleotide. Indels are multi-nucleotide insertion/deletion/substitutions. Complex indels are indels where insertion and deletion have happened in the same genomic locus. Every genomic sample from each patient has its own set of mutations therefore requiring personalized treatment.
-If a ProteinPaint dataset contains SNV/Indel/SV data then return JSON with single key, 'snv_indel'.
----
-Copy number variation (CNV) is a phenomenon in which sections of the genome are repeated and the number of repeats in the genome varies between individuals.[1] Copy number variation is a special type of structural variation: specifically, it is a type of duplication or deletion event that affects a considerable number of base pairs.
-If a ProteinPaint dataset contains copy number variation data then return JSON with single key, 'cnv'.
----
-Structural variants/fusions (SV) are genomic mutations when eith a DNA region is translocated or copied to an entirely different genomic locus. In case of transcriptomic data, when RNA is fused from two different genes its called a gene fusion.
-If a ProteinPaint dataset contains structural variation or gene fusion data then return JSON with single key, 'sv_fusion'.
----
-Hierarchical clustering of gene expression is an unsupervised learning technique where several number of relevant genes and the samples are clustered so as to determine (previously unknown) cohorts of samples (or patients) or structure in data. It is very commonly used to determine subtypes of a particular disease based on RNA sequencing data.
-If a ProteinPaint dataset contains hierarchical data then return JSON with single key, 'hierarchical'.
----
-Differential Gene Expression (DGE or DE) is a technique where the most upregulated (or highest) and downregulated (or lowest) genes between two cohorts of samples (or patients) are determined from a pool of THOUSANDS of genes. Differential gene expression CANNOT be computed for a SINGLE gene. A volcano plot is shown with fold-change in the x-axis and adjusted p-value on the y-axis. So, the upregulated and downregulared genes are on opposite sides of the graph and the most significant genes (based on adjusted p-value) is on the top of the graph. Following differential gene expression generally GeneSet Enrichment Analysis (GSEA) is carried out where based on the genes and their corresponding fold changes the upregulation/downregulation of genesets (or pathways) is determined.
-Sample Query1: \"Which gene has the highest expression between the two genders\"
-Sample Answer1: { \"answer\": \"dge\" }
-Sample Query2: \"Which gene has the lowest expression between the two races\"
-Sample Answer2: { \"answer\": \"dge\" }
-Sample Query1: \"Which genes are the most upregulated genes between group A and group B\"
-Sample Answer1: { \"answer\": \"dge\" }
-Sample Query3: \"Which gene are overexpressed between male and female\"
-Sample Answer3: { \"answer\": \"dge\" }
-Sample Query4: \"Which gene are housekeeping genes between male and female\"
-Sample Answer4: { \"answer\": \"dge\" }
-If a ProteinPaint dataset contains differential gene expression data then return JSON with single key, 'dge'.
----
+    // Read the file
+    let ai_route_data = fs::read_to_string(ai_route).unwrap();
-Survival analysis (also called time-to-event analysis or duration analysis) is a branch of statistics aimed at analyzing the duration of time from a well-defined time origin until one or more events happen, called survival times or duration times. In other words, in survival analysis, we are interested in a certain event and want to analyze the time until the event happens. Generally in survival analysis survival rates between two (or more) cohorts of patients  is compared.
+    // Parse the JSON data
+    let ai_json: Value = serde_json::from_str(&ai_route_data).expect("AI JSON file does not have the correct format");
-There are two main methods of survival analysis:
-1) Kaplan-Meier (HM) analysis is a univariate test that only takes into account a single categorical variable.
-2) Cox proportional hazards model (coxph) is a multivariate test that can take into account multiple variables.
-   The hazard ratio (HR) is an indicator of the effect of the stimulus (e.g. drug dose, treatment) between two cohorts of patients.
-   HR = 1: No effect
-   HR < 1: Reduction in the hazard
-   HR > 1: Increase in Hazard
-Sample Query1: \"Compare survival rates between group A and B\"
-Sample Answer1: { \"answer\": \"survival\" }
-If a ProteinPaint dataset contains survival data then return JSON with single key, 'survival'.
----
-Next generation sequencing reads (NGS) are mapped to a human genome using alignment algorithm such as burrows-wheelers alignment algorithm. Then these reads are called using variant calling algorithms such as GATK (Genome Analysis Toolkit). However this type of analysis is too compute intensive and beyond the scope of visualization software such as ProteinPaint.
-If a user query asks about variant calling or mapping reads then JSON with single key, 'variant_calling'.
----
-Summary plot in ProteinPaint shows the various facets of the datasets. Show expression of a SINGLE gene or compare the expression of a SINGLE gene across two different cohorts defined by the user. It may show all the samples according to their respective diagnosis or subtypes of cancer. It is also useful for comparing and correlating different clinical variables. It can show all possible distributions, frequency of a category, overlay, correlate or cross-tabulate with another variable on top of it. If a user query asks about a SINGLE gene expression or correlating clinical variables then return JSON with single key, 'summary'.
-Sample Query1: \"Show all fusions for patients with age less than 30\"
-Sample Answer1: { \"answer\": \"summary\" }
-Sample Query2: \"List all molecular subtypes of leukemia\"
-Sample Answer2: { \"answer\": \"summary\" }
-Sample Query3: \"is tp53 expression higher in men than women ?\"
-Sample Answer3: { \"answer\": \"summary\" }
-Sample Query4: \"Compare ATM expression between races for women greater than 80yrs\"
-Sample Answer4: { \"answer\": \"summary\" }
+    // Create a string to hold the file contents
+    let mut contents = String::from("");
+    if let Some(object) = ai_json.as_object() {
+        for (_key, value) in object {
+            contents += &value.as_str().unwrap();
+            contents += "---"; // Adding delimiter
+        }
+    }
-If a query does not match any of the fields described above, then return JSON with single key, 'none'
-");
+    // Removing the last "---" characters
+    contents.pop();
+    contents.pop();
+    contents.pop();
     // Split the contents by the delimiter "---"
     let parts: Vec<&str> = contents.split("---").collect();
@@ -438,18 +489,18 @@ If a query does not match any of the fields described above, then return JSON wi
         rag_docs.push(part.trim().to_string())
     }
-    //let top_k: usize = 3;
+    //let top_k: usize = 3; // Embedding model not used currently
     // Create embeddings and add to vector store
-    let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
-        .documents(rag_docs)
-        .expect("Reason1")
-        .build()
-        .await
-        .unwrap();
+    //let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
+    //    .documents(rag_docs)
+    //    .expect("Reason1")
+    //    .build()
+    //    .await
+    //    .unwrap();
-    // Create vector store
-    let mut vector_store = InMemoryVectorStore::<String>::default();
-    InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
+    //// Create vector store
+    //let mut vector_store = InMemoryVectorStore::<String>::default();
+    //InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
     // Create RAG agent
     let agent = AgentBuilder::new(comp_model).preamble(&(String::from("Generate classification for the user query into summary, dge, hierarchical, snv_indel, cnv, variant_calling, sv_fusion and none categories. Return output in JSON with ALWAYS a single word answer { \"answer\": \"dge\" }, that is 'summary' for summary plot, 'dge' for differential gene expression, 'hierarchical' for hierarchical clustering, 'snv_indel' for SNV/Indel, 'cnv' for CNV and 'sv_fusion' for SV/fusion, 'variant_calling' for variant calling, 'surivial' for survival data, 'none' for none of the previously described categories. The summary plot list and summarizes the cohort of patients according to the user query. The answer should always be in lower case\n The options are as follows:\n") + &contents + "\nQuestion= {question} \nanswer")).temperature(temperature).additional_params(additional).build();
@@ -801,6 +852,7 @@ async fn extract_summary_information(
     dataset_db: &str,
     genedb: &str,
     ai_json: &AiJsonFormat,
+    testing: bool,
 ) -> String {
     let (rag_docs, db_vec) = parse_dataset_db(dataset_db).await;
     let additional;
@@ -845,7 +897,7 @@ async fn extract_summary_information(
         .filter(|x| user_words2.contains(&x.to_lowercase()))
         .collect();
-    let mut summary_data_check: Option<TrainTestData> = None;
+    let mut summary_data_check: Option<TrainTestDataSummary> = None;
     for chart in ai_json.charts.clone() {
         if let Charts::Summary(traindata) = chart {
             summary_data_check = Some(traindata);
@@ -858,6 +910,7 @@ async fn extract_summary_information(
             let mut training_data: String = String::from("");
             let mut train_iter = 0;
             for ques_ans in summary_data.TrainingData {
+                let summary_answer: SummaryType = ques_ans.answer;
                 train_iter += 1;
                 training_data += "Example question";
                 training_data += &train_iter.to_string();
@@ -867,7 +920,7 @@ async fn extract_summary_information(
                 training_data += "Example answer";
                 training_data += &train_iter.to_string();
                 training_data += &":";
-                training_data += &ques_ans.answer;
+                training_data += &serde_json::to_string(&summary_answer).unwrap();
                 training_data += &"\n";
             }
@@ -919,7 +972,8 @@ async fn extract_summary_information(
                 }
             }
             //println!("final_llm_json:{}", final_llm_json);
-            let final_validated_json = validate_summary_output(final_llm_json.clone(), db_vec, common_genes, ai_json);
+            let final_validated_json =
+                validate_summary_output(final_llm_json.clone(), db_vec, common_genes, ai_json, testing);
             final_validated_json
         }
         None => {
@@ -949,7 +1003,7 @@ struct SummaryType {
 impl SummaryType {
     #[allow(dead_code)]
-    pub fn sort_summarytype_struct(&mut self) {
+    pub fn sort_summarytype_struct(mut self) -> SummaryType {
         // This function is necessary for testing (test_ai.rs) to see if two variables of type "SummaryType" are equal or not. Without this a vector of two Summarytype holding the same values but in different order will be classified separately.
         self.summaryterms.sort();
@@ -957,6 +1011,7 @@ impl SummaryType {
             Some(ref mut filterterms) => filterterms.sort(),
             None => {}
         }
+        self.clone()
     }
 }
@@ -974,7 +1029,7 @@ impl PartialOrd for SummaryTerms {
             (SummaryTerms::clinical(_), SummaryTerms::clinical(_)) => Some(std::cmp::Ordering::Equal),
             (SummaryTerms::geneExpression(_), SummaryTerms::geneExpression(_)) => Some(std::cmp::Ordering::Equal),
             (SummaryTerms::clinical(_), SummaryTerms::geneExpression(_)) => Some(std::cmp::Ordering::Greater),
-            (SummaryTerms::geneExpression(_), SummaryTerms::clinical(_)) => Some(std::cmp::Ordering::Greater),
+            (SummaryTerms::geneExpression(_), SummaryTerms::clinical(_)) => Some(std::cmp::Ordering::Less),
         }
     }
 }
@@ -1063,6 +1118,7 @@ fn validate_summary_output(
     db_vec: Vec<DbRows>,
     common_genes: Vec<String>,
     ai_json: &AiJsonFormat,
+    testing: bool,
 ) -> String {
     let json_value: SummaryType =
        serde_json::from_str(&raw_llm_json).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
@@ -1094,7 +1150,7 @@ fn validate_summary_output(
                     match term_verification.correct_field {
                         Some(tm) => validated_summary_terms.push(SummaryTerms::clinical(tm)),
                         None => {
-                            message = message + &"\"" + &clin + &"\"" + &" not found in db.";
+                            message = message + &"'" + &clin + &"'" + &" not found in db.";
                         }
                     }
                 } else if Some(term_verification.correct_field.clone()).is_some()
@@ -1122,7 +1178,7 @@ fn validate_summary_output(
                         if num_gene_verification == 0 || common_genes.len() == 0 {
                             if message.to_lowercase().contains(&gene.to_lowercase()) { // Check if the LLM has already added the message, if not then add it
                             } else {
-                                message = message + &"\"" + &gene + &"\"" + &" not found in genedb.";
+                                message = message + &"'" + &gene + &"'" + &" not found in genedb.";
                             }
                         }
                     }
@@ -1138,6 +1194,8 @@ fn validate_summary_output(
         }
     }
+    let mut pp_plot_json: Value; // The PP compliant plot JSON
+    pp_plot_json = serde_json::from_str(&"{\"chartType\":\"summary\"}").expect("Not a valid JSON");
     match &json_value.filter {
         Some(filter_terms_array) => {
             let mut validated_filter_terms = Vec::<FilterTerm>::new();
@@ -1168,21 +1226,21 @@ fn validate_summary_output(
                             validated_filter_terms.push(categorical_filter_term);
                         }
                         if term_verification.correct_field.is_none() {
-                            message = message + &"\"" + &categorical.term + &"\" filter term not found in db";
+                            message = message + &"'" + &categorical.term + &"' filter term not found in db";
                         }
                         if value_verification.is_none() {
                             message = message
-                                + &"\""
+                                + &"'"
                                 + &categorical.value
-                                + &"\" filter value not found for filter field \""
+                                + &"' filter value not found for filter field '"
                                 + &categorical.term
-                                + "\" in db";
+                                + "' in db";
                         }
                     }
                     FilterTerm::Numeric(numeric) => {
                         let term_verification = verify_json_field(&numeric.term, &db_vec);
                         if term_verification.correct_field.is_none() {
-                            message = message + &"\"" + &numeric.term + &"\" filter term not found in db";
+                            message = message + &"'" + &numeric.term + &"' filter term not found in db";
                         } else {
                             let numeric_filter_term: FilterTerm = FilterTerm::Numeric(numeric.clone());
                             validated_filter_terms.push(numeric_filter_term);
@@ -1229,8 +1287,68 @@ fn validate_summary_output(
             }
             if validated_filter_terms.len() > 0 {
-                if let Some(obj) = new_json.as_object_mut() {
-                    obj.insert(String::from("filter"), serde_json::json!(validated_filter_terms));
+                if testing == true {
+                    if let Some(obj) = new_json.as_object_mut() {
+                        obj.insert(String::from("filter"), serde_json::json!(validated_filter_terms));
+                    }
+                } else {
+                    let mut validated_filter_terms_PP: String = "[".to_string();
+                    let mut filter_hits = 0;
+                    for validated_term in validated_filter_terms {
+                        match validated_term {
+                            FilterTerm::Categorical(categorical_filter) => {
+                                let string_json = "{\"term\":\"".to_string()
+                                    + &categorical_filter.term
+                                    + &"\", \"category\":\""
+                                    + &categorical_filter.value
+                                    + &"\"},";
+                                validated_filter_terms_PP += &string_json;
+                            }
+                            FilterTerm::Numeric(numeric_filter) => {
+                                let string_json;
+                                if numeric_filter.greaterThan.is_some() && numeric_filter.lessThan.is_none() {
+                                    string_json = "{\"term\":\"".to_string()
+                                        + &numeric_filter.term
+                                        + &"\", \"gt\":\""
+                                        + &numeric_filter.greaterThan.unwrap().to_string()
+                                        + &"\"},";
+                                } else if numeric_filter.greaterThan.is_none() && numeric_filter.lessThan.is_some() {
+                                    string_json = "{\"term\":\"".to_string()
+                                        + &numeric_filter.term
+                                        + &"\", \"lt\":\""
+                                        + &numeric_filter.lessThan.unwrap().to_string()
+                                        + &"\"},";
+                                } else if numeric_filter.greaterThan.is_some() && numeric_filter.lessThan.is_some() {
+                                    string_json = "{\"term\":\"".to_string()
+                                        + &numeric_filter.term
+                                        + &"\", \"lt\":\""
+                                        + &numeric_filter.lessThan.unwrap().to_string()
+                                        + &"\", \"gt\":\""
+                                        + &numeric_filter.greaterThan.unwrap().to_string()
+                                        + &"\"},";
+                                } else {
+                                    // When both greater and less than are none
+                                    panic!(
+                                        "Numeric filter term {} is missing both greater than and less than values. One of them must be defined",
+                                        &numeric_filter.term
+                                    );
+                                }
+                                validated_filter_terms_PP += &string_json;
+                            }
+                        };
+                        filter_hits += 1;
+                    }
+                    println!("validated_filter_terms_PP:{}", validated_filter_terms_PP);
+                    if filter_hits > 0 {
+                        validated_filter_terms_PP.pop();
+                        validated_filter_terms_PP += &"]";
+                        if let Some(obj) = pp_plot_json.as_object_mut() {
+                            obj.insert(
+                                String::from("simpleFilter"),
+                                serde_json::from_str(&validated_filter_terms_PP).expect("Not a valid JSON"),
+                            );
+                        }
+                    }
                 }
             }
         }
@@ -1240,6 +1358,10 @@ fn validate_summary_output(
     // Removing terms that are found both in filter term as well summary
     let mut validated_summary_terms_final = Vec::<SummaryTerms>::new();
+    let mut sum_iter = 0;
+    let mut pp_json: Value; // New JSON value that will contain items of the final PP compliant JSON
+    pp_json = serde_json::from_str(&"{\"type\":\"plot\"}").expect("Not a valid JSON");
     for summary_term in &validated_summary_terms {
         let mut hit = 0;
         match summary_term {
@@ -1276,9 +1398,53 @@ fn validate_summary_output(
                 }
             }
         }
         if hit == 0 {
+            let mut termidpp: Option<TermIDPP> = None;
+            let mut geneexp: Option<GeneExpressionPP> = None;
+            match summary_term {
+                SummaryTerms::clinical(clinical_term) => {
+                    termidpp = Some(TermIDPP {
+                        id: clinical_term.to_string(),
+                    });
+                }
+                SummaryTerms::geneExpression(gene) => {
+                    geneexp = Some(GeneExpressionPP {
+                        gene: gene.to_string(),
+                        r#type: "geneExpression".to_string(),
+                    });
+                }
+            }
+            if sum_iter == 0 {
+                if termidpp.is_some() {
+                    if let Some(obj) = pp_plot_json.as_object_mut() {
+                        obj.insert(String::from("term"), serde_json::json!(Some(termidpp)));
+                    }
+                }
+                if geneexp.is_some() {
+                    let gene_term = GeneTerm { term: geneexp.unwrap() };
+                    if let Some(obj) = pp_plot_json.as_object_mut() {
+                        obj.insert(String::from("term"), serde_json::json!(gene_term));
+                    }
+                }
+            } else if sum_iter == 1 {
+                if termidpp.is_some() {
+                    if let Some(obj) = pp_plot_json.as_object_mut() {
+                        obj.insert(String::from("term2"), serde_json::json!(Some(termidpp)));
+                    }
+                }
+                if geneexp.is_some() {
+                    let gene_term = GeneTerm { term: geneexp.unwrap() };
+                    if let Some(obj) = pp_plot_json.as_object_mut() {
+                        obj.insert(String::from("term2"), serde_json::json!(gene_term));
+                    }
+                }
+            }
             validated_summary_terms_final.push(summary_term.clone())
         }
+        sum_iter += 1
     }
     if let Some(obj) = new_json.as_object_mut() {
@@ -1288,14 +1454,61 @@ fn validate_summary_output(
         );
     }
+    if let Some(obj) = pp_json.as_object_mut() {
+        // The `if let` ensures we only proceed if the top-level JSON is an object.
+        // Append a new string field.
+        obj.insert(String::from("plot"), serde_json::json!(pp_plot_json));
+    }
+    let mut err_json: Value; // Error JSON containing the error message (if present)
     if message.len() > 0 {
-        if let Some(obj) = new_json.as_object_mut() {
-            // The `if let` ensures we only proceed if the top-level JSON is an object.
-            // Append a new string field.
-            obj.insert(String::from("message"), serde_json::json!(message));
+        if testing == false {
+            err_json = serde_json::from_str(&"{\"type\":\"html\"}").expect("Not a valid JSON");
+            if let Some(obj) = err_json.as_object_mut() {
+                // The `if let` ensures we only proceed if the top-level JSON is an object.
+                // Append a new string field.
+                obj.insert(String::from("html"), serde_json::json!(message));
+            };
+            serde_json::to_string(&err_json).unwrap()
+        } else {
+            if let Some(obj) = new_json.as_object_mut() {
+                // The `if let` ensures we only proceed if the top-level JSON is an object.
+                // Append a new string field.
+                obj.insert(String::from("message"), serde_json::json!(message));
+            };
+            serde_json::to_string(&new_json).unwrap()
+        }
+    } else {
+        if testing == true {
+            // When testing script output native LLM JSON
+            serde_json::to_string(&new_json).unwrap()
+        } else {
+            // When in production output PP compliant JSON
+            serde_json::to_string(&pp_json).unwrap()
         }
     }
-    serde_json::to_string(&new_json).unwrap()
+}
+fn getGeneExpression() -> String { // Default-value provider named by the #[serde(default = "getGeneExpression")] attribute on GeneExpressionPP's `type` field.
+    "geneExpression".to_string() // Always returns the fixed tag "geneExpression" as an owned String.
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct TermIDPP { // Term referenced by id only; validate_summary_output builds one per clinical summary term and inserts it under "term" or "term2" of the plot JSON.
+    id: String, // Set from the clinical term's string form (clinical_term.to_string()).
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct GeneTerm { // Wrapper that nests a gene expression descriptor under a `term` field; validate_summary_output inserts it under "term" or "term2" of the plot JSON.
+    term: GeneExpressionPP, // The wrapped gene expression descriptor.
+}
+#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
+struct GeneExpressionPP { // Gene expression term descriptor: a gene plus a `type` tag.
+    gene: String, // Taken from the geneExpression summary term (gene.to_string()).
+    // When `type` is absent from the input being deserialized, serde calls getGeneExpression() to supply it.
+    #[serde(default = "getGeneExpression")]
+    r#type: String, // Raw identifier because `type` is a reserved word in Rust; validate_summary_output sets it to "geneExpression".
 }
 #[derive(Debug, Clone)]

package/src/manhattan_plot.rs CHANGED Viewed

@@ -57,6 +57,7 @@ struct InteractiveData {
     x_buffer: i64,
     y_min: f64,
     y_max: f64,
+    device_pixel_ratio: f64,
 }
 #[derive(Serialize)]
@@ -216,8 +217,10 @@ fn grin2_file_read(
                 Some(q) => q,
                 None => continue,
             };
             let q_val: f64 = match q_val_str.parse() {
                 Ok(v) if v > 0.0 => v,
+                Ok(v) if v == 0.0 => 1e-300, // Treat exact 0 as ~1e-300 so we can still show q-values that are 0 and not filter them out
                 _ => continue,
             };
             let neg_log10_q = -q_val.log10();
@@ -335,12 +338,8 @@ fn plot_grin2_manhattan(
     let png_width = plot_width + 2 * png_dot_radius;
     let png_height = plot_height + 2 * png_dot_radius;
-    let w: u32 = (png_width * device_pixel_ratio as u64)
-        .try_into()
-        .expect("PNG width too large for u32");
-    let h: u32 = (png_height * device_pixel_ratio as u64)
-        .try_into()
-        .expect("PNG height too large for u32");
+    let w: u32 = ((png_width as f64) * dpr) as u32;
+    let h: u32 = ((png_height as f64) * dpr) as u32;
     // Create RGB buffer for Plotters
     let mut buffer = vec![0u8; w as usize * h as usize * 3];
@@ -402,8 +401,8 @@ fn plot_grin2_manhattan(
         for (i, p) in point_details.iter_mut().enumerate() {
             let (px, py) = pixel_positions[*&sig_indices[i]];
-            p.pixel_x = px;
-            p.pixel_y = py;
+            p.pixel_x = px / dpr;
+            p.pixel_y = py / dpr;
         }
         // flush root drawing area
@@ -469,6 +468,7 @@ fn plot_grin2_manhattan(
         x_buffer,
         y_min,
         y_max,
+        device_pixel_ratio: dpr,
     };
     Ok((png_data, interactive_data))
 }

package/src/test_ai.rs CHANGED Viewed

@@ -20,6 +20,7 @@ mod tests {
         ollama_comp_model_name: String,
         ollama_embedding_model_name: String,
         genomes: Vec<Genomes>,
+        aiRoute: String,
     }
     #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
@@ -42,13 +43,14 @@ mod tests {
         let top_p: f32 = 0.95;
         let serverconfig_file_path = Path::new("../../serverconfig.json");
         let absolute_path = serverconfig_file_path.canonicalize().unwrap();
+        let testing = true; // This causes the JSON being output from run_pipeline() to be in LLM JSON format
         // Read the file
         let data = fs::read_to_string(absolute_path).unwrap();
         // Parse the JSON data
         let serverconfig: ServerConfig = serde_json::from_str(&data).expect("JSON not in serverconfig.json format");
+        let airoute = String::from("../../") + &serverconfig.aiRoute;
         for genome in &serverconfig.genomes {
             for dataset in &genome.datasets {
                 match &dataset.aifiles {
@@ -83,7 +85,6 @@ mod tests {
                                 .expect("Ollama server not found");
                             let embedding_model = ollama_client.embedding_model(ollama_embedding_model_name);
                             let comp_model = ollama_client.completion_model(ollama_comp_model_name);
                             for chart in ai_json.charts.clone() {
                                 match chart {
                                     super::super::Charts::Summary(testdata) => {
@@ -100,13 +101,16 @@ mod tests {
                                                 &dataset_db,
                                                 &genedb,
                                                 &ai_json,
+                                                &airoute,
+                                                testing,
                                             )
                                             .await;
-                                            let mut llm_json_value: super::super::SummaryType = serde_json::from_str(&llm_output.unwrap()).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
-                                            let mut expected_json_value: super::super::SummaryType = serde_json::from_str(&ques_ans.answer).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
+                                            let llm_json_value: super::super::SummaryType = serde_json::from_str(&llm_output.unwrap()).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
+                                            let sum: super::super::SummaryType = ques_ans.answer;
+                                            //println!("expected answer:{:?}", &sum);
                                             assert_eq!(
                                                 llm_json_value.sort_summarytype_struct(),
-                                                expected_json_value.sort_summarytype_struct()
+                                                sum.sort_summarytype_struct()
                                             );
                                         }
                                     }
@@ -142,13 +146,27 @@ mod tests {
                                                     &dataset_db,
                                                     &genedb,
                                                     &ai_json,
+                                                    &airoute,
+                                                    testing,
                                                 )
                                                 .await;
-                                                let mut llm_json_value: super::super::SummaryType = serde_json::from_str(&llm_output.unwrap()).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
-                                                let mut expected_json_value: super::super::SummaryType = serde_json::from_str(&ques_ans.answer).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
+                                                //println!("user_input:{}", user_input);
+                                                //println!("llm_answer:{:?}", llm_output);
+                                                //println!("expected answer:{:?}", &ques_ans.answer);
+                                                let llm_json_value: super::super::SummaryType = serde_json::from_str(&llm_output.unwrap()).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
+                                                //println!(
+                                                //    "llm_answer:{:?}",
+                                                //    llm_json_value.clone().sort_summarytype_struct()
+                                                //);
+                                                //println!(
+                                                //    "expected answer:{:?}",
+                                                //    &expected_json_value.clone().sort_summarytype_struct()
+                                                //);
+                                                let sum: super::super::SummaryType = ques_ans.answer;
+                                                //println!("expected answer:{:?}", &sum);
                                                 assert_eq!(
                                                     llm_json_value.sort_summarytype_struct(),
-                                                    expected_json_value.sort_summarytype_struct()
+                                                    sum.sort_summarytype_struct()
                                                 );
                                             } else {
                                                 panic!("The user input is empty");