@sjcrh/proteinpaint-rust 2.149.0 → 2.152.1-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/aichatbot.rs CHANGED
@@ -1,67 +1,63 @@
1
+ // Syntax: cd .. && cargo build --release && time cat ~/sjpp/test.txt | target/release/aichatbot
2
+ #![allow(non_snake_case)]
1
3
  use anyhow::Result;
2
4
  use json::JsonValue;
3
5
  use r2d2_sqlite::SqliteConnectionManager;
4
6
  use rig::agent::AgentBuilder;
5
- use rig::client::CompletionClient;
6
- use rig::client::EmbeddingsClient;
7
7
  use rig::completion::Prompt;
8
8
  use rig::embeddings::builder::EmbeddingsBuilder;
9
- use std::collections::HashMap;
10
- //use rig::providers::ollama;
11
9
  use rig::vector_store::in_memory_store::InMemoryVectorStore;
12
10
  use schemars::JsonSchema;
13
11
  use serde_json::{Map, Value, json};
14
- use std::io::{self};
12
+ use std::collections::HashMap;
13
+ use std::fs;
14
+ use std::io;
15
+ use std::path::Path;
16
+ mod ollama; // Importing custom rig module for invoking ollama server
15
17
  mod sjprovider; // Importing custom rig module for invoking SJ GPU server
16
18
 
17
- #[allow(non_camel_case_types)]
18
- #[derive(Debug, Clone)]
19
- enum llm_backend {
20
- Ollama(),
21
- Sj(),
22
- }
19
+ mod test_ai; // Test examples for AI chatbot
23
20
 
24
- #[derive(Debug, JsonSchema)]
25
- #[allow(dead_code)]
26
- struct OutputJson {
27
- pub answer: String,
21
+ // Struct for intaking data from dataset json
22
+ #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
23
+ pub struct AiJsonFormat {
24
+ hasGeneExpression: bool,
25
+ db: String, // Dataset db
26
+ genedb: String, // Gene db
27
+ charts: Vec<Charts>,
28
28
  }
29
29
 
30
- #[allow(non_camel_case_types)]
31
- #[derive(Debug, JsonSchema)]
32
- #[allow(dead_code)]
33
- enum cutoff_info {
34
- lesser(f32),
35
- greater(f32),
36
- equalto(f32),
30
+ #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
31
+ enum Charts {
32
+ // More chart types will be added here later
33
+ Summary(TrainTestData),
34
+ DE(TrainTestData),
37
35
  }
38
36
 
39
- #[derive(Debug, JsonSchema)]
40
- #[allow(dead_code)]
41
- struct Cutoff {
42
- cutoff_name: cutoff_info,
43
- units: Option<String>,
37
+ #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
38
+ struct TrainTestData {
39
+ SystemPrompt: String,
40
+ TrainingData: Vec<QuestionAnswer>,
41
+ TestData: Vec<QuestionAnswer>,
44
42
  }
45
43
 
46
- #[derive(Debug, JsonSchema)]
47
- #[allow(dead_code)]
48
- struct Filter {
49
- name: String,
50
- cutoff: Cutoff,
44
+ #[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
45
+ struct QuestionAnswer {
46
+ question: String,
47
+ answer: String,
51
48
  }
52
49
 
53
- #[derive(Debug, JsonSchema)]
54
- #[allow(dead_code)]
55
- struct Group {
56
- name: String,
57
- filter: Filter,
50
+ #[allow(non_camel_case_types)]
51
+ #[derive(Debug, Clone)]
52
+ pub enum llm_backend {
53
+ Ollama(),
54
+ Sj(),
58
55
  }
59
56
 
60
57
  #[derive(Debug, JsonSchema)]
61
58
  #[allow(dead_code)]
62
- struct DEOutput {
63
- group1: Group,
64
- group2: Group,
59
+ struct OutputJson {
60
+ pub answer: String,
65
61
  }
66
62
 
67
63
  #[tokio::main]
@@ -73,23 +69,64 @@ async fn main() -> Result<()> {
73
69
  let input_json = json::parse(&input);
74
70
  match input_json {
75
71
  Ok(json_string) => {
72
+ //println!("json_string:{}", json_string);
76
73
  let user_input_json: &JsonValue = &json_string["user_input"];
77
- //let user_input = "Does aspirin leads to decrease in death rates among Africans?";
78
- //let user_input = "Show the point deletion in TP53 gene.";
79
- //let user_input = "Generate DE plot for men with weight greater than 30lbs vs women less than 20lbs";
80
74
  let user_input: &str;
81
75
  match user_input_json.as_str() {
82
76
  Some(inp) => user_input = inp,
83
77
  None => panic!("user_input field is missing in input json"),
84
78
  }
85
79
 
86
- let dataset_db_json: &JsonValue = &json_string["dataset_db"];
87
- let mut dataset_db: Option<&str> = None;
88
- match dataset_db_json.as_str() {
89
- Some(inp) => dataset_db = Some(inp),
90
- None => {}
80
+ if user_input.len() == 0 {
81
+ panic!("The user input is empty");
82
+ }
83
+
84
+ let tpmasterdir_json: &JsonValue = &json_string["tpmasterdir"];
85
+ let tpmasterdir: &str;
86
+ match tpmasterdir_json.as_str() {
87
+ Some(inp) => tpmasterdir = inp,
88
+ None => panic!("tpmasterdir not found"),
89
+ }
90
+
91
+ let binpath_json: &JsonValue = &json_string["binpath"];
92
+ let binpath: &str;
93
+ match binpath_json.as_str() {
94
+ Some(inp) => binpath = inp,
95
+ None => panic!("binpath not found"),
96
+ }
97
+
98
+ let ai_json_file_json: &JsonValue = &json_string["aifiles"];
99
+ let ai_json_file: String;
100
+ match ai_json_file_json.as_str() {
101
+ Some(inp) => ai_json_file = String::from(binpath) + &"/../../" + &inp,
102
+ None => {
103
+ panic!("ai json file not found")
104
+ }
91
105
  }
92
106
 
107
+ let ai_json_file = Path::new(&ai_json_file);
108
+ let ai_json_file_path;
109
+ let current_dir = std::env::current_dir().unwrap();
110
+ match ai_json_file.canonicalize() {
111
+ Ok(p) => ai_json_file_path = p,
112
+ Err(_) => {
113
+ panic!(
114
+ "AI JSON file path not found:{:?}, current directory:{:?}",
115
+ ai_json_file, current_dir
116
+ )
117
+ }
118
+ }
119
+
120
+ // Read the file
121
+ let ai_data = fs::read_to_string(ai_json_file_path).unwrap();
122
+
123
+ // Parse the JSON data
124
+ let ai_json: AiJsonFormat =
125
+ serde_json::from_str(&ai_data).expect("AI JSON file does not have the correct format");
126
+
127
+ let genedb = String::from(tpmasterdir) + &"/" + &ai_json.genedb;
128
+ let dataset_db = String::from(tpmasterdir) + &"/" + &ai_json.db;
129
+
93
130
  let apilink_json: &JsonValue = &json_string["apilink"];
94
131
  let apilink: &str;
95
132
  match apilink_json.as_str() {
@@ -131,7 +168,7 @@ async fn main() -> Result<()> {
131
168
  } else if llm_backend_name == "ollama".to_string() {
132
169
  llm_backend_type = llm_backend::Ollama();
133
170
  // Initialize Ollama client
134
- let ollama_client = rig::providers::ollama::Client::builder()
171
+ let ollama_client = ollama::Client::builder()
135
172
  .base_url(apilink)
136
173
  .build()
137
174
  .expect("Ollama server not found");
@@ -145,10 +182,11 @@ async fn main() -> Result<()> {
145
182
  temperature,
146
183
  max_new_tokens,
147
184
  top_p,
148
- dataset_db,
185
+ &dataset_db,
186
+ &genedb,
187
+ &ai_json,
149
188
  )
150
189
  .await;
151
- // "gpt-oss:20b" "granite3-dense:latest" "PetrosStav/gemma3-tools:12b" "llama3-groq-tool-use:latest" "PetrosStav/gemma3-tools:12b"
152
190
  } else if llm_backend_name == "SJ".to_string() {
153
191
  llm_backend_type = llm_backend::Sj();
154
192
  // Initialize Sj provider client
@@ -166,17 +204,19 @@ async fn main() -> Result<()> {
166
204
  temperature,
167
205
  max_new_tokens,
168
206
  top_p,
169
- dataset_db,
207
+ &dataset_db,
208
+ &genedb,
209
+ &ai_json,
170
210
  )
171
211
  .await;
172
212
  }
173
213
 
174
214
  match final_output {
175
215
  Some(fin_out) => {
176
- println!("final_output:{:?}", fin_out);
216
+ println!("final_output:{:?}", fin_out.replace("\\", ""));
177
217
  }
178
218
  None => {
179
- println!("final_output:{{\"{}\":\"{}\"}}", "chartType", "unknown");
219
+ println!("final_output:{{\"{}\":\"{}\"}}", "action", "unknown");
180
220
  }
181
221
  }
182
222
  }
@@ -188,7 +228,7 @@ async fn main() -> Result<()> {
188
228
  Ok(())
189
229
  }
190
230
 
191
- async fn run_pipeline(
231
+ pub async fn run_pipeline(
192
232
  user_input: &str,
193
233
  comp_model: impl rig::completion::CompletionModel + 'static,
194
234
  embedding_model: impl rig::embeddings::EmbeddingModel + 'static,
@@ -196,7 +236,9 @@ async fn run_pipeline(
196
236
  temperature: f64,
197
237
  max_new_tokens: usize,
198
238
  top_p: f32,
199
- dataset_db: Option<&str>,
239
+ dataset_db: &str,
240
+ genedb: &str,
241
+ ai_json: &AiJsonFormat,
200
242
  ) -> Option<String> {
201
243
  let mut classification: String = classify_query_by_dataset_type(
202
244
  user_input,
@@ -223,7 +265,7 @@ async fn run_pipeline(
223
265
  .await;
224
266
  final_output = format!(
225
267
  "{{\"{}\":\"{}\",\"{}\":[{}}}",
226
- "chartType",
268
+ "action",
227
269
  "dge",
228
270
  "DE_output",
229
271
  de_result + &"]"
@@ -238,32 +280,32 @@ async fn run_pipeline(
238
280
  max_new_tokens,
239
281
  top_p,
240
282
  dataset_db,
283
+ genedb,
284
+ ai_json,
241
285
  )
242
286
  .await;
243
- } else if classification == "hierarchial".to_string() {
287
+ } else if classification == "hierarchical".to_string() {
244
288
  // Not implemented yet
245
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "hierarchial");
289
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "hierarchical");
246
290
  } else if classification == "snv_indel".to_string() {
247
291
  // Not implemented yet
248
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "snv_indel");
292
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "snv_indel");
249
293
  } else if classification == "cnv".to_string() {
250
294
  // Not implemented yet
251
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "cnv");
295
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "cnv");
252
296
  } else if classification == "variant_calling".to_string() {
253
297
  // Not implemented yet and will never be supported. Need a separate messages for this
254
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "variant_calling");
255
- } else if classification == "surivial".to_string() {
298
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "variant_calling");
299
+ } else if classification == "survival".to_string() {
256
300
  // Not implemented yet
257
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "surivial");
301
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "surivial");
258
302
  } else if classification == "none".to_string() {
259
- final_output = format!("{{\"{}\":\"{}\"}}", "chartType", "none");
260
- println!("The input query did not match any known features in Proteinpaint");
261
- } else {
262
303
  final_output = format!(
263
- "{{\"{}\":\"{}\"}}",
264
- "chartType",
265
- "unknown:".to_string() + &classification
304
+ "{{\"{}\":\"{}\",\"{}\":\"{}\"}}",
305
+ "action", "none", "message", "The input query did not match any known features in Proteinpaint"
266
306
  );
307
+ } else {
308
+ final_output = format!("{{\"{}\":\"{}\"}}", "action", "unknown:".to_string() + &classification);
267
309
  }
268
310
  Some(final_output)
269
311
  }
@@ -295,19 +337,35 @@ Structural variants/fusions (SV) are genomic mutations when eith a DNA region is
295
337
  If a ProteinPaint dataset contains structural variation or gene fusion data then return JSON with single key, 'sv_fusion'.
296
338
  ---
297
339
 
298
- Hierarchial clustering of gene expression is an unsupervised learning technique where several number of relevant genes and the samples are clustered so as to determine (previously unknown) cohorts of samples (or patients) or structure in data. It is very commonly used to determine subtypes of a particular disease based on RNA sequencing data.
340
+ Hierarchical clustering of gene expression is an unsupervised learning technique where several number of relevant genes and the samples are clustered so as to determine (previously unknown) cohorts of samples (or patients) or structure in data. It is very commonly used to determine subtypes of a particular disease based on RNA sequencing data.
299
341
 
300
- If a ProteinPaint dataset contains hierarchial data then return JSON with single key, 'hierarchial'.
342
+ If a ProteinPaint dataset contains hierarchical data then return JSON with single key, 'hierarchical'.
301
343
 
302
344
  ---
303
345
 
304
- Differential Gene Expression (DGE or DE) is a technique where the most upregulated and downregulated genes between two cohorts of samples (or patients) are determined. A volcano plot is shown with fold-change in the x-axis and adjusted p-value on the y-axis. So, the upregulated and downregulared genes are on opposite sides of the graph and the most significant genes (based on adjusted p-value) is on the top of the graph. Following differential gene expression generally GeneSet Enrichment Analysis (GSEA) is carried out where based on the genes and their corresponding fold changes the upregulation/downregulation of genesets (or pathways) is determined.
346
+ Differential Gene Expression (DGE or DE) is a technique where the most upregulated (or highest) and downregulated (or lowest) genes between two cohorts of samples (or patients) are determined from a pool of THOUSANDS of genes. Differential gene expression CANNOT be computed for a SINGLE gene. A volcano plot is shown with fold-change in the x-axis and adjusted p-value on the y-axis. So, the upregulated and downregulared genes are on opposite sides of the graph and the most significant genes (based on adjusted p-value) is on the top of the graph. Following differential gene expression generally GeneSet Enrichment Analysis (GSEA) is carried out where based on the genes and their corresponding fold changes the upregulation/downregulation of genesets (or pathways) is determined.
347
+
348
+ Sample Query1: \"Which gene has the highest expression between the two genders\"
349
+ Sample Answer1: { \"answer\": \"dge\" }
350
+
351
+ Sample Query2: \"Which gene has the lowest expression between the two races\"
352
+ Sample Answer2: { \"answer\": \"dge\" }
353
+
354
+ Sample Query1: \"Which genes are the most upregulated genes between group A and group B\"
355
+ Sample Answer1: { \"answer\": \"dge\" }
356
+
357
+ Sample Query3: \"Which gene are overexpressed between male and female\"
358
+ Sample Answer3: { \"answer\": \"dge\" }
359
+
360
+ Sample Query4: \"Which gene are housekeeping genes between male and female\"
361
+ Sample Answer4: { \"answer\": \"dge\" }
362
+
305
363
 
306
364
  If a ProteinPaint dataset contains differential gene expression data then return JSON with single key, 'dge'.
307
365
 
308
366
  ---
309
367
 
310
- Survival analysis (also called time-to-event analysis or duration analysis) is a branch of statistics aimed at analyzing the duration of time from a well-defined time origin until one or more events happen, called survival times or duration times. In other words, in survival analysis, we are interested in a certain event and want to analyze the time until the event happens.
368
+ Survival analysis (also called time-to-event analysis or duration analysis) is a branch of statistics aimed at analyzing the duration of time from a well-defined time origin until one or more events happen, called survival times or duration times. In other words, in survival analysis, we are interested in a certain event and want to analyze the time until the event happens. Generally in survival analysis survival rates between two (or more) cohorts of patients is compared.
311
369
 
312
370
  There are two main methods of survival analysis:
313
371
 
@@ -319,6 +377,10 @@ There are two main methods of survival analysis:
319
377
  HR < 1: Reduction in the hazard
320
378
  HR > 1: Increase in Hazard
321
379
 
380
+ Sample Query1: \"Compare survival rates between group A and B\"
381
+ Sample Answer1: { \"answer\": \"survival\" }
382
+
383
+
322
384
  If a ProteinPaint dataset contains survival data then return JSON with single key, 'survival'.
323
385
 
324
386
  ---
@@ -329,15 +391,20 @@ If a user query asks about variant calling or mapping reads then JSON with singl
329
391
 
330
392
  ---
331
393
 
332
- Summary plot in ProteinPaint shows the various facets of the datasets. It may show all the samples according to their respective diagnosis or subtypes of cancer. It is also useful for visualizing all the different facets of the dataset. You can display a categorical variable and overlay another variable on top it and stratify (or divide) using a third variable simultaneously. You can also custom filters to the dataset so that you can only study part of the dataset. If a user query asks about variant calling or mapping reads then JSON with single key, 'summary'.
394
+ Summary plot in ProteinPaint shows the various facets of the datasets. Show expression of a SINGLE gene or compare the expression of a SINGLE gene across two different cohorts defined by the user. It may show all the samples according to their respective diagnosis or subtypes of cancer. It is also useful for comparing and correlating different clinical variables. It can show all possible distributions, frequency of a category, overlay, correlate or cross-tabulate with another variable on top of it. If a user query asks about a SINGLE gene expression or correlating clinical variables then return JSON with single key, 'summary'.
333
395
 
334
396
  Sample Query1: \"Show all fusions for patients with age less than 30\"
335
397
  Sample Answer1: { \"answer\": \"summary\" }
336
398
 
337
- Sample Query1: \"List all molecular subtypes of leukemia\"
338
- Sample Answer1: { \"answer\": \"summary\" }
399
+ Sample Query2: \"List all molecular subtypes of leukemia\"
400
+ Sample Answer2: { \"answer\": \"summary\" }
401
+
402
+ Sample Query3: \"is tp53 expression higher in men than women ?\"
403
+ Sample Answer3: { \"answer\": \"summary\" }
404
+
405
+ Sample Query4: \"Compare ATM expression between races for women greater than 80yrs\"
406
+ Sample Answer4: { \"answer\": \"summary\" }
339
407
 
340
- ---
341
408
 
342
409
  If a query does not match any of the fields described above, then return JSON with single key, 'none'
343
410
  ");
@@ -345,14 +412,16 @@ If a query does not match any of the fields described above, then return JSON wi
345
412
  // Split the contents by the delimiter "---"
346
413
  let parts: Vec<&str> = contents.split("---").collect();
347
414
  let schema_json: Value = serde_json::to_value(schemars::schema_for!(OutputJson)).unwrap(); // error handling here
415
+ let schema_json_string = serde_json::to_string_pretty(&schema_json).unwrap();
348
416
 
349
417
  let additional;
350
418
  match llm_backend_type {
351
419
  llm_backend::Ollama() => {
352
420
  additional = json!({
353
- "format": schema_json
354
- }
355
- );
421
+ "max_new_tokens": max_new_tokens,
422
+ "top_p": top_p,
423
+ "schema_json": schema_json_string
424
+ });
356
425
  }
357
426
  llm_backend::Sj() => {
358
427
  additional = json!({
@@ -369,7 +438,7 @@ If a query does not match any of the fields described above, then return JSON wi
369
438
  rag_docs.push(part.trim().to_string())
370
439
  }
371
440
 
372
- let top_k: usize = 3;
441
+ //let top_k: usize = 3;
373
442
  // Create embeddings and add to vector store
374
443
  let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
375
444
  .documents(rag_docs)
@@ -383,20 +452,25 @@ If a query does not match any of the fields described above, then return JSON wi
383
452
  InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
384
453
 
385
454
  // Create RAG agent
386
- let agent = AgentBuilder::new(comp_model).preamble("Generate classification for the user query into summary, dge, hierarchial, snv_indel, cnv, variant_calling, sv_fusion and none categories. Return output in JSON with ALWAYS a single word answer { \"answer\": \"dge\" }, that is 'summary' for summary plot, 'dge' for differential gene expression, 'hierarchial' for hierarchial clustering, 'snv_indel' for SNV/Indel, 'cnv' for CNV and 'sv_fusion' for SV/fusion, 'variant_calling' for variant calling, 'surivial' for survival data, 'none' for none of the previously described categories. The summary plot list and summarizes the cohort of patients according to the user query. The answer should always be in lower case").dynamic_context(top_k, vector_store.index(embedding_model)).temperature(temperature).additional_params(additional).build();
455
+ let agent = AgentBuilder::new(comp_model).preamble(&(String::from("Generate classification for the user query into summary, dge, hierarchical, snv_indel, cnv, variant_calling, sv_fusion and none categories. Return output in JSON with ALWAYS a single word answer { \"answer\": \"dge\" }, that is 'summary' for summary plot, 'dge' for differential gene expression, 'hierarchical' for hierarchical clustering, 'snv_indel' for SNV/Indel, 'cnv' for CNV and 'sv_fusion' for SV/fusion, 'variant_calling' for variant calling, 'surivial' for survival data, 'none' for none of the previously described categories. The summary plot list and summarizes the cohort of patients according to the user query. The answer should always be in lower case\n The options are as follows:\n") + &contents + "\nQuestion= {question} \nanswer")).temperature(temperature).additional_params(additional).build();
456
+ //.dynamic_context(top_k, vector_store.index(embedding_model))
387
457
 
388
- let response = agent.prompt(user_input).await.expect("Failed to prompt ollama");
458
+ let response = agent.prompt(user_input).await.expect("Failed to prompt server");
389
459
 
390
460
  //println!("Ollama: {}", response);
391
461
  let result = response.replace("json", "").replace("```", "");
392
462
  let json_value: Value = serde_json::from_str(&result).expect("REASON");
393
463
  match llm_backend_type {
394
- llm_backend::Ollama() => json_value.as_object().unwrap()["answer"].to_string().replace("\"", ""),
464
+ llm_backend::Ollama() => {
465
+ let json_value2: Value = serde_json::from_str(&json_value["content"].to_string()).expect("REASON2");
466
+ let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON3");
467
+ json_value3["answer"].to_string()
468
+ }
395
469
  llm_backend::Sj() => {
396
470
  let json_value2: Value =
397
471
  serde_json::from_str(&json_value[0]["generated_text"].to_string()).expect("REASON2");
398
472
  //println!("json_value2:{}", json_value2.as_str().unwrap());
399
- let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON2");
473
+ let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON3");
400
474
  //let json_value3: Value = serde_json::from_str(&json_value2["answer"].to_string()).expect("REASON2");
401
475
  //println!("Classification result:{}", json_value3["answer"]);
402
476
  json_value3["answer"].to_string()
@@ -404,6 +478,45 @@ If a query does not match any of the fields described above, then return JSON wi
404
478
  }
405
479
  }
406
480
 
481
+ // DE JSON output schema
482
+
483
+ #[allow(non_camel_case_types)]
484
+ #[derive(Debug, JsonSchema)]
485
+ #[allow(dead_code)]
486
+ enum cutoff_info {
487
+ lesser(f32),
488
+ greater(f32),
489
+ equalto(f32),
490
+ }
491
+
492
+ #[derive(Debug, JsonSchema)]
493
+ #[allow(dead_code)]
494
+ struct Cutoff {
495
+ cutoff_name: cutoff_info,
496
+ units: Option<String>,
497
+ }
498
+
499
+ #[derive(Debug, JsonSchema)]
500
+ #[allow(dead_code)]
501
+ struct Filter {
502
+ name: String,
503
+ cutoff: Cutoff,
504
+ }
505
+
506
+ #[derive(Debug, JsonSchema)]
507
+ #[allow(dead_code)]
508
+ struct Group {
509
+ name: String,
510
+ filter: Filter,
511
+ }
512
+
513
+ #[derive(Debug, JsonSchema)]
514
+ #[allow(dead_code)]
515
+ struct DEOutput {
516
+ group1: Group,
517
+ group2: Group,
518
+ }
519
+
407
520
  #[allow(non_snake_case)]
408
521
  async fn extract_DE_search_terms_from_query(
409
522
  user_input: &str,
@@ -440,16 +553,17 @@ Output JSON query5: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"
440
553
  let parts: Vec<&str> = contents.split("---").collect();
441
554
 
442
555
  let schema_json: Value = serde_json::to_value(schemars::schema_for!(DEOutput)).unwrap(); // error handling here
443
-
556
+ let schema_json_string = serde_json::to_string_pretty(&schema_json).unwrap();
444
557
  //println!("DE schema:{}", schema_json);
445
558
 
446
559
  let additional;
447
560
  match llm_backend_type {
448
561
  llm_backend::Ollama() => {
449
562
  additional = json!({
450
- "format": schema_json
451
- }
452
- );
563
+ "max_new_tokens": max_new_tokens,
564
+ "top_p": top_p,
565
+ "schema_json": schema_json_string
566
+ });
453
567
  }
454
568
  llm_backend::Sj() => {
455
569
  additional = json!({
@@ -480,16 +594,21 @@ Output JSON query5: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"
480
594
  InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
481
595
 
482
596
  // Create RAG agent
483
- let router_instructions = "Extract the group variable names for differential gene expression from input query. When two groups are found give the following JSON output with no extra comments. Show {{\"group1\": {\"name\": \"groupA\"}, \"group2\": {\"name\": \"groupB\"}}}. In case no suitable groups are found, show {\"output\":\"No suitable two groups found for differential gene expression\"}. In case of a continuous variable such as age, height added additional field to the group called \"filter\". This should contain a sub-field called \"names\" followed by a subfield called \"cutoff\". This sub-field should contain a key either greater, lesser or equalto. If the continuous variable has units provided by the user then add it in a separate field called \"units\". User query1: \"Show volcano plot for Asians with age less than 20 and African greater than 80\". Output JSON query1: {\"group1\": {\"name\": \"Asians\", \"filter\": {\"name\": \"age\", \"cutoff\": {\"lesser\": 20}}}, \"group2\": {\"name\": \"African\", \"filter\": {\"name\": \"age\", \"cutoff\": {\"greater\": 80}}}}. User query2: \"Show Differential gene expression plot for males with height greater than 185cm and women with less than 100cm\". Output JSON query2: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"height\", \"cutoff\": {\"greater\": 185, \"units\":\"cm\"}}}, \"group2\": {\"name\": \"women\", \"filter\": {\"name\": \"height\", \"cutoff\": {\"lesser\": 100, \"units\": \"cm\"}}}}. User query3: \"Show DE plot between healthy and diseased groups. Output JSON query3: {\"group1\":{\"name\":\"healthy\"},\"group2\":{\"name\":\"diseased\"}}";
597
+ let router_instructions = String::from(
598
+ "Extract the group variable names for differential gene expression from input query. When two groups are found give the following JSON output with no extra comments. Show {{\"group1\": {\"name\": \"groupA\"}, \"group2\": {\"name\": \"groupB\"}}}. In case no suitable groups are found, show {\"output\":\"No suitable two groups found for differential gene expression\"}. In case of a continuous variable such as age, height added additional field to the group called \"filter\". This should contain a sub-field called \"names\" followed by a subfield called \"cutoff\". This sub-field should contain a key either greater, lesser or equalto. If the continuous variable has units provided by the user then add it in a separate field called \"units\".",
599
+ ) + &contents
600
+ + " The JSON schema is as follows"
601
+ + &schema_json_string
602
+ + "\n Examples: User query1: \"Show volcano plot for Asians with age less than 20 and African greater than 80\". Output JSON query1: {\"group1\": {\"name\": \"Asians\", \"filter\": {\"name\": \"age\", \"cutoff\": {\"lesser\": 20}}}, \"group2\": {\"name\": \"African\", \"filter\": {\"name\": \"age\", \"cutoff\": {\"greater\": 80}}}}. User query2: \"Show Differential gene expression plot for males with height greater than 185cm and women with less than 100cm\". Output JSON query2: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"height\", \"cutoff\": {\"greater\": 185, \"units\":\"cm\"}}}, \"group2\": {\"name\": \"women\", \"filter\": {\"name\": \"height\", \"cutoff\": {\"lesser\": 100, \"units\": \"cm\"}}}}. User query3: \"Show DE plot between healthy and diseased groups. Output JSON query3: {\"group1\":{\"name\":\"healthy\"},\"group2\":{\"name\":\"diseased\"}} \nQuestion= {question} \nanswer";
484
603
  //println! {"router_instructions:{}",router_instructions};
485
604
  let agent = AgentBuilder::new(comp_model)
486
- .preamble(router_instructions)
605
+ .preamble(&router_instructions)
487
606
  .dynamic_context(rag_docs_length, vector_store.index(embedding_model))
488
607
  .temperature(temperature)
489
608
  .additional_params(additional)
490
609
  .build();
491
610
 
492
- let response = agent.prompt(user_input).await.expect("Failed to prompt ollama");
611
+ let response = agent.prompt(user_input).await.expect("Failed to prompt server");
493
612
 
494
613
  //println!("Ollama_groups: {}", response);
495
614
  let result = response.replace("json", "").replace("```", "");
@@ -497,7 +616,12 @@ Output JSON query5: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"
497
616
  let json_value: Value = serde_json::from_str(&result).expect("REASON");
498
617
  //println!("json_value:{}", json_value);
499
618
  match llm_backend_type {
500
- llm_backend::Ollama() => json_value.to_string(),
619
+ llm_backend::Ollama() => {
620
+ let json_value2: Value = serde_json::from_str(&json_value["content"].to_string()).expect("REASON2");
621
+ //println!("json_value2:{:?}", json_value2);
622
+ let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON3");
623
+ json_value3.to_string()
624
+ }
501
625
  llm_backend::Sj() => {
502
626
  let json_value2: Value =
503
627
  serde_json::from_str(&json_value[0]["generated_text"].to_string()).expect("REASON2");
@@ -509,6 +633,7 @@ Output JSON query5: {\"group1\": {\"name\": \"males\", \"filter\": {\"name\": \"
509
633
  }
510
634
  }
511
635
 
636
+ #[derive(Debug, Clone)]
512
637
  struct DbRows {
513
638
  name: String,
514
639
  description: Option<String>,
@@ -516,6 +641,21 @@ struct DbRows {
516
641
  values: Vec<String>,
517
642
  }
518
643
 
644
+ async fn parse_geneset_db(db: &str) -> Vec<String> {
645
+ let manager = SqliteConnectionManager::file(db);
646
+ let pool = r2d2::Pool::new(manager).unwrap();
647
+ let conn = pool.get().unwrap();
648
+ let sql_statement_genedb = "SELECT * from codingGenes";
649
+ let mut genedb = conn.prepare(&sql_statement_genedb).unwrap();
650
+ let mut rows_genedb = genedb.query([]).unwrap();
651
+ let mut gene_list = Vec::<String>::new();
652
+ while let Some(coding_gene) = rows_genedb.next().unwrap() {
653
+ let code_gene: String = coding_gene.get(0).unwrap();
654
+ gene_list.push(code_gene)
655
+ }
656
+ gene_list
657
+ }
658
+
519
659
  trait ParseDbRows {
520
660
  fn parse_db_rows(&self) -> String;
521
661
  }
@@ -544,7 +684,7 @@ impl ParseDbRows for DbRows {
544
684
  }
545
685
  }
546
686
 
547
- async fn parse_dataset_db(db: &str) -> Vec<String> {
687
+ async fn parse_dataset_db(db: &str) -> (Vec<String>, Vec<DbRows>) {
548
688
  let manager = SqliteConnectionManager::file(db);
549
689
  let pool = r2d2::Pool::new(manager).unwrap();
550
690
  let conn = pool.get().unwrap();
@@ -574,7 +714,7 @@ async fn parse_dataset_db(db: &str) -> Vec<String> {
574
714
  }
575
715
 
576
716
  //// Open the file
577
- //let mut file = File::open(dataset_agnostic_file).unwrap();
717
+ //let mut file = File::open(dataset_file).unwrap();
578
718
 
579
719
  //// Create a string to hold the file contents
580
720
  //let mut contents = String::new();
@@ -603,6 +743,7 @@ async fn parse_dataset_db(db: &str) -> Vec<String> {
603
743
  // Print the separated parts
604
744
  let mut rag_docs = Vec::<String>::new();
605
745
  let mut names = Vec::<String>::new();
746
+ let mut db_vec = Vec::<DbRows>::new();
606
747
  while let Some(row) = rows_terms.next().unwrap() {
607
748
  //println!("row:{:?}", row);
608
749
  let name: String = row.get(0).unwrap();
@@ -637,6 +778,7 @@ async fn parse_dataset_db(db: &str) -> Vec<String> {
637
778
  term_type: item_type,
638
779
  values: keys,
639
780
  };
781
+ db_vec.push(item.clone());
640
782
  //println!("Field details:{}", item.parse_db_rows());
641
783
  rag_docs.push(item.parse_db_rows());
642
784
  names.push(name)
@@ -645,60 +787,109 @@ async fn parse_dataset_db(db: &str) -> Vec<String> {
645
787
  }
646
788
  }
647
789
  //println!("names:{:?}", names);
648
- rag_docs
790
+ (rag_docs, db_vec)
649
791
  }
650
792
 
651
793
  async fn extract_summary_information(
652
794
  user_input: &str,
653
795
  comp_model: impl rig::completion::CompletionModel + 'static,
654
- embedding_model: impl rig::embeddings::EmbeddingModel + 'static,
796
+ _embedding_model: impl rig::embeddings::EmbeddingModel + 'static,
655
797
  llm_backend_type: &llm_backend,
656
798
  temperature: f64,
657
799
  max_new_tokens: usize,
658
800
  top_p: f32,
659
- dataset_db: Option<&str>,
801
+ dataset_db: &str,
802
+ genedb: &str,
803
+ ai_json: &AiJsonFormat,
660
804
  ) -> String {
661
- match dataset_db {
662
- Some(db) => {
663
- let rag_docs = parse_dataset_db(db).await;
664
- //println!("rag_docs:{:?}", rag_docs);
665
- let additional;
666
- match llm_backend_type {
667
- llm_backend::Ollama() => {
668
- additional = json!({});
669
- }
670
- llm_backend::Sj() => {
671
- additional = json!({
672
- "max_new_tokens": max_new_tokens,
673
- "top_p": top_p
674
- });
675
- }
676
- }
677
-
678
- let rag_docs_length = rag_docs.len();
679
- // Create embeddings and add to vector store
680
- let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
681
- .documents(rag_docs)
682
- .expect("Reason1")
683
- .build()
684
- .await
685
- .unwrap();
805
+ let (rag_docs, db_vec) = parse_dataset_db(dataset_db).await;
806
+ let additional;
807
+ let schema_json = schemars::schema_for!(SummaryType); // error handling here
808
+ let schema_json_string = serde_json::to_string_pretty(&schema_json).unwrap();
809
+ //println!("schema_json summary:{}", schema_json_string);
810
+ match llm_backend_type {
811
+ llm_backend::Ollama() => {
812
+ additional = json!({
813
+ "max_new_tokens": max_new_tokens,
814
+ "top_p": top_p,
815
+ "schema_json": schema_json_string
816
+ });
817
+ }
818
+ llm_backend::Sj() => {
819
+ additional = json!({
820
+ "max_new_tokens": max_new_tokens,
821
+ "top_p": top_p
822
+ });
823
+ }
824
+ }
686
825
 
687
- // Create vector store
688
- let mut vector_store = InMemoryVectorStore::<String>::default();
689
- InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
826
+ // Create embeddings and add to vector store
827
+ //let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
828
+ // .documents(rag_docs)
829
+ // .expect("Reason1")
830
+ // .build()
831
+ // .await
832
+ // .unwrap();
833
+
834
+ //// Create vector store
835
+ //let mut vector_store = InMemoryVectorStore::<String>::default();
836
+ //InMemoryVectorStore::add_documents(&mut vector_store, embeddings);
837
+
838
+ let gene_list: Vec<String> = parse_geneset_db(genedb).await;
839
+ let lowercase_user_input = user_input.to_lowercase();
840
+ let user_words: Vec<&str> = lowercase_user_input.split_whitespace().collect();
841
+ let user_words2: Vec<String> = user_words.into_iter().map(|s| s.to_string()).collect();
842
+
843
+ let common_genes: Vec<String> = gene_list
844
+ .into_iter()
845
+ .filter(|x| user_words2.contains(&x.to_lowercase()))
846
+ .collect();
847
+
848
+ let mut summary_data_check: Option<TrainTestData> = None;
849
+ for chart in ai_json.charts.clone() {
850
+ if let Charts::Summary(traindata) = chart {
851
+ summary_data_check = Some(traindata);
852
+ break;
853
+ }
854
+ }
690
855
 
691
- //let system_prompt = "I am an assistant that figures out the summary term from its respective dataset file. Extract the summary term {summary_term} from user query. The final output must be in the following JSON format {{\"chartType\":\"summary\",\"term\":{{\"id\":\"{{summary_term}}\"}}}}";
856
+ match summary_data_check {
857
+ Some(summary_data) => {
858
+ let mut training_data: String = String::from("");
859
+ let mut train_iter = 0;
860
+ for ques_ans in summary_data.TrainingData {
861
+ train_iter += 1;
862
+ training_data += "Example question";
863
+ training_data += &train_iter.to_string();
864
+ training_data += &":";
865
+ training_data += &ques_ans.question;
866
+ training_data += &" ";
867
+ training_data += "Example answer";
868
+ training_data += &train_iter.to_string();
869
+ training_data += &":";
870
+ training_data += &ques_ans.answer;
871
+ training_data += &"\n";
872
+ }
692
873
 
693
- let top_k = rag_docs_length;
694
- let system_prompt = String::from(
695
- "I am an assistant that extracts the summary term from user query. It has four fields: group_categories (required), overlay (optional), filter (optional) and divide_by (optional). group_categories (required) is the primary variable being displayed. Overlay consists of the variable that must be overlayed on top of group_categories. divide_by is the variable used to stratify group_categories into two or more categories. The final output must be in the following JSON format with no extra comments: {\"chartType\":\"summary\",\"term\":{\"group_categories\":\"{group_category_answer}\",\"overlay\":\"{overlay_answer}\",\"divide_by\":\"{divide_by_answer}\",\"filter\":\"{filter_answer}\"}}. The values being added to the JSON parameters must be previously defined as field in the database. If the filter variable is a \"value\" of a \"field\" in the database, use the field name and add the value as a \"filter cutoff\" . If the \"filter\" field is defined in the user query, it should contain an array with each item containing a subfield called \"name\" with the name of the filter variable. If the type of variable is \"categories\", add another field as \"variable_type\" = \"categories\". In case the type of the variable is \"categories\", show the sub-category as a separate sub-field \"cutoff\" with a sub nested JSON with \"name\" as the field containing the subcategory name. In case the type of the variable is \"float\" it should contain a subfield called \"name\" followed by subfield \"variable_type\" = \"float\". In the \"cutoff\" subfield, the nested JSON should contain the field \"lower\" containing the lower numeric limit and the \"upper\" field containing the upper numeric limit. If the upper and lower cutoffs are not defined in the user query, use a default value of 0 and 100 respectively. Sample query1: \"Show ETR1 subtype\" Answer query1: \"{\"chartType\":\"summary\",\"term\":{\"group_categories\":\"ETR1\"}}. 
Sample query2: \"Show hyperdiploid subtype with age overlayed on top of it\" Answer query2: \"{\"chartType\":\"summary\",\"term\":{\"group_categories\":\"hyperdiploid\", \"overlay\":\"age\"}}. Sample query3: \"Show BAR1 subtype with age overlayed on top of it and stratify it on the basis of gender\" Answer query4: \"{\"chartType\":\"summary\",\"term\":{\"group_categories\":\"BAR1\", \"overlay\":\"age\", \"divide_by\":\"sex\"}}. Sample query5: \"Show summary for cancer-diagnosis only for men\". Since gender is a categorical variable and the user wants to select for men, the answer query for sample query5 is as follows: \"{\"chartType\":\"summary\",\"term\":{\"group_categories\":\"cancer-diagnosis\", \"filter\": {\"name\": \"gender\", \"variable_type\": \"categories\", \"cutoff\": {\"name\": \"male\"}}}}. Sample query6: \"Show molecular subtype summary for patients with age less than 30\". Age is a float variable so we need to provide the lower and higher cutoffs. So the answer to sample query6 is as follows: \"{\"chartType\":\"summary\",\"term\":{\"group_categories\":\"Molecular subtype\", \"filter\": {\"name\": \"age\", \"variable_type\": \"float\", \"cutoff\": {\"lower\": 0, \"higher\": 30}}}} ",
874
+ let system_prompt: String = String::from(
875
+ String::from(
876
+ "I am an assistant that extracts the summary terms from user query. The final output must be in the following JSON format with NO extra comments. There are three fields in the JSON to be returned: The \"action\" field will ALWAYS be \"summary\". The \"summaryterms\" field should contain all the variables that the user wants to visualize. The \"clinical\" subfield should ONLY contain names of the fields from the sqlite db. ",
877
+ ) + &summary_data.SystemPrompt
878
+ + &" The \"filter\" field is optional and should contain an array of JSON terms with which the dataset will be filtered. A variable simultaneously CANNOT be part of both \"summaryterms\" and \"filter\". There are two kinds of filter variables: \"Categorical\" and \"Numeric\". \"Categorical\" variables are those variables which can have a fixed set of values e.g. gender, molecular subtypes. They are defined by the \"CategoricalFilterTerm\" which consists of \"term\" (a field from the sqlite3 db) and \"value\" (a value of the field from the sqlite db). \"Numeric\" variables are those which can have any numeric value. They are defined by \"NumericFilterTerm\" and contain the subfields \"term\" (a field from the sqlite3 db), \"greaterThan\" an optional filter which is defined when a lower cutoff is defined in the user input for the numeric variable and \"lessThan\" an optional filter which is defined when a higher cutoff is defined in the user input for the numeric variable. The \"message\" field only contain messages of terms in the user input that were not found in their respective databases. The JSON schema is as follows:"
879
+ + &schema_json_string
880
+ + &training_data
881
+ + "The sqlite db in plain language is as follows:\n"
882
+ + &rag_docs.join(",")
883
+ + &"\n Relevant genes are as follows (separated by comma(,)):"
884
+ + &common_genes.join(",")
885
+ + &"\nQuestion: {question} \nanswer:",
696
886
  );
887
+
697
888
  //println!("system_prompt:{}", system_prompt);
698
889
  // Create RAG agent
699
890
  let agent = AgentBuilder::new(comp_model)
700
891
  .preamble(&system_prompt)
701
- .dynamic_context(top_k, vector_store.index(embedding_model))
892
+ //.dynamic_context(top_k, vector_store.index(embedding_model))
702
893
  .temperature(temperature)
703
894
  .additional_params(additional)
704
895
  .build();
@@ -711,20 +902,463 @@ async fn extract_summary_information(
711
902
  let json_value: Value = serde_json::from_str(&result).expect("REASON");
712
903
  //println!("Classification result:{}", json_value);
713
904
 
905
+ let final_llm_json;
714
906
  match llm_backend_type {
715
- llm_backend::Ollama() => json_value.to_string(),
907
+ llm_backend::Ollama() => {
908
+ let json_value2: Value = serde_json::from_str(&json_value["content"].to_string()).expect("REASON2");
909
+ let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON3");
910
+ final_llm_json = json_value3.to_string()
911
+ }
716
912
  llm_backend::Sj() => {
717
913
  let json_value2: Value =
718
914
  serde_json::from_str(&json_value[0]["generated_text"].to_string()).expect("REASON2");
719
915
  //println!("json_value2:{}", json_value2.as_str().unwrap());
720
- let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON2");
916
+ let json_value3: Value = serde_json::from_str(&json_value2.as_str().unwrap()).expect("REASON3");
721
917
  //println!("Classification result:{}", json_value3);
722
- json_value3.to_string()
918
+ final_llm_json = json_value3.to_string()
723
919
  }
724
920
  }
921
+ //println!("final_llm_json:{}", final_llm_json);
922
+ let final_validated_json = validate_summary_output(final_llm_json.clone(), db_vec, common_genes, ai_json);
923
+ final_validated_json
725
924
  }
726
925
  None => {
727
- panic!("Dataset db file needed for summary term extraction from user input")
926
+ panic!("summary chart train and test data is not defined in dataset JSON file")
927
+ }
928
+ }
929
+ }
930
+
931
+ fn get_summary_string() -> String {
932
+ "summary".to_string()
933
+ }
934
+
935
+ //const action: &str = &"summary";
936
+ //const geneExpression: &str = &"geneExpression";
937
+
938
/// Validated shape of the LLM's summary answer: `action` is expected to be
/// "summary" (serde falls back to `get_summary_string` when the key is
/// absent), `summaryterms` lists the variables to visualize, `filter`
/// optionally restricts the cohort, and `message` carries any note from the
/// LLM about terms it could not match.
#[derive(PartialEq, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
struct SummaryType {
    // Serde uses the default for deserialization; schemars uses the rename
    // for schema generation.
    #[serde(default = "get_summary_string")]
    #[schemars(rename = "action")]
    action: String,
    summaryterms: Vec<SummaryTerms>,
    filter: Option<Vec<FilterTerm>>,
    message: Option<String>,
}
949
+
950
+ impl SummaryType {
951
+ #[allow(dead_code)]
952
+ pub fn sort_summarytype_struct(&mut self) {
953
+ // This function is necessary for testing (test_ai.rs) to see if two variables of type "SummaryType" are equal or not. Without this a vector of two Summarytype holding the same values but in different order will be classified separately.
954
+ self.summaryterms.sort();
955
+
956
+ match self.filter.clone() {
957
+ Some(ref mut filterterms) => filterterms.sort(),
958
+ None => {}
959
+ }
960
+ }
961
+ }
962
+
963
/// One variable requested for the summary view: either a clinical field
/// from the dataset db or a gene whose expression is to be shown.
#[derive(PartialEq, Eq, Ord, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
enum SummaryTerms {
    // Lower-case / camelCase variant names mirror the JSON keys exchanged
    // with the LLM, hence the lint allowances.
    #[allow(non_camel_case_types)]
    clinical(String),
    #[allow(non_camel_case_types)]
    geneExpression(String),
}
970
+
971
+ impl PartialOrd for SummaryTerms {
972
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
973
+ match (self, other) {
974
+ (SummaryTerms::clinical(_), SummaryTerms::clinical(_)) => Some(std::cmp::Ordering::Equal),
975
+ (SummaryTerms::geneExpression(_), SummaryTerms::geneExpression(_)) => Some(std::cmp::Ordering::Equal),
976
+ (SummaryTerms::clinical(_), SummaryTerms::geneExpression(_)) => Some(std::cmp::Ordering::Greater),
977
+ (SummaryTerms::geneExpression(_), SummaryTerms::clinical(_)) => Some(std::cmp::Ordering::Greater),
978
+ }
979
+ }
980
+ }
981
+
982
/// One cohort filter produced by the LLM: either a categorical field/value
/// pair or a numeric field with optional lower/upper cutoffs.
#[derive(PartialEq, Eq, Ord, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
enum FilterTerm {
    Categorical(CategoricalFilterTerm),
    Numeric(NumericFilterTerm),
}
987
+
988
+ impl PartialOrd for FilterTerm {
989
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
990
+ match (self, other) {
991
+ (FilterTerm::Categorical(_), FilterTerm::Categorical(_)) => Some(std::cmp::Ordering::Equal),
992
+ (FilterTerm::Numeric(_), FilterTerm::Numeric(_)) => Some(std::cmp::Ordering::Equal),
993
+ (FilterTerm::Categorical(_), FilterTerm::Numeric(_)) => Some(std::cmp::Ordering::Greater),
994
+ (FilterTerm::Numeric(_), FilterTerm::Categorical(_)) => Some(std::cmp::Ordering::Greater),
995
+ }
996
+ }
997
+ }
998
+
999
/// Categorical filter: `term` must be a field of the dataset db and `value`
/// one of that field's values (both are checked in `validate_summary_output`).
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
struct CategoricalFilterTerm {
    term: String,  // field name from the dataset db
    value: String, // one of that field's values
}
1004
+
1005
/// Numeric filter: `term` names a db field; `greaterThan`/`lessThan` are
/// optional lower/upper cutoffs extracted from the user query.
#[derive(Debug, Clone, schemars::JsonSchema, serde::Serialize, serde::Deserialize)]
#[allow(non_snake_case)] // field names mirror the JSON keys exchanged with the LLM
struct NumericFilterTerm {
    term: String,
    greaterThan: Option<f32>,
    lessThan: Option<f32>,
}
1012
+
1013
+ impl PartialEq for NumericFilterTerm {
1014
+ fn eq(&self, other: &Self) -> bool {
1015
+ let greater_equality: bool;
1016
+ match (self.greaterThan, other.greaterThan) {
1017
+ (Some(a), Some(b)) => greater_equality = (a - b).abs() < 1e-6,
1018
+ (None, None) => greater_equality = true,
1019
+ _ => greater_equality = false,
1020
+ }
1021
+
1022
+ let less_equality: bool;
1023
+ match (self.lessThan, other.lessThan) {
1024
+ (Some(a), Some(b)) => less_equality = (a - b).abs() < 1e-6,
1025
+ (None, None) => less_equality = true,
1026
+ _ => less_equality = false,
1027
+ }
1028
+
1029
+ if greater_equality == true && less_equality == true {
1030
+ true
1031
+ } else {
1032
+ false
1033
+ }
1034
+ }
1035
+ }
1036
+
1037
// NOTE(review): `Eq` promises a full equivalence relation, but the
// epsilon-based `PartialEq` above is not transitive ((a-b).abs() < 1e-6 and
// (b-c).abs() < 1e-6 do not imply (a-c).abs() < 1e-6) — confirm this marker
// impl is acceptable for how these values are sorted/compared.
impl Eq for NumericFilterTerm {}
1038
+
1039
+ impl PartialOrd for NumericFilterTerm {
1040
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
1041
+ if self.greaterThan < other.greaterThan {
1042
+ Some(std::cmp::Ordering::Less)
1043
+ } else if self.greaterThan > other.greaterThan {
1044
+ Some(std::cmp::Ordering::Greater)
1045
+ } else if self.lessThan < other.lessThan {
1046
+ Some(std::cmp::Ordering::Less)
1047
+ } else if self.lessThan > other.lessThan {
1048
+ Some(std::cmp::Ordering::Greater)
1049
+ } else {
1050
+ Some(std::cmp::Ordering::Equal)
1051
+ }
1052
+ }
1053
+ }
1054
+
1055
impl Ord for NumericFilterTerm {
    // Total order required by the `sort()` calls on filter vectors.
    // `partial_cmp` for this type returns `Some` on every branch, so the
    // unwrap cannot panic.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.partial_cmp(other).unwrap()
    }
}
1060
+
1061
/// Validates the raw summary JSON returned by the LLM against the dataset db
/// (`db_vec`) and the genes found in the user's words (`common_genes`),
/// keeping only terms and filters that actually exist, and returns the
/// cleaned JSON as a string. Unresolvable items are reported through a
/// "message" field rather than causing an error.
///
/// Panics (expect) if `raw_llm_json` does not deserialize into `SummaryType`.
fn validate_summary_output(
    raw_llm_json: String,
    db_vec: Vec<DbRows>,
    common_genes: Vec<String>,
    ai_json: &AiJsonFormat,
) -> String {
    let json_value: SummaryType =
        serde_json::from_str(&raw_llm_json).expect("Did not get a valid JSON of type {action: summary, summaryterms:[{clinical: term1}, {geneExpression: gene}], filter:[{term: term1, value: value1}]} from the LLM");
    let mut message: String = String::from("");
    match json_value.message {
        Some(mes) => {
            message = message + &mes; // Append any message given by the LLM
        }
        None => {}
    }

    let mut new_json: Value; // New JSON value that will contain items of the final validated JSON
    if json_value.action != String::from("summary") {
        // NOTE(review): `new_json` stays JSON null here, so every
        // `as_object_mut()` below returns None — even this error message is
        // silently dropped and the function returns the string "null".
        message = message + &"Did not return a summary action";
        new_json = serde_json::json!(null);
    } else {
        new_json = serde_json::from_str(&"{\"action\":\"summary\"}").expect("Not a valid JSON");
    }

    // Pass 1: keep only summary terms that resolve against the db / gene
    // list; complaints accumulate in `message`.
    let mut validated_summary_terms = Vec::<SummaryTerms>::new();
    let mut summary_terms_tobe_removed = Vec::<SummaryTerms>::new();
    for sum_term in &json_value.summaryterms {
        match sum_term {
            SummaryTerms::clinical(clin) => {
                let term_verification = verify_json_field(clin, &db_vec);
                // NOTE(review): `Some(x.clone()).is_some()` is always true, so
                // this condition reduces to `correct_value.is_none()`; the
                // inner match still routes the not-found case correctly, but
                // the intent was probably
                // `term_verification.correct_field.is_some()`.
                if Some(term_verification.correct_field.clone()).is_some()
                    && term_verification.correct_value.clone().is_none()
                {
                    match term_verification.correct_field {
                        Some(tm) => validated_summary_terms.push(SummaryTerms::clinical(tm)),
                        None => {
                            message = message + &"\"" + &clin + &"\"" + &" not found in db.";
                        }
                    }
                // NOTE(review): both operands below are likewise always true;
                // this branch effectively runs whenever `correct_value` is
                // Some, i.e. the "field" actually matched a *value* of a field.
                } else if Some(term_verification.correct_field.clone()).is_some()
                    && Some(term_verification.correct_value.clone()).is_some()
                {
                    // NOTE(review): missing space after the value — renders as
                    // "<value>is a value of <field>."
                    message = message
                        + &term_verification.correct_value.unwrap()
                        + &"is a value of "
                        + &term_verification.correct_field.unwrap()
                        + &".";
                }
            }
            SummaryTerms::geneExpression(gene) => {
                match ai_json.hasGeneExpression {
                    true => {
                        // Keep the gene only if it appears verbatim among the
                        // genes matched from the user's words.
                        let mut num_gene_verification = 0;
                        for common_gene in &common_genes {
                            // Comparing predicted gene against the common gene
                            if common_gene == gene {
                                num_gene_verification += 1;
                                validated_summary_terms.push(SummaryTerms::geneExpression(String::from(gene)));
                            }
                        }

                        if num_gene_verification == 0 || common_genes.len() == 0 {
                            if message.to_lowercase().contains(&gene.to_lowercase()) { // Check if the LLM has already added the message, if not then add it
                            } else {
                                message = message + &"\"" + &gene + &"\"" + &" not found in genedb.";
                            }
                        }
                    }
                    false => {
                        let missing_gene_data: &str = "gene expression is not supported for this dataset";
                        if message.to_lowercase().contains(&missing_gene_data.to_lowercase()) { // Check if the LLM has already added the message, if not then add it
                        } else {
                            message = message + &"Gene expression not supported for this dataset";
                        }
                    }
                }
            }
        }
    }

    // Pass 2: validate filter terms, then mark any summary term that also
    // appears as a filter term for removal (a variable cannot be in both).
    match &json_value.filter {
        Some(filter_terms_array) => {
            let mut validated_filter_terms = Vec::<FilterTerm>::new();
            for parsed_filter_term in filter_terms_array {
                match parsed_filter_term {
                    FilterTerm::Categorical(categorical) => {
                        let term_verification = verify_json_field(&categorical.term, &db_vec);
                        // Confirm the filter value exists under that field.
                        let mut value_verification: Option<String> = None;
                        for item in &db_vec {
                            if &item.name == &categorical.term {
                                for val in &item.values {
                                    if &categorical.value == val {
                                        value_verification = Some(val.clone());
                                        break;
                                    }
                                }
                            }
                            if value_verification != None {
                                break;
                            }
                        }
                        if term_verification.correct_field.is_some() && value_verification.is_some() {
                            let verified_filter = CategoricalFilterTerm {
                                term: term_verification.correct_field.clone().unwrap(),
                                value: value_verification.clone().unwrap(),
                            };
                            let categorical_filter_term: FilterTerm = FilterTerm::Categorical(verified_filter);
                            validated_filter_terms.push(categorical_filter_term);
                        }
                        if term_verification.correct_field.is_none() {
                            message = message + &"\"" + &categorical.term + &"\" filter term not found in db";
                        }
                        if value_verification.is_none() {
                            message = message
                                + &"\""
                                + &categorical.value
                                + &"\" filter value not found for filter field \""
                                + &categorical.term
                                + "\" in db";
                        }
                    }
                    FilterTerm::Numeric(numeric) => {
                        // Numeric cutoffs are not range-checked; only the
                        // field name is validated against the db.
                        let term_verification = verify_json_field(&numeric.term, &db_vec);
                        if term_verification.correct_field.is_none() {
                            message = message + &"\"" + &numeric.term + &"\" filter term not found in db";
                        } else {
                            let numeric_filter_term: FilterTerm = FilterTerm::Numeric(numeric.clone());
                            validated_filter_terms.push(numeric_filter_term);
                        }
                    }
                }
            }

            // Flag summary terms whose name also appears as a filter term.
            for summary_term in &validated_summary_terms {
                match summary_term {
                    SummaryTerms::clinical(clinicial_term) => {
                        for filter_term in &validated_filter_terms {
                            match filter_term {
                                FilterTerm::Categorical(categorical) => {
                                    if &categorical.term == clinicial_term {
                                        summary_terms_tobe_removed.push(summary_term.clone());
                                    }
                                }
                                FilterTerm::Numeric(numeric) => {
                                    if &numeric.term == clinicial_term {
                                        summary_terms_tobe_removed.push(summary_term.clone());
                                    }
                                }
                            }
                        }
                    }
                    SummaryTerms::geneExpression(gene) => {
                        for filter_term in &validated_filter_terms {
                            match filter_term {
                                FilterTerm::Categorical(categorical) => {
                                    if &categorical.term == gene {
                                        summary_terms_tobe_removed.push(summary_term.clone());
                                    }
                                }
                                FilterTerm::Numeric(numeric) => {
                                    if &numeric.term == gene {
                                        summary_terms_tobe_removed.push(summary_term.clone());
                                    }
                                }
                            }
                        }
                    }
                }
            }

            if validated_filter_terms.len() > 0 {
                if let Some(obj) = new_json.as_object_mut() {
                    obj.insert(String::from("filter"), serde_json::json!(validated_filter_terms));
                }
            }
        }
        None => {}
    }

    // Removing terms that are found both in filter term as well summary
    let mut validated_summary_terms_final = Vec::<SummaryTerms>::new();

    for summary_term in &validated_summary_terms {
        let mut hit = 0;
        match summary_term {
            SummaryTerms::clinical(clinical_term) => {
                for summary_term2 in &summary_terms_tobe_removed {
                    match summary_term2 {
                        SummaryTerms::clinical(clinical_term2) => {
                            if clinical_term == clinical_term2 {
                                hit = 1;
                            }
                        }
                        SummaryTerms::geneExpression(gene2) => {
                            if clinical_term == gene2 {
                                hit = 1;
                            }
                        }
                    }
                }
            }
            SummaryTerms::geneExpression(gene) => {
                for summary_term2 in &summary_terms_tobe_removed {
                    match summary_term2 {
                        SummaryTerms::clinical(clinical_term2) => {
                            if gene == clinical_term2 {
                                hit = 1;
                            }
                        }
                        SummaryTerms::geneExpression(gene2) => {
                            if gene == gene2 {
                                hit = 1;
                            }
                        }
                    }
                }
            }
        }
        if hit == 0 {
            validated_summary_terms_final.push(summary_term.clone())
        }
    }

    if let Some(obj) = new_json.as_object_mut() {
        obj.insert(
            String::from("summaryterms"),
            serde_json::json!(validated_summary_terms_final),
        );
    }

    if message.len() > 0 {
        if let Some(obj) = new_json.as_object_mut() {
            // The `if let` ensures we only proceed if the top-level JSON is an object.
            // Append a new string field.
            obj.insert(String::from("message"), serde_json::json!(message));
        }
    }
    serde_json::to_string(&new_json).unwrap()
}
1300
+
1301
/// Result of checking an LLM-suggested field name against the dataset db.
#[derive(Debug, Clone)]
struct VerifiedField {
    correct_field: Option<String>, // Name of the matching db field, if any
    correct_value: Option<String>, // Set when the suggested "field" was actually a value of `correct_field`
    _probable_fields: Option<Vec<String>>, // Reserved for fuzzy/embedding matches; currently always None
}
1307
+
1308
+ fn verify_json_field(llm_field_name: &str, db_vec: &Vec<DbRows>) -> VerifiedField {
1309
+ // Check if llm_field_name exists or not in db name field
1310
+ let verified_result: VerifiedField;
1311
+ if db_vec.iter().any(|item| item.name == llm_field_name) {
1312
+ //println!("Found \"{}\" in db", llm_field_name);
1313
+ verified_result = VerifiedField {
1314
+ correct_field: Some(String::from(llm_field_name)),
1315
+ correct_value: None,
1316
+ _probable_fields: None,
1317
+ };
1318
+ } else {
1319
+ println!("Did not find \"{}\" in db", llm_field_name);
1320
+ // Check to see if llm_field_name exists as values under any of the fields
1321
+ let (search_field, search_val) = verify_json_value(llm_field_name, &db_vec);
1322
+
1323
+ match search_field {
1324
+ Some(x) => {
1325
+ verified_result = VerifiedField {
1326
+ correct_field: Some(String::from(x)),
1327
+ correct_value: search_val,
1328
+ _probable_fields: None,
1329
+ };
1330
+ }
1331
+ None => {
1332
+ // Incorrect field found neither in any of the fields nor any of the values. This will then invoke embedding match across all the fields and their corresponding values
1333
+
1334
+ let mut search_terms = Vec::<String>::new();
1335
+ search_terms.push(String::from(llm_field_name)); // Added the incorrect field item to the search
1336
+ verified_result = VerifiedField {
1337
+ correct_field: None,
1338
+ correct_value: None,
1339
+ _probable_fields: None,
1340
+ };
1341
+ }
1342
+ }
1343
+ }
1344
+ verified_result
1345
+ }
1346
+
1347
+ fn verify_json_value(llm_value_name: &str, db_vec: &Vec<DbRows>) -> (Option<String>, Option<String>) {
1348
+ let mut search_field: Option<String> = None;
1349
+ let mut search_val: Option<String> = None;
1350
+ for item in db_vec {
1351
+ for val in &item.values {
1352
+ if llm_value_name == val {
1353
+ search_field = Some(item.name.clone());
1354
+ search_val = Some(String::from(val));
1355
+ break;
1356
+ }
1357
+ }
1358
+ match search_field {
1359
+ Some(_) => break,
1360
+ None => {}
728
1361
  }
729
1362
  }
1363
+ (search_field, search_val)
730
1364
  }