kobai-sdk 0.2.8rc3__tar.gz → 0.2.8rc5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kobai-sdk might be problematic.
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/PKG-INFO +2 -2
- kobai_sdk-0.2.8rc5/kobai/ai_query.py +255 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/ai_rag.py +101 -60
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/tenant_client.py +100 -177
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/PKG-INFO +2 -2
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/SOURCES.txt +0 -1
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/requires.txt +1 -1
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/pyproject.toml +3 -3
- kobai_sdk-0.2.8rc3/kobai/ai_query.py +0 -114
- kobai_sdk-0.2.8rc3/kobai/llm_config.py +0 -40
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/LICENSE +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/MANIFEST.in +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/README.md +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/__init__.py +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/databricks_client.py +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/demo_tenant_client.py +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/spark_client.py +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/tenant_api.py +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/dependency_links.txt +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/top_level.txt +0 -0
- {kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/setup.cfg +0 -0
{kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kobai-sdk
-Version: 0.2.8rc3
+Version: 0.2.8rc5
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -222,7 +222,7 @@ Requires-Dist: azure-storage-blob
 Requires-Dist: langchain-core
 Requires-Dist: langchain-community
 Requires-Dist: langchain_openai
-Requires-Dist: …
+Requires-Dist: databricks_langchain
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
kobai_sdk-0.2.8rc5/kobai/ai_query.py (new file)
@@ -0,0 +1,255 @@
+from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+from sentence_transformers import SentenceTransformer, util
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.embeddings import Embeddings
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.runnables import RunnablePassthrough, RunnableLambda
+from langchain_core.vectorstores import InMemoryVectorStore
+
+from typing import Union, List
+
+
+MESSAGE_SYSTEM_TEMPLATE = """
+You are a data analyst tasked with answering questions based on a provided data set. Please answer the questions based on the provided context below. Make sure not to make any changes to the context, if possible, when preparing answers to provide accurate responses. If the answer cannot be found in context, just politely say that you do not know, do not try to make up an answer.
+When you receive a question from the user, answer only that one question in a concise manner. Do not elaborate with other questions.
+"""
+
+MESSAGE_AI_TEMPLATE = """
+The table information is as follows:
+{table_data}
+"""
+
+MESSAGE_USER_CONTEXT_TEMPLATE = """
+The context being provided is from a table named: {table_name}
+"""
+
+MESSAGE_USER_QUESTION_TEMPLATE = """
+{question}
+"""
+
+SIMPLE_PROMPT_TEMPLATE = f"""
+{MESSAGE_SYSTEM_TEMPLATE}
+
+{MESSAGE_USER_CONTEXT_TEMPLATE}
+
+{MESSAGE_AI_TEMPLATE}
+
+Question: {MESSAGE_USER_QUESTION_TEMPLATE}
+"""
+
+class QuestionRetriever(BaseRetriever):
+    #https://python.langchain.com/docs/how_to/custom_retriever/
+    #https://github.com/langchain-ai/langchain/issues/12304
+
+    documents: List[Document]
+    k: int = 5000
+
+    #def __init__(self, documents: List[Document], k: int = 5000):
+    #    self.documents = documents
+    #    self.k = k
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Sync implementations for retriever."""
+        matching_documents = []
+        for document in self.documents:
+            if len(matching_documents) > self.k:
+                return matching_documents
+
+            #if query.lower() in document.page_content.lower():
+            #    matching_documents.append(document)
+            matching_documents.append(document)
+        return matching_documents
+
+def format_docs(docs):
+    return "\n\n".join([d.page_content for d in docs])
+
+def input_only(inpt):
+    return inpt["question"]
+
+def followup_question(user_question, question_results, question_name, question_def, embedding_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, use_inmem_vectors=False):
+
+    row_texts = process_question_results(question_def, question_results)
+    question_documents = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]
+
+    if use_inmem_vectors:
+        question_retriever = InMemoryVectorStore.from_documents(question_documents, embedding=embedding_model).as_retriever(
+            search_kwargs={"k": 5}
+        )
+    else:
+        question_retriever = QuestionRetriever(documents=question_documents)
+
+    output_parser = StrOutputParser()
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            SystemMessagePromptTemplate.from_template(
+                MESSAGE_SYSTEM_TEMPLATE),
+            HumanMessagePromptTemplate.from_template(
+                MESSAGE_USER_CONTEXT_TEMPLATE),
+            AIMessagePromptTemplate.from_template(MESSAGE_AI_TEMPLATE),
+            HumanMessagePromptTemplate.from_template(
+                MESSAGE_USER_QUESTION_TEMPLATE)
+        ]
+    )
+
+    chain = (
+        {"table_name": RunnablePassthrough(), "table_data": RunnableLambda(input_only) | question_retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | chat_model
+        | output_parser
+    )
+    response = chain.invoke(
+        {
+            "table_name": question_name,
+            "question": user_question
+        }
+    )
+
+    return response
+
+def init_question_search_index(tenant_questions, emb_model):
+
+    q_ids = [q["id"] for q in tenant_questions]
+    q_descs = [q["description"] for q in tenant_questions]
+
+    if isinstance(emb_model, SentenceTransformer):
+        q_vectors = emb_model.encode(q_descs)
+    else:
+        q_vectors = emb_model.embed_documents(q_descs)
+
+    return {"ids": q_ids, "descs": q_descs, "vectors": q_vectors}
+
+
+def question_search(search_text: str, search_index, emb_model, k: int):
+    if isinstance(emb_model, SentenceTransformer):
+        search_vec = emb_model.encode(search_text)
+    else:
+        search_vec = emb_model.embed_query(search_text)
+    #search_vec = emb_model.encode(search_text)
+
+    matches = __top_vector_matches(search_vec, search_index["vectors"], top=k)
+
+    for mi, m in enumerate(matches):
+        matches[mi]["id"] = search_index["ids"][m["index"]]
+        matches[mi]["description"] = search_index["descs"][m["index"]]
+    return matches
+
+def __top_vector_matches(test_vec, options_list_vec, top=1):
+    scores_t = util.cos_sim(test_vec, options_list_vec)[0]
+    scores_l = scores_t.tolist()
+    scores_d = [{"index": i, "value": v} for i, v in enumerate(scores_l)]
+    sorted_d = sorted(scores_d, key=lambda i: i["value"], reverse=True)
+    top_d = sorted_d[0:top]
+    return top_d
+
+def process_question_results(question_def, question_results):
+
+    """
+    Returns a template to format each row in Kobai JSON question output into a format readable by LLMs.
+
+    Parameters:
+    question_def (any): Kobai standard JSON definition of question.
+    question_results (any): JSON representation of Kobai base question results.
+    """
+
+    concept_props = {}
+    concept_rels = {}
+
+    for ci in question_def["definition"]:
+        con_name = question_def["definition"][ci]["label"].replace("_", " ")
+        con_label = question_def["definition"][ci]["label"]
+        concept_props[ci] = {"name": con_name, "props": []}
+        for p in question_def["definition"][ci]["properties"]:
+            if p["hidden"] == False:
+                if len(p["aggregates"]) > 0:
+                    for a in p["aggregates"]:
+                        prop_column = con_label + "_" + p["label"] + "_" + a["type"]
+                        prop_name = p["label"].replace("_", " ")
+                        concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": a["type"]})
+                else:
+                    prop_column = con_label + "_" + p["label"]
+                    prop_name = p["label"].replace("_", " ")
+                    concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": None})
+        for r in question_def["definition"][ci]["relations"]:
+            prop_name = question_def["definition"][ci]["relations"][r]["label"].replace("_", " ")
+            for ri in question_def["definition"][ci]["relations"][r]["relationInstances"]:
+                if ci not in concept_rels:
+                    concept_rels[ci] = {"count": 0, "edges": []}
+                concept_rels[ci]["edges"].append({"src": ci, "dst": ri["relationTypeUri"], "name": prop_name})
+                concept_rels[ci]["count"] += 1
+
+
+    row_texts = {}
+
+    for ci, c in concept_props.items():
+        p_texts = []
+        for p in c["props"]:
+            if p["agg"] is None:
+                p_text = p["name"] + " " + "{" + p["column"] + "}"
+            else:
+                p_text = p["agg"] + " of " + p["name"] + " " + "{" + p["column"] + "}"
+            p_texts.append(p_text)
+        c_text = __get_article(c["name"]) + " " + c["name"]
+        if len(c["props"]) > 0:
+            c_text += " with " + __smart_comma_formatting(p_texts)
+        row_texts[ci] = c_text
+
+    max_src = ""
+    max_src_count = -1
+
+    for r in concept_rels:
+        if concept_rels[r]["count"] > max_src_count:
+            max_src_count = concept_rels[r]["count"]
+            max_src = r
+
+
+    concept_order = [max_src]
+    for t in concept_rels[max_src]["edges"]:
+        concept_order.append(t["dst"])
+
+    for c in concept_props:
+        if c not in concept_order:
+            concept_order.append(c)
+
+    row_template = concept_order[0] + " is connected to " + " and connected to ".join(concept_order[1:])
+
+    for c in row_texts:
+        row_template = row_template.replace(c, row_texts[c])
+
+    row_template = row_template[0].upper() + row_template[1:] + "."
+
+    row_texts = []
+    for row in question_results:
+        row_text = row_template
+        for col in row:
+            row_text = row_text.replace("{" + col + "}", str(row[col]))
+        row_texts.append(row_text)
+    #data = "\n".join(row_texts)
+    return row_texts
+    #return data
+
+def __smart_comma_formatting(items):
+    if items == None:
+        return ""
+    match len(items):
+        case 0:
+            return ""
+        case 1:
+            return items[0]
+        case 2:
+            return items[0] + " and " + items[1]
+        case _:
+            return ", ".join(items[0: -1]) + " and " + items[-1]
+
+def __get_article(label):
+    if label[0:1].lower() in ["a", "e", "i", "o", "u"]:
+        return "an"
+    else:
+        return "a"
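The ranking behind the new question_search path is __top_vector_matches: cosine similarity between the query vector and every stored question-description vector, sorted descending, top-k kept. A minimal standalone sketch of that logic, with toy 3-dimensional vectors standing in for real embeddings:

from sentence_transformers import util

def top_vector_matches(test_vec, options_list_vec, top=1):
    # util.cos_sim returns a 1 x N tensor of cosine similarities
    scores = util.cos_sim(test_vec, options_list_vec)[0].tolist()
    scored = [{"index": i, "value": v} for i, v in enumerate(scores)]
    return sorted(scored, key=lambda s: s["value"], reverse=True)[:top]

query_vec = [1.0, 0.0, 0.0]
question_vecs = [
    [0.9, 0.1, 0.0],  # near-duplicate of the query
    [0.0, 1.0, 0.0],  # orthogonal
    [0.7, 0.7, 0.0],  # partial overlap
]
print(top_vector_matches(query_vec, question_vecs, top=2))
# highest-scoring indices come back first: index 0, then index 2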
{kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/ai_rag.py
@@ -1,4 +1,5 @@
-from kobai import tenant_client
+from kobai import tenant_api
+from pyspark.sql import SparkSession
 
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
 from pyspark.sql import functions as F
@@ -11,12 +12,43 @@ from langchain_community.document_loaders import PySparkDataFrameLoader
 from langchain import hub
 from langchain_core.output_parsers import StrOutputParser
 
+import urllib
+import urllib.parse
 
+class AIContext:
 
+    schema: str
+    spark_session: SparkSession
+    model_id: str
+    tenant_json: str
+    api_client: tenant_api.TenantAPI
 
+def ai_run_question_remote(tc: AIContext, question_id, dynamic_filters: dict = None):
 
-…
+    """
+    Returns JSON formatted result of Kobai question.
 
+    Parameters:
+    question_id (int): Numeric identifier of Kobai question.
+    """
+
+    uri = '/data-svcs/api/query/' + str(question_id) + '/execute?' #'/data-svcs/api/query/4518/solution/9/execute/tabular?'
+
+    queryParams = {'jsontype': 'tableau'}
+
+    if bool(dynamic_filters):
+        queryParams.update(dynamic_filters)
+
+    uri += urllib.parse.urlencode(queryParams)
+
+    json = {
+        'simulations': {'concepts': {}, 'data': None}
+    }
+    response = tc.api_client._TenantAPI__run_post(uri, json)
+
+    return response.json()
+
+def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=None, use_questions=False):
     """
     Extract Semantic Data from Graph to Delta Table
 
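The new ai_run_question_remote assembles its endpoint by URL-encoding the fixed jsontype parameter together with any caller-supplied dynamic filters before POSTing. A stdlib-only sketch of that URI construction (the filter key is a made-up example; 4518 echoes the id in the commented-out URI above):

import urllib.parse

question_id = 4518                    # hypothetical question id
dynamic_filters = {"region": "EMEA"}  # hypothetical filter

uri = '/data-svcs/api/query/' + str(question_id) + '/execute?'
query_params = {'jsontype': 'tableau'}
if bool(dynamic_filters):             # an empty dict adds nothing
    query_params.update(dynamic_filters)
uri += urllib.parse.urlencode(query_params)
print(uri)  # /data-svcs/api/query/4518/execute?jsontype=tableau&region=EMEA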
@@ -26,24 +58,27 @@ def generate_sentences(tc: tenant_client.TenantClient, conc…
     concept_white_list ([str]) OPTIONAL: A list of Domain and Concept names for extraction.
     use_questions (bool) OPTIONAL: Extract facts from published Kobai questions.
     """
-…
-    if tc.spark_client is None:
-…
-…
-    ss = tc.…
+
+    #if tc.spark_client is None:
+    #    return None
+
+    ss = tc.spark_session
 
     print("Getting Tenant Config")
-    tenant_json = tc.…
+    tenant_json = tc.tenant_json
 
-    concepts = __get_concept_metadata(…
+    concepts = __get_concept_metadata(
+        tenant_json, tc.schema, tc.model_id, concept_white_list)
 
     print("Dropping and Recreating the RAG Table")
     ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
 
     print("Generating Extraction SQL")
     sql_statements = []
-    sql_statements.extend(__generate_sentence_sql_concept_literals(
-…
+    sql_statements.extend(__generate_sentence_sql_concept_literals(
+        concepts, tc.schema, tc.model_id))
+    sql_statements.extend(__generate_sentence_sql_concept_relations(
+        concepts, tc.schema, tc.model_id))
 
     print("Running the Extraction")
     for sql_statement in sql_statements:
@@ -55,14 +90,16 @@ def generate_sentences(tc: tenant_client.TenantClient, conc…
     if replica_schema is not None:
         print("Replicating Schema")
         ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
-        ss.sql(__replicate_to_catalog_sql(…
+        ss.sql(__replicate_to_catalog_sql(
+            tc.schema, replica_schema, tc.model_id))
 
-…
-def __generate_sentences_from_questions(tc: tenant_client.TenantClient):
+
+def __generate_sentences_from_questions(tc: AIContext):
+    ss = tc.spark_session
 
     print("Getting Question Data")
 
-    tenant_json = tc.…
+    tenant_json = tc.tenant_json
 
     published_queries = []
     for p in tenant_json["publishedAPIs"]:
@@ -73,22 +110,21 @@ def __generate_sentences_from_questions(tc: tenant_client.TenantClient):
         if q["id"] in published_queries:
             question_names[q["id"]] = q["description"]
 
-…
-…
-…
-…
+    schema_v = StructType([
+        StructField("sentence", StringType(), True),
+        StructField("query_id", StringType(), True)
+    ])
 
     sentences = []
     for p in published_queries:
-        output = tc…
+        output = ai_run_question_remote(tc, p)
         for r in output:
             sentence = f"For {question_names[p]}: "
             for c in r:
                 sentence += f"The {c.replace('_', ' ')} is {r[c]}. "
             sentences.append([sentence, p])
 
-…
-    sentences_df = ss.createDataFrame(sentences, schemaV)
+    sentences_df = ss.createDataFrame(sentences, schema_v)
     sentences_df = sentences_df.select(
         F.col("sentence").alias("sentence"),
         F.col("query_id").alias("concept_id"),
@@ -96,19 +132,17 @@ def __generate_sentences_from_questions(tc: tenant_client.TenantClient):
     )
 
     schema = tc.schema
-…
+
     view_name = f"rag_{tc.model_id}_question_sentences"
     sentences_df.createOrReplaceTempView(view_name)
 
     full_sql = f"INSERT INTO {schema}.rag_{tc.model_id} (content, concept_id, type)"
     full_sql += f" SELECT sentence, concept_id, type FROM {view_name}"
-…
-    ss.sql(full_sql)
+
+    ss.sql(full_sql)
 
 
-def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTransformer, replica_schema=None):
-…
+def encode_to_delta_local(tc: AIContext, st_model: SentenceTransformer, replica_schema=None):
     """
     Encode Semantic Data to Vectors in Delta Table
 
@@ -118,10 +152,10 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran…
     replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
     """
 
-    if tc.spark_client is None:
-…
-…
-    ss = tc.…
+    #if tc.spark_client is None:
+    #    return None
+
+    ss = tc.spark_session
 
     schema = tc.schema
     if replica_schema is not None:
@@ -133,7 +167,6 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran…
     num_records = sentences_df.count()
     query_batch_size = 100000
 
-…
     for x in range(0, num_records, query_batch_size):
         print(f"Running Batch Starting at {x}")
         sentences_sql = f" SELECT id, content FROM {schema}.rag_{tc.model_id} ORDER BY id LIMIT {str(query_batch_size)} OFFSET {str(x)}"
@@ -141,25 +174,27 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran…
         content_list = [r["content"] for r in sentences_df.collect()]
         id_list = [r["id"] for r in sentences_df.collect()]
 
-        vector_list = st_model.encode(…
+        vector_list = st_model.encode(
+            content_list, normalize_embeddings=True, show_progress_bar=True)
 
-…
-            StructField("id",IntegerType(),True),
+        schema_v = StructType([
+            StructField("id", IntegerType(), True),
             StructField("vector", ArrayType(FloatType()), False)
         ])
 
-        updated_list = [[r[0], r[1].tolist()]
-…
+        updated_list = [[r[0], r[1].tolist()]
+                        for r in zip(id_list, vector_list)]
+        updated_df = ss.createDataFrame(updated_list, schema_v)
 
         target_table = DeltaTable.forName(ss, f"{schema}.rag_{tc.model_id}")
 
         target_table.alias("t") \
-…
+            .merge(
                 updated_df.alias("s"),
                 't.id = s.id'
             ) \
-…
-…
+            .whenMatchedUpdate(set={"vector": "s.vector"}) \
+            .execute()
 
     ss.sql(f"""
         CREATE FUNCTION IF NOT EXISTS {schema}.cos_sim(a ARRAY<FLOAT>, b ARRAY<FLOAT>)
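The vector writeback in encode_to_delta_local uses the Delta Lake merge API: staged (id, vector) rows are matched against the RAG table on id and only the vector column is updated. A sketch of that pattern, assuming a SparkSession `ss` with Delta configured; the table name is a placeholder:

from delta.tables import DeltaTable

# Toy staged rows; in the module these come from st_model.encode batches.
updated_df = ss.createDataFrame(
    [[1, [0.1, 0.2]], [2, [0.3, 0.4]]],
    "id INT, vector ARRAY<FLOAT>",
)

target = DeltaTable.forName(ss, "my_catalog.my_db.rag_123")  # placeholder table
(target.alias("t")
    .merge(updated_df.alias("s"), "t.id = s.id")    # join staged rows by id
    .whenMatchedUpdate(set={"vector": "s.vector"})  # refresh only the vector column
    .execute())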
@@ -171,8 +206,8 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran…
     $$
     """)
 
-def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
 
+def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
     """
     Run a RAG query using vectors in Delta table.
 
@@ -184,25 +219,26 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform…
     k (int) OPTIONAL: The number of RAG documents to retrieve.
     replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
     """
-…
+
     schema = tc.schema
     if replica_schema is not None:
         schema = replica_schema
 
-    if tc.spark_client is None:
-…
-…
-…
-    ss = tc.…
+    #if tc.spark_client is None:
+    #    print("Instantiate Spark Client First")
+    #    return None
+
+    ss = tc.spark_session
 
     if isinstance(emb_model, SentenceTransformer):
-        vector_list = emb_model.encode(…
+        vector_list = emb_model.encode(
+            question, normalize_embeddings=True).tolist()
     elif isinstance(emb_model, Embeddings):
         vector_list = emb_model.embed_query(question)
     else:
         print("Invalid Embedding Model Type")
         return None
-…
+
     if not isinstance(chat_model, BaseChatModel):
         print("Invalid Chat Model Type")
         return None
@@ -216,7 +252,7 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform…
         ORDER BY score DESC
         LIMIT {k}
     """)
-…
+
     loader = PySparkDataFrameLoader(ss, results, page_content_column="content")
     documents = loader.load()
     docs_content = "\n\n".join(doc.page_content for doc in documents)
@@ -236,20 +272,24 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform…
 
     return response
 
+
 def __create_rag_table_sql(schema, model_id):
     return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
-…
+
+
 def __replicate_to_catalog_sql(base_schema, target_schema, model_id):
     move_sql = f"INSERT INTO {target_schema}.rag_{model_id} (content, concept_id, type)"
     move_sql += f" SELECT content, concept_id, type FROM {base_schema}.rag_{model_id}"
     return move_sql
 
+
+
 def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
     statements = []
     for con in concepts:
         sql = f"'This is a {con['label']}. '"
         sql += " || 'It is identified by ' || cid._plain_conceptid || '. '"
-…
+
         sql_from = f"(SELECT _conceptid, _plain_conceptid FROM {con['prop_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
         for prop in con["properties"]:
 
@@ -257,15 +297,16 @@ def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
             sql_from += f" ON cid._conceptid = {prop['label']}._conceptid"
             sql_from += f" AND {prop['label']}.type = 'l'"
             sql_from += f" AND {prop['label']}.name = '{prop['name']}'"
-…
+
             sql += f" || 'The {prop['label']} is ' || ifnull(any_value({prop['label']}.value) IGNORE NULLS, 'unknown') || '. '"
-…
+
         full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
         full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid, cid._plain_conceptid"
-…
+
         statements.append(full_sql)
     return statements
 
+
 def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
     statements = []
     for con in concepts:
@@ -280,13 +321,13 @@ def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
         sql += f" || ' has a relationship called {rel['label']} that connects it to one or more {rel['target_con_label']} identified by '"
         sql += " || concat_ws(', ', array_agg(cid._plain_conceptid)) || '. '"
 
-…
         full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
         full_sql += f" SELECT {sql} content, rel._conceptid concept_id, 'e' type FROM {sql_from} GROUP BY rel._conceptid, rel._plain_conceptid"
 
         statements.append(full_sql)
     return statements
 
+
 def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
     target_concept_labels = {}
     target_table_names = {}
@@ -297,7 +338,7 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
             "prop": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
             "con": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c"
         }
-…
+
     concepts = []
     for d in tenant_json["domains"]:
         for c in d["concepts"]:
@@ -306,7 +347,7 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
                 con_props.append({
                     "label": col["label"],
                     "name": f"{model_id}/{d['name']}/{c['label']}#{col['label']}"
-…
+                })
             con_rels = []
             for rel in c["relations"]:
                 if whitelist is not None and target_concept_labels[rel["relationTypeUri"]] not in whitelist:
@@ -328,7 +369,7 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
                 "parents": con_parents,
                 "prop_table_name": target_table_names[c["uri"]]["prop"],
                 "con_table_name": target_table_names[c["uri"]]["con"]
-…
+            })
 
     for ci, c in enumerate(concepts):
         if len(c["parents"]) > 0:
@@ -343,4 +384,4 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
             continue
         out_concepts.append(c)
 
-    return out_concepts
+    return out_concepts
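rag_delta scores stored vectors in SQL with the cos_sim UDF created above and keeps the top k rows; only the ORDER BY score DESC / LIMIT tail of that query is visible in this diff, so the SELECT below is a hedged reconstruction with placeholder names, not the shipped statement:

# Illustrative retrieval query in the shape rag_delta runs.
schema, model_id, k = "my_catalog.my_db", "123", 5
vector_list = [0.1, 0.2, 0.3]  # output of emb_model.encode(question)

vector_sql = "ARRAY(" + ", ".join(f"CAST({v} AS FLOAT)" for v in vector_list) + ")"
retrieval_sql = f"""
    SELECT content, {schema}.cos_sim(vector, {vector_sql}) AS score
    FROM {schema}.rag_{model_id}
    ORDER BY score DESC
    LIMIT {k}
"""
# results = ss.sql(retrieval_sql)  # then wrapped by PySparkDataFrameLoader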
{kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai/tenant_client.py
@@ -7,7 +7,14 @@ import requests
 from azure.identity import DeviceCodeCredential
 from pyspark.sql import SparkSession
 
-from . import …
+from langchain_community.chat_models import ChatDatabricks
+from databricks_langchain import DatabricksEmbeddings
+from sentence_transformers import SentenceTransformer
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.embeddings import Embeddings
+from typing import Union
+
+from . import spark_client, databricks_client, ai_query, tenant_api, ai_rag
 
 class TenantClient:
 
@@ -39,6 +46,10 @@ class TenantClient:
         self.model_id = ""
         self.proxies = None
         self.ssl_verify = True
+        self.question_search_index = None
+        self.embedding_model = None
+        self.chat_model = None
+
 
     def update_proxy(self, proxies: any):
         self.proxies = proxies
@@ -99,10 +110,11 @@ class TenantClient:
 
         self.__api_init_session()
         self.__set_tenant_solutionid()
+        self.init_ai_components()
 
         print("Authentication Successful.")
 
-    def …
+    def authenticate_browser_token(self, access_token):
 
         """
         Authenticate the TenantClient with the Kobai instance. Returns nothing, but stores bearer token in client.
@@ -116,6 +128,8 @@ class TenantClient:
 
         self.__api_init_session()
         self.__set_tenant_solutionid()
+        self.init_ai_components()
+
 
         print("Authentication Successful.")
 
@@ -410,212 +424,114 @@ class TenantClient:
         return return_questions
 
     ########################################
-    # …
+    # RAG Functions
     ########################################
 
-    def …
-…
+    def get_ai_context(self):
+        context = ai_rag.AIContext()
+        context.model_id = self.model_id
+        context.schema = self.schema
+        context.tenant_json = self.get_tenant_config()
+        context.spark_session = self.spark_client.spark_session
+        context.api_client = self.api_client
+        return context
+
+    def rag_generate_sentences(self, replica_schema=None, concept_white_list=None, use_questions=False):
         """
-…
+        Extract Semantic Data from Graph to Delta Table
 
         Parameters:
-…
-…
-…
-        override_model (LangChain BaseLanguageModel) OPTIONAL: Langchain LLM or ChatModel runnable.
-        use_simple_prompt (bool) OPTIONAL: Uses ChatPrompt when True, Prompt when False.
-        debug (bool) OPTIONAL: Set Langchain debug for troubleshooting.
+        replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+        concept_white_list ([str]) OPTIONAL: A list of Domain and Concept names for extraction.
+        use_questions (bool) OPTIONAL: Extract facts from published Kobai questions.
         """
-…
-        question_def = self.get_question(question_id)
-        question_name = question_def["description"]
-…
-        row_texts = []
-        row_template = self.process_question_results(question_def)
-        for row in question_results:
-            row_text = row_template
-            for col in row:
-                row_text = row_text.replace("{" + col + "}", str(row[col]))
-            row_texts.append(row_text)
-        data = "\n".join(row_texts)
+        ai_rag.generate_sentences(self.get_ai_context(), replica_schema=replica_schema, concept_white_list=concept_white_list, use_questions=use_questions)
 
-…
-…
-…
-            None,
-            override_model=override_model,
-        )
+    def rag_encode_to_delta_local(self, st_model: SentenceTransformer, replica_schema=None):
+        """
+        Encode Semantic Data to Vectors in Delta Table
 
-…
+        Parameters:
+        st_model (SentenceTransformer): A sentence_transformers model to use for encoding.
+        replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+        """
+        ai_rag.encode_to_delta_local(self.get_ai_context(), st_model=st_model, replica_schema=replica_schema)
 
+    def rag_delta(self, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
         """
-…
+        Run a RAG query using vectors in Delta table.
 
         Parameters:
-…
-…
-…
-…
-…
-…
-…
-            con_name = question_def["definition"][ci]["label"].replace("_", " ")
-            con_label = question_def["definition"][ci]["label"]
-            concept_props[ci] = {"name": con_name, "props": []}
-            for p in question_def["definition"][ci]["properties"]:
-                if p["hidden"] == False:
-                    if len(p["aggregates"]) > 0:
-                        for a in p["aggregates"]:
-                            prop_column = con_label + "_" + p["label"] + "_" + a["type"]
-                            prop_name = p["label"].replace("_", " ")
-                            concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": a["type"]})
-                    else:
-                        prop_column = con_label + "_" + p["label"]
-                        prop_name = p["label"].replace("_", " ")
-                        concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": None})
-            for r in question_def["definition"][ci]["relations"]:
-                prop_name = question_def["definition"][ci]["relations"][r]["label"].replace("_", " ")
-                for ri in question_def["definition"][ci]["relations"][r]["relationInstances"]:
-                    if ci not in concept_rels:
-                        concept_rels[ci] = {"count": 0, "edges": []}
-                    concept_rels[ci]["edges"].append({"src": ci, "dst": ri["relationTypeUri"], "name": prop_name})
-                    concept_rels[ci]["count"] += 1
-
-
-        row_texts = {}
-
-        for ci, c in concept_props.items():
-            p_texts = []
-            for p in c["props"]:
-                if p["agg"] is None:
-                    p_text = p["name"] + " " + "{" + p["column"] + "}"
-                else:
-                    p_text = p["agg"] + " of " + p["name"] + " " + "{" + p["column"] + "}"
-                p_texts.append(p_text)
-            c_text = self.__get_article(c["name"]) + " " + c["name"]
-            if len(c["props"]) > 0:
-                c_text += " with " + self.__smart_comma_formatting(p_texts)
-            row_texts[ci] = c_text
+        emb_model (UNION[SentenceTransformer, Embeddings]): A sentence_transformers or langchain embedding model to use for encoding the query.
+        chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
+        question (str): The user's query.
+        k (int) OPTIONAL: The number of RAG documents to retrieve.
+        replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+        """
+        ai_rag.rag_delta(self.get_ai_context(), emb_model=emb_model, chat_model=chat_model, question=question, k=k, replica_schema=replica_schema)
 
-…
-…
+    ########################################
+    # AI Functions
+    ########################################
 
-…
-…
-…
-                max_src = r
+    def followup_question(self, user_question, question_id=None, use_inmem_vectors=False):
+        """
+        Use LLM to answer question in the context of a Kobai Studio question.
 
+        Parameters:
+        user_question (str): A natural language question to apply.
+        question_id (int) OPTIONAL: A Kobai question to use as a data source. Otherwise, an appropriate question will be automatically found.
+        use_inmem_vectors (bool) OPTIONAL: For large query sets, this secondary processing can reduce the data required in the context window.
+        """
 
-…
-        for t in concept_rels[max_src]["edges"]:
-            concept_order.append(t["dst"])
+        if question_id is None:
 
-…
-            if c not in concept_order:
-                concept_order.append(c)
+            suggestions = self.question_search(user_question, k=1)
 
-…
+            question_id = suggestions[0]["id"]
 
-…
-            row_text = row_text.replace(c, row_texts[c])
+        question_results = self.run_question_remote(question_id)
 
-…
-…
-…
-    def process_question_results2(self, question_def):
+        question_def = self.get_question(question_id)
+        question_name = question_def["description"]
 
+        return ai_query.followup_question(user_question, question_results, question_name, question_def, self.embedding_model, self.chat_model, use_inmem_vectors=use_inmem_vectors)
+
+    def init_ai_components(self, embedding_model: Union[SentenceTransformer, Embeddings] = None, chat_model: BaseChatModel = None):
         """
-…
+        Set Chat and Embedding models for AI functions to use. If no arguments provided, Databricks hosted services are used.
 
         Parameters:
-…
+        embedding_model (Union[SentenceTransformer, Embeddings]) OPTIONAL: A sentence_transformer or Langchain Embedding model.
+        chat_model (BaseChatModel) OPTIONAL: A Langchain BaseChatModel chat model.
         """
 
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-                        for a in p["aggregates"]:
-                            prop_column = con_label + "_" + p["label"] + "_" + a["type"]
-                            prop_name = p["label"].replace("_", " ")
-                            concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": a["type"]})
-                    else:
-                        prop_column = con_label + "_" + p["label"]
-                        prop_name = p["label"].replace("_", " ")
-                        concept_props[ci]["props"].append({"column": prop_column, "name": prop_name, "agg": None})
-            for r in question_def["definition"][ci]["relations"]:
-                prop_name = question_def["definition"][ci]["relations"][r]["label"].replace("_", " ")
-                for ri in question_def["definition"][ci]["relations"][r]["relationInstances"]:
-                    if ci not in concept_rels:
-                        concept_rels[ci] = {"count": 0, "edges": []}
-                    concept_rels[ci]["edges"].append({"src": ci, "dst": ri["relationTypeUri"], "name": prop_name})
-                    concept_rels[ci]["count"] += 1
-
-
-        row_texts = {}
-
-        for ci, c in concept_props.items():
-            p_texts = []
-            for p in c["props"]:
-                if p["agg"] is None:
-                    p_text = p["name"] + " " + "{" + p["column"] + "}"
-                else:
-                    p_text = p["agg"] + " of " + p["name"] + " " + "{" + p["column"] + "}"
-                p_texts.append(p_text)
-            c_text = self.__get_article(c["name"]) + " " + c["name"]
-            if len(c["props"]) > 0:
-                c_text += " with " + self.__smart_comma_formatting(p_texts)
-            row_texts[ci] = c_text
+        if embedding_model is not None:
+            self.embedding_model = embedding_model
+        else:
+            #self.embedding_model = SentenceTransformer("baai/bge-large-en-v1.5")
+            self.embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+
+        if chat_model is not None:
+            self.chat_model = chat_model
+        else:
+            self.chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct")
 
-…
-        max_src_count = -1
+        self.question_search_index = ai_query.init_question_search_index(self.list_questions(), self.embedding_model)
 
-…
-…
-…
-                max_src = r
+    def question_search(self, search_text, k: int = 1):
+        """
+        Retrieve metadata about Kobai Questions based on user search text.
 
+        Parameters:
+        search_text (str): Text to compare against question names.
+        k (int) OPTIONAL: Number of top-k matches to return.
+        """
 
-…
-…
-            concept_order.append(t["dst"])
+        question_list = ai_query.question_search(search_text, self.question_search_index, self.embedding_model, k)
+        return question_list
 
-        for c in concept_props:
-            if c not in concept_order:
-                concept_order.append(c)
 
-        row_text = concept_order[0] + " is connected to " + " and connected to ".join(concept_order[1:])
-
-        for c in row_texts:
-            row_text = row_text.replace(c, row_texts[c])
-
-        row_text = row_text[0].upper() + row_text[1:] + "."
-        return row_text
-
-    def __smart_comma_formatting(self, items):
-        if items == None:
-            return ""
-        match len(items):
-            case 0:
-                return ""
-            case 1:
-                return items[0]
-            case 2:
-                return items[0] + " and " + items[1]
-            case _:
-                return ", ".join(items[0: -1]) + " and " + items[-1]
-
-    def __get_article(self, label):
-        if label[0:1].lower() in ["a", "e", "i", "o", "u"]:
-            return "an"
-        else:
-            return "a"
 
     ########################################
     # Tenant Questions
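Taken together, the hunk above reduces the AI flow to a few calls on TenantClient. A usage sketch, assuming `tc` is an already-authenticated client (authentication now triggers init_ai_components() automatically, defaulting to the Databricks-hosted databricks-bge-large-en embedding and databricks-dbrx-instruct chat endpoints):

# Optionally swap in your own models first:
# tc.init_ai_components(embedding_model=my_embeddings, chat_model=my_chat_model)

# Rank published questions against free text; entries carry id/description/value.
matches = tc.question_search("monthly revenue by region", k=3)
for m in matches:
    print(m["id"], m["description"], m["value"])

# Ask a follow-up; with no question_id the closest question is located,
# executed remotely, and its rows become the LLM context.
answer = tc.followup_question("Which region had the highest revenue?")
print(answer)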
@@ -943,7 +859,14 @@ class TenantClient:
         response = self.api_client._TenantAPI__run_get('/data-svcs/model/domain/questions/count')
         for q in response.json()["drafts"]:
             question_list.append({"id": q["id"], "description": q["description"]})
-        …
+
+        visited_ids = []
+        unique_question_list = []
+        for q in question_list:
+            if q["id"] not in visited_ids:
+                visited_ids.append(q["id"])
+                unique_question_list.append(q)
+        return unique_question_list
 
     def get_question_id(self, label, domain_label=None):
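The new tail of list_questions is a first-seen-wins dedup on question id. A dict keyed by id gives the same order-preserving result with constant-time lookups; shown only to illustrate the semantics:

seen = {}
for q in question_list:
    seen.setdefault(q["id"], q)  # first occurrence of each id wins
unique_question_list = list(seen.values())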
{kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/kobai_sdk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kobai-sdk
-Version: 0.2.8rc3
+Version: 0.2.8rc5
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -222,7 +222,7 @@ Requires-Dist: azure-storage-blob
 Requires-Dist: langchain-core
 Requires-Dist: langchain-community
 Requires-Dist: langchain_openai
-Requires-Dist: …
+Requires-Dist: databricks_langchain
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
{kobai_sdk-0.2.8rc3 → kobai_sdk-0.2.8rc5}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "kobai-sdk"
-version = "0.2.8rc3"
+version = "0.2.8rc5"
 description = "A package that enables interaction with a Kobai tenant."
 readme = "README.md"
 authors = [{ name = "Ryan Oattes", email = "ryan@kobai.io" }]
@@ -26,8 +26,8 @@ dependencies = [
     "langchain-core",
     "langchain-community",
     "langchain_openai",
-    "…",
-]
+    "databricks_langchain"
+]
 requires-python = ">=3.11"
 
 [project.optional-dependencies]
kobai_sdk-0.2.8rc3/kobai/ai_query.py (deleted)
@@ -1,114 +0,0 @@
-from kobai import llm_config
-from langchain_core.prompts import ChatPromptTemplate, PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-from langchain_community.chat_models import ChatDatabricks
-from langchain.globals import set_debug
-from azure.identity import DefaultAzureCredential, get_bearer_token_provider
-from langchain_openai import AzureChatOpenAI
-
-MESSAGE_SYSTEM_TEMPLATE = """
-You are a data analyst tasked with answering questions based on a provided data set. Please answer the questions based on the provided context below. Make sure not to make any changes to the context, if possible, when preparing answers to provide accurate responses. If the answer cannot be found in context, just politely say that you do not know, do not try to make up an answer.
-When you receive a question from the user, answer only that one question in a concise manner. Do not elaborate with other questions.
-"""
-
-MESSAGE_AI_TEMPLATE = """
-The table information is as follows:
-{table_data}
-"""
-
-MESSAGE_USER_CONTEXT_TEMPLATE = """
-The context being provided is from a table named: {table_name}
-"""
-
-MESSAGE_USER_QUESTION_TEMPLATE = """
-{question}
-"""
-
-SIMPLE_PROMPT_TEMPLATE = f"""
-{MESSAGE_SYSTEM_TEMPLATE}
-
-{MESSAGE_USER_CONTEXT_TEMPLATE}
-
-{MESSAGE_AI_TEMPLATE}
-
-Question: {MESSAGE_USER_QUESTION_TEMPLATE}
-"""
-
-def followup_question(question, data, question_name, llm_config:llm_config, override_model=None):
-
-    """
-    Use LLM to answer question in the context of provided data.
-
-    Parameters:
-    question (str): A natural language question to apply.
-    data (str): Simple dictionary-like structured data.
-    question_name (str): Dataset name for context.
-    llm_config (LLMConfig): User set LLM configurations and some default ones.
-    override_model (LangChain BaseLanguageModel) OPTIONAL: Langchain LLM or ChatModel runnable.
-    """
-
-    set_debug(llm_config.debug)
-
-    # If override model is provided, then use the override model as chat model.
-    if override_model is not None:
-        chat_model=override_model
-    elif llm_config.llm_provider == "databricks":
-        chat_model = ChatDatabricks(
-            endpoint = llm_config.endpoint,
-            temperature = llm_config.temperature,
-            max_tokens = llm_config.max_tokens,
-        )
-    elif llm_config.llm_provider == "azure_openai":
-        if(llm_config.api_key is None):
-            # Authenticate through AZ Login or through service principal
-            # Instantiate the AzureChatOpenAI model
-            chat_model = AzureChatOpenAI(
-                azure_endpoint=llm_config.endpoint,
-                azure_deployment=llm_config.deployment,
-                azure_ad_token=llm_config.aad_token,
-                openai_api_version=llm_config.api_version,
-                temperature = llm_config.temperature,
-                max_tokens = llm_config.max_tokens,
-            )
-        else:
-            # Authenticate through API Key
-            chat_model = AzureChatOpenAI(
-                api_key = llm_config.api_key,
-                azure_endpoint=llm_config.endpoint,
-                azure_deployment=llm_config.deployment,
-                openai_api_version=llm_config.api_version,
-                temperature = llm_config.temperature,
-                max_tokens = llm_config.max_tokens,
-            )
-    else:
-        chat_model = ChatDatabricks(
-            endpoint = llm_config.endpoint,
-            temperature = llm_config.temperature,
-            max_tokens = llm_config.max_tokens,
-        )
-
-    if llm_config.use_simple_prompt:
-        prompt = PromptTemplate.from_template(SIMPLE_PROMPT_TEMPLATE)
-    else:
-        prompt = ChatPromptTemplate.from_messages(
-            [
-                SystemMessagePromptTemplate.from_template(MESSAGE_SYSTEM_TEMPLATE),
-                HumanMessagePromptTemplate.from_template(MESSAGE_USER_CONTEXT_TEMPLATE),
-                AIMessagePromptTemplate.from_template(MESSAGE_AI_TEMPLATE),
-                HumanMessagePromptTemplate.from_template(MESSAGE_USER_QUESTION_TEMPLATE)
-            ]
-        )
-
-    output_parser = StrOutputParser()
-
-    chain = prompt | chat_model | output_parser
-
-    response = chain.invoke(
-        {
-            "table_name": question_name,
-            "table_data": str(data),
-            "question": question
-        }
-    )
-
-    return response
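For contrast, the removed 0.2.8rc3 entry point took pre-stringified data plus an LLMConfig. A sketch of how it was driven (values illustrative):

from kobai import ai_query, llm_config  # module layout as of 0.2.8rc3

config = llm_config.LLMConfig(endpoint="databricks-dbrx-instruct", max_tokens=150)
answer = ai_query.followup_question(
    "Which region had the highest revenue?",  # question
    "{'region': 'EMEA', 'revenue': 100}",     # data, already stringified
    "Revenue by Region",                      # question_name
    config,
)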
kobai_sdk-0.2.8rc3/kobai/llm_config.py (deleted)
@@ -1,40 +0,0 @@
-import os
-from azure.identity import DefaultAzureCredential
-
-class LLMConfig:
-
-    def __init__(self, deployment: str = None, api_key: str = None, max_tokens: int = 150, temperature: float = 0.1, endpoint: str = "databricks-dbrx-instruct", use_simple_prompt: bool = False, debug: bool = False,
-                 llm_provider: str = "databricks", api_version: str = "2024-02-15-preview"):
-
-        """
-        Initialize the LLMConfig
-        Parameters:
-        deployment (str): LLM against which the query is run.
-        api_key (str): The api_key used for authenticating with the LLM.
-        max_tokens (int): Maximum number of tokens that the model can generate in a single response.
-        temperature (float): Parameter that controls the randomness and creativity of the text generated by the LLM.
-        endpoint (str): The endpoint of the LLM to connect to.
-        debug (bool) OPTIONAL: Set Langchain debug for troubleshooting.
-        use_simple_prompt (bool) OPTIONAL: Simple Prompt template for a language model.
-        llm_provider (str): Provider of the LLM.
-        api_version (str): version of the LLM API that the application will use for making requests.
-        """
-
-        self.endpoint = endpoint
-        self.deployment = deployment
-        self.api_key = api_key
-        self.api_version = api_version
-        self.use_simple_prompt = use_simple_prompt
-        self.debug = debug
-        self.llm_provider = llm_provider
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-
-
-    def get_azure_ad_token(self):
-        # Get the Azure Credential
-        credential = DefaultAzureCredential()
-        # Set the API type to `azure_ad`
-        os.environ["OPENAI_API_TYPE"] = "azure_ad"
-        # Set the API_KEY to the token from the Azure credential
-        self.aad_token = credential.get_token("https://cognitiveservices.azure.com/.default").token
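With LLMConfig gone, callers now construct LangChain models directly and register them through init_ai_components. A sketch of the two provider paths LLMConfig used to wrap (endpoint and deployment names are placeholders):

from langchain_community.chat_models import ChatDatabricks
from langchain_openai import AzureChatOpenAI

chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct", temperature=0.1, max_tokens=150)

# Azure OpenAI equivalent of the removed azure_openai branch:
# chat_model = AzureChatOpenAI(
#     azure_endpoint="https://example.openai.azure.com",
#     azure_deployment="my-gpt-deployment",
#     api_key="...",
#     openai_api_version="2024-02-15-preview",
# )

# tc.init_ai_components(chat_model=chat_model)  # tc: an authenticated TenantClient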