kobai-sdk 0.2.8rc1-py3-none-any.whl → 0.2.8rc3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



kobai/ai_rag.py CHANGED
@@ -1,6 +1,32 @@
 from kobai import tenant_client
 
-def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
+from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
+from pyspark.sql import functions as F
+from sentence_transformers import SentenceTransformer
+from delta import DeltaTable
+from typing import Union
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.embeddings import Embeddings
+from langchain_community.document_loaders import PySparkDataFrameLoader
+from langchain import hub
+from langchain_core.output_parsers import StrOutputParser
+
+
+
+
+
+def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, concept_white_list=None, use_questions=False):
+
+    """
+    Extract Semantic Data from Graph to Delta Table
+
+    Parameters:
+    tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
+    replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+    concept_white_list ([str]) OPTIONAL: A list of Domain and Concept names for extraction.
+    use_questions (bool) OPTIONAL: Extract facts from published Kobai questions.
+    """
+
     if tc.spark_client is None:
         return None
 
@@ -9,7 +35,7 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
     print("Getting Tenant Config")
     tenant_json = tc.get_tenant_config()
 
-    concepts = __get_concept_metadata(tenant_json, tc.schema, tc.model_id)
+    concepts = __get_concept_metadata(tenant_json, tc.schema, tc.model_id, concept_white_list)
 
     print("Dropping and Recreating the RAG Table")
     ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
@@ -23,11 +49,193 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
     for sql_statement in sql_statements:
         ss.sql(sql_statement)
 
+    if use_questions:
+        __generate_sentences_from_questions(tc)
+
     if replica_schema is not None:
         print("Replicating Schema")
         ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
         ss.sql(__replicate_to_catalog_sql(tc.schema, replica_schema, tc.model_id))
 
+def __generate_sentences_from_questions(tc: tenant_client.TenantClient):
+    ss = tc.spark_client.spark_session
+
+    print("Getting Question Data")
+
+    tenant_json = tc.get_tenant_config()
+
+    published_queries = []
+    for p in tenant_json["publishedAPIs"]:
+        published_queries.append(p["queryId"])
+
+    question_names = {}
+    for q in tenant_json["queries"]:
+        if q["id"] in published_queries:
+            question_names[q["id"]] = q["description"]
+
+    schemaV = StructType([
+        StructField("sentence",StringType(),True),
+        StructField("query_id", StringType(), True)
+    ])
+
+    sentences = []
+    for p in published_queries:
+        output = tc.run_question_remote(p)
+        for r in output:
+            sentence = f"For {question_names[p]}: "
+            for c in r:
+                sentence += f"The {c.replace('_', ' ')} is {r[c]}. "
+            sentences.append([sentence, p])
+
+
+    sentences_df = ss.createDataFrame(sentences, schemaV)
+    sentences_df = sentences_df.select(
+        F.col("sentence").alias("sentence"),
+        F.col("query_id").alias("concept_id"),
+        F.lit("q").alias("type"),
+    )
+
+    schema = tc.schema
+
+    view_name = f"rag_{tc.model_id}_question_sentences"
+    sentences_df.createOrReplaceTempView(view_name)
+
+    full_sql = f"INSERT INTO {schema}.rag_{tc.model_id} (content, concept_id, type)"
+    full_sql += f" SELECT sentence, concept_id, type FROM {view_name}"
+
+    ss.sql(full_sql)
+
+
+
+def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTransformer, replica_schema=None):
+
+    """
+    Encode Semantic Data to Vectors in Delta Table
+
+    Parameters:
+    tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
+    st_model (SentenceTransformer): A sentence_transformers model to use for encoding.
+    replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+    """
+
+    if tc.spark_client is None:
+        return None
+
+    ss = tc.spark_client.spark_session
+
+    schema = tc.schema
+    if replica_schema is not None:
+        schema = replica_schema
+
+    sentences_sql = f"SELECT content FROM {schema}.rag_{tc.model_id}"
+    sentences_df = ss.sql(sentences_sql)
+
+    num_records = sentences_df.count()
+    query_batch_size = 100000
+
+
+    for x in range(0, num_records, query_batch_size):
+        print(f"Running Batch Starting at {x}")
+        sentences_sql = f" SELECT id, content FROM {schema}.rag_{tc.model_id} ORDER BY id LIMIT {str(query_batch_size)} OFFSET {str(x)}"
+        sentences_df = ss.sql(sentences_sql)
+        content_list = [r["content"] for r in sentences_df.collect()]
+        id_list = [r["id"] for r in sentences_df.collect()]
+
+        vector_list = st_model.encode(content_list, normalize_embeddings=True, show_progress_bar=True)
+
+        schemaV = StructType([
+            StructField("id",IntegerType(),True),
+            StructField("vector", ArrayType(FloatType()), False)
+        ])
+
+        updated_list = [[r[0], r[1].tolist()] for r in zip(id_list, vector_list)]
+        updated_df = ss.createDataFrame(updated_list, schemaV)
+
+        target_table = DeltaTable.forName(ss, f"{schema}.rag_{tc.model_id}")
+
+        target_table.alias("t") \
+            .merge(
+                updated_df.alias("s"),
+                't.id = s.id'
+            ) \
+            .whenMatchedUpdate(set = {"vector": "s.vector"}) \
+            .execute()
+
+    ss.sql(f"""
+        CREATE FUNCTION IF NOT EXISTS {schema}.cos_sim(a ARRAY<FLOAT>, b ARRAY<FLOAT>)
+        RETURNS FLOAT
+        LANGUAGE PYTHON
+        AS $$
+        import numpy as np
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+        $$
+    """)
+
+def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+
+    """
+    Run a RAG query using vectors in Delta table.
+
+    Parameters:
+    tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
+    emb_model (UNION[SentenceTransformer, Embeddings]): A sentence_transformers or langchain embedding model to use for encoding the query.
+    chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
+    question (str): The user's query.
+    k (int) OPTIONAL: The number of RAG documents to retrieve.
+    replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+    """
+
+    schema = tc.schema
+    if replica_schema is not None:
+        schema = replica_schema
+
+    if tc.spark_client is None:
+        print("Instantiate Spark Client First")
+        return None
+
+    ss = tc.spark_client.spark_session
+
+    if isinstance(emb_model, SentenceTransformer):
+        vector_list = emb_model.encode(question, normalize_embeddings=True).tolist()
+    elif isinstance(emb_model, Embeddings):
+        vector_list = emb_model.embed_query(question)
+    else:
+        print("Invalid Embedding Model Type")
+        return None
+
+    if not isinstance(chat_model, BaseChatModel):
+        print("Invalid Chat Model Type")
+        return None
+
+    vector_list = [str(x) for x in vector_list]
+    vector_sql = ", ".join(vector_list)
+
+    results = ss.sql(f"""
+        SELECT content, reduce(zip_with(vector, cast(array({vector_sql}) as array<float>), (x,y) -> x*y), float(0.0), (acc,x) -> acc + x) score
+        FROM {schema}.rag_{tc.model_id}
+        ORDER BY score DESC
+        LIMIT {k}
+    """)
+
+    loader = PySparkDataFrameLoader(ss, results, page_content_column="content")
+    documents = loader.load()
+    docs_content = "\n\n".join(doc.page_content for doc in documents)
+
+    prompt = hub.pull("rlm/rag-prompt")
+
+    output_parser = StrOutputParser()
+
+    chain = prompt | chat_model | output_parser
+
+    response = chain.invoke(
+        {
+            "context": docs_content,
+            "question": question
+        }
+    )
+
+    return response
+
 def __create_rag_table_sql(schema, model_id):
     return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
 
@@ -40,74 +248,99 @@ def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
     statements = []
     for con in concepts:
         sql = f"'This is a {con['label']}. '"
-        sql += " || 'It is identified by ' || split(cid._conceptid,'#')[1] || '. '"
+        sql += " || 'It is identified by ' || cid._plain_conceptid || '. '"
 
-        sql_from = f"{con['con_table_name']} cid"
+        sql_from = f"(SELECT _conceptid, _plain_conceptid FROM {con['prop_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
        for prop in con["properties"]:
 
-            sql_from += f" INNER JOIN {con['prop_table_name']} AS {prop['label']}"
+            sql_from += f" LEFT JOIN {con['prop_table_name']} AS {prop['label']}"
             sql_from += f" ON cid._conceptid = {prop['label']}._conceptid"
             sql_from += f" AND {prop['label']}.type = 'l'"
             sql_from += f" AND {prop['label']}.name = '{prop['name']}'"
 
-            sql += f" || 'The {prop['label']} is ' || any_value({prop['label']}.value) IGNORE NULLS || '. '"
-
-        full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
-        full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid"
-
-        statements.append(full_sql)
-        #test_df = spark.sql(full_sql)
+            sql += f" || 'The {prop['label']} is ' || ifnull(any_value({prop['label']}.value) IGNORE NULLS, 'unknown') || '. '"
+
+        full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
+        full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid, cid._plain_conceptid"
+
+        statements.append(full_sql)
     return statements
 
 def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
     statements = []
     for con in concepts:
-
-        sql_from = f"{con['prop_table_name']} "
         for rel in con["relations"]:
+            sql_from = f"{con['prop_table_name']} rel"
+            sql_from += f" INNER JOIN (SELECT _conceptid, _plain_conceptid FROM {rel['target_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
+            sql_from += f" ON rel.value = cid._conceptid"
+            sql_from += f" AND rel.type = 'r'"
+            sql_from += f" AND rel.name = '{rel['name']}'"
 
-            sql = f"'The {con['label']} identified by ' || split(_conceptid,'#')[1]"
+            sql = f"'The {con['label']} identified by ' || rel._plain_conceptid"
             sql += f" || ' has a relationship called {rel['label']} that connects it to one or more {rel['target_con_label']} identified by '"
-            sql += " || concat_ws(', ', array_agg(split(value, '#')[1])) || '. '"
+            sql += " || concat_ws(', ', array_agg(cid._plain_conceptid)) || '. '"
 
 
             full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
-            full_sql += f" SELECT {sql} content, _conceptid concept_id, 'e' type FROM {sql_from} GROUP BY _conceptid"
+            full_sql += f" SELECT {sql} content, rel._conceptid concept_id, 'e' type FROM {sql_from} GROUP BY rel._conceptid, rel._plain_conceptid"
 
             statements.append(full_sql)
     return statements
 
-def __get_concept_metadata(tenant_json, schema, model_id):
+def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
     target_concept_labels = {}
+    target_table_names = {}
     for d in tenant_json["domains"]:
         for c in d["concepts"]:
             target_concept_labels[c["uri"]] = d["name"] + " " + c["label"]
-
+            target_table_names[c["uri"]] = {
+                "prop": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
+                "con": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c"
+            }
+
     concepts = []
-
     for d in tenant_json["domains"]:
         for c in d["concepts"]:
             con_props = []
             for col in c["properties"]:
                 con_props.append({
-                    #"col_name": d["name"] + "_" + c["label"] + "_" + col["label"],
                     "label": col["label"],
                     "name": f"{model_id}/{d['name']}/{c['label']}#{col['label']}"
                 })
             con_rels = []
            for rel in c["relations"]:
+                if whitelist is not None and target_concept_labels[rel["relationTypeUri"]] not in whitelist:
+                    continue
                 con_rels.append({
                     "label": rel["label"],
                     "name": f"{model_id}/{d['name']}/{c['label']}#{rel['label']}",
-                    "target_con_label": target_concept_labels[rel["relationTypeUri"]]
+                    "target_con_label": target_concept_labels[rel["relationTypeUri"]],
+                    "target_table_name": target_table_names[rel["relationTypeUri"]]["prop"]
                 })
+            con_parents = []
+            for p in c["inheritedConcepts"]:
+                con_parents.append(p)
             concepts.append({
+                "uri": c["uri"],
                 "label": d["name"] + " " + c["label"],
-                #"id_column": d["name"] + "_" + c["label"],
                 "relations": con_rels,
                 "properties": con_props,
-                #"table_name": "data_" + k.model_id + "_" + d["name"] + "_" + c["label"] + "_w",
-                "prop_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
-                "con_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c",
+                "parents": con_parents,
+                "prop_table_name": target_table_names[c["uri"]]["prop"],
+                "con_table_name": target_table_names[c["uri"]]["con"]
             })
-    return concepts
+
+    for ci, c in enumerate(concepts):
+        if len(c["parents"]) > 0:
+            for p in c["parents"]:
+                for a in concepts:
+                    if a["uri"] == p:
+                        concepts[ci]["properties"].extend(a["properties"])
+
+    out_concepts = []
+    for c in concepts:
+        if whitelist is not None and c["label"] not in whitelist:
+            continue
+        out_concepts.append(c)
+
+    return out_concepts
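
Taken together, the additions to kobai/ai_rag.py form a small Delta-backed RAG pipeline: generate_sentences extracts graph facts (and, optionally, published question results) into the rag_{model_id} table, encode_to_delta_local fills that table's vector column with sentence embeddings, and rag_delta scores the stored vectors against an embedded query and hands the top-k rows to a LangChain chat model. The sketch below shows how the three functions might be chained; the TenantClient construction, the Spark-client attachment, the whitelist value, and the specific embedding and chat models are illustrative assumptions, not part of this diff.

# Hypothetical usage sketch (not from the package): chaining the new helpers.
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI  # any LangChain BaseChatModel should work

from kobai import tenant_client, ai_rag

tc = tenant_client.TenantClient(...)  # assumed: create and authenticate per the SDK docs
# tc.spark_client must be attached first; the helpers return None if it is missing.

# 1. Extract concept, relation, and (optionally) question facts into {schema}.rag_{model_id}.
ai_rag.generate_sentences(tc, concept_white_list=["Sales Customer"], use_questions=True)

# 2. Encode the extracted sentences and MERGE the vectors back into the Delta table.
st_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model choice
ai_rag.encode_to_delta_local(tc, st_model)

# 3. Retrieve the top-k matching sentences and answer with a chat model.
chat = ChatOpenAI(model="gpt-4o-mini")  # assumed chat model choice
answer = ai_rag.rag_delta(tc, st_model, chat, "Which customers placed the largest orders?", k=5)
print(answer)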

kobai_sdk-0.2.8rc1.dist-info/METADATA → kobai_sdk-0.2.8rc3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kobai-sdk
-Version: 0.2.8rc1
+Version: 0.2.8rc3
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -211,7 +211,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.9
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pyspark
@@ -222,6 +222,7 @@ Requires-Dist: azure-storage-blob
 Requires-Dist: langchain-core
 Requires-Dist: langchain-community
 Requires-Dist: langchain_openai
+Requires-Dist: sentence_transformers
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"

kobai_sdk-0.2.8rc1.dist-info/RECORD → kobai_sdk-0.2.8rc3.dist-info/RECORD
@@ -1,14 +1,14 @@
 kobai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kobai/ai_query.py,sha256=fMTcfj-6Ma3FRB08VYEDj8PwOEOtFGsJHyQrha5yvPg,4512
-kobai/ai_rag.py,sha256=y_N7qVu8HfUHHZPIyQSO7L995RBeNtDhva7U5HBHSfY,5063
+kobai/ai_rag.py,sha256=TtUbUcSN9mIsauGyS_nw8j58T9jEd4OFiAwNvzo-rr8,13593
 kobai/databricks_client.py,sha256=fyqqMly2Qm0r1AHWsQjkYeNsDdH0G1JSgTkF9KJ55qA,2118
 kobai/demo_tenant_client.py,sha256=wlNc-bdI2wotRXo8ppUOalv4hYdBlek_WzJNARZV-AE,9293
 kobai/llm_config.py,sha256=ZFx81cUAOHYZgRoTkTY-utQYaWYlmR8773ZJpj74C1A,1900
 kobai/spark_client.py,sha256=opM_F-4Ut5Hq5zZjWMuLvUps9sDULvyPNZHXGL8dW1k,776
 kobai/tenant_api.py,sha256=9U6UbxpaAb-kpbuADXx3kbkNKaOzYy0I-GGwbpiCCOk,4212
 kobai/tenant_client.py,sha256=AyJ5R2oukEv3q1dcItpojvTUVp5-gwUKvyGjofjBKyc,41821
-kobai_sdk-0.2.8rc1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-kobai_sdk-0.2.8rc1.dist-info/METADATA,sha256=nZTb2svQk01wT32zBZDPKgeYnSAx22YER5YLHEIjoAQ,19167
-kobai_sdk-0.2.8rc1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kobai_sdk-0.2.8rc1.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
-kobai_sdk-0.2.8rc1.dist-info/RECORD,,
+kobai_sdk-0.2.8rc3.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+kobai_sdk-0.2.8rc3.dist-info/METADATA,sha256=f75oEdxRWLrr0bVmH1OvIlvc0KS9TrpNTh65eTlKX6k,19205
+kobai_sdk-0.2.8rc3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kobai_sdk-0.2.8rc3.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
+kobai_sdk-0.2.8rc3.dist-info/RECORD,,