kobai-sdk 0.2.8rc13__tar.gz → 0.3.5rc6__tar.gz
- {kobai_sdk-0.2.8rc13/kobai_sdk.egg-info → kobai_sdk-0.3.5rc6}/PKG-INFO +62 -56
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/README.md +57 -52
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_query.py +25 -22
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_rag.py +17 -18
- kobai_sdk-0.3.5rc6/kobai/genie.py +194 -0
- kobai_sdk-0.3.5rc6/kobai/mobi.py +733 -0
- kobai_sdk-0.3.5rc6/kobai/mobi_config.py +19 -0
- kobai_sdk-0.3.5rc6/kobai/ms_authenticate.py +66 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_api.py +5 -2
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_client.py +213 -101
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6/kobai_sdk.egg-info}/PKG-INFO +62 -56
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/SOURCES.txt +4 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/requires.txt +2 -2
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/pyproject.toml +3 -3
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/LICENSE +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/MANIFEST.in +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/__init__.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/databricks_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/demo_tenant_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/spark_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/dependency_links.txt +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/top_level.txt +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/setup.cfg +0 -0
{kobai_sdk-0.2.8rc13/kobai_sdk.egg-info → kobai_sdk-0.3.5rc6}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: kobai-sdk
-Version: 0.2.8rc13
+Version: 0.3.5rc6
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -221,14 +221,15 @@ Requires-Dist: azure-identity
 Requires-Dist: azure-storage-blob
 Requires-Dist: langchain-core
 Requires-Dist: langchain-community
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: langchain-classic
+Requires-Dist: delta-spark
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: isort; extra == "dev"
 Requires-Dist: pip-tools; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
+Dynamic: license-file
 
 # Kobai SDK for Python (Alpha)
 
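The runtime dependency set swaps two requirements (their old names are truncated in this view) for `langchain-classic` and `delta-spark`, and the metadata gains a `Dynamic: license-file` field. One way to confirm the declared dependencies of an installed copy, using only the standard library:

```python
# Inspect the installed package's declared dependencies (Requires-Dist).
# Assumes kobai-sdk 0.3.5rc6 is installed in the current environment.
from importlib.metadata import metadata, requires

print(metadata("kobai-sdk")["Version"])   # 0.3.5rc6
for req in requires("kobai-sdk") or []:
    print(req)  # should include langchain-classic and delta-spark in this release
```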
@@ -247,21 +248,50 @@ from kobai import tenant_client, spark_client, databricks_client
 
 schema = 'main.demo'
 uri = 'https://demo.kobai.io'
-tenant_id = '1'
 tenant_name = 'My Demo Tenant'
-
-k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+k = tenant_client.TenantClient(tenant_name, uri, schema)
 ```
 
 2. Authenticate with the Kobai instance:
+Authentication can be performed using several methods, such as the device code flow, the on-behalf-of flow, or a browser-based token.
+
+#### Authentication via device code
+Step 1: Obtain an access token from IDM (Identity and Access Management).
 
 ```python
-
+from kobai import ms_authenticate
+
 tenant_id = 'your_Entra_directory_id_here'
+client_id = 'your_Entra_app_id_here'
+
+access_token = ms_authenticate.device_code(tenant_id, client_id)
+```
+
+Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known).
 
-
+```python
+tenants = k.get_tenants(id_token=access_token)
+print(tenants)
 ```
 
+Step 3: Authenticate with Kobai for the specific tenant using the IDM access token.
+
+```python
+kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+```
+
+At this point, authentication to the Kobai tenant is complete.
+
+#### Authentication via browser token
+
+```python
+k.use_browser_token(access_token="KOBAI_ACESS_TOKEN_FROM_BROWSER")
+```
+
+#### Authentication via on-behalf-of flow
+Sample code demonstrating authentication via the on-behalf-of flow can be provided on request.
+
 3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:
 
 ```python
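The new `kobai/ms_authenticate.py` module (66 added lines) is not expanded in this diff. For orientation only, here is a minimal sketch of what an Entra device-code helper like `device_code(tenant_id, client_id)` typically wraps, assuming the MSAL library and an assumed default scope; the actual module may differ:

```python
# Hypothetical sketch of an Entra device-code helper, assuming the msal package.
# The real kobai/ms_authenticate.py is not shown in this diff and may differ.
import msal

def device_code(tenant_id: str, client_id: str, scopes=None) -> str:
    scopes = scopes or ["User.Read"]  # assumed default scope
    app = msal.PublicClientApplication(
        client_id,
        authority=f"https://login.microsoftonline.com/{tenant_id}",
    )
    flow = app.initiate_device_flow(scopes=scopes)
    if "user_code" not in flow:
        raise RuntimeError(f"Failed to start device flow: {flow}")
    print(flow["message"])  # tells the user which URL to open and which code to enter
    result = app.acquire_token_by_device_flow(flow)  # blocks until the user signs in
    if "access_token" not in result:
        raise RuntimeError(result.get("error_description", "authentication failed"))
    return result["access_token"]
```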
@@ -303,68 +333,41 @@ kobai_query_name = "Set ownership"
 question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
 ```
 
-3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using
-
-#### Using Azure OpenAI
-
-###### Authentication Methods:
-
-1. ApiKey
-
-```python
-from kobai import ai_query, llm_config
-import json
-
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-2. Azure Active Directory Authentication
+3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using the user-provided chat and embedding models.
 
-
-
+#### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+Initialize the AI components by specifying the embedding and chat models, then proceed with follow-up questions for interactive engagement.
 
 ```python
-from
+from databricks_langchain import DatabricksEmbeddings
+from langchain_community.chat_models import ChatDatabricks
 import json
 
-
-
-
-
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-#### Using Databricks (Default Configuration)
-
-```python
-from kobai import ai_query, llm_config
-import json
+# Choose the embedding and chat model of your choice from Databricks model serving and initialize.
+embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
 
 followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
-####
+#### Using Azure OpenAI Embeddings and Chat Models
 
 ```python
-from kobai import ai_query, llm_config
-import json
 from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+import json
 
 followup_question = "Which owner owns the most sets?"
 
-
+embedding_model = AzureOpenAIEmbeddings(
+    model="text-embedding-3-small",
+    azure_endpoint="https://kobaipoc.openai.azure.com/",
+    api_key="YOUR_API_KEY",
+    openai_api_version="2023-05-15"
+)
 
 chat_model = AzureChatOpenAI(
     azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",

@@ -373,7 +376,10 @@ openai_api_version="2024-02-15-preview",
     temperature=0.5,
     max_tokens=150,)
 
-
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+followup_question = "Which theme has the most sets?"
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/README.md

@@ -15,21 +15,50 @@ from kobai import tenant_client, spark_client, databricks_client
@@ -71,68 +100,41 @@ kobai_query_name = "Set ownership"
@@ -141,7 +143,10 @@ openai_api_version="2024-02-15-preview",

The three README.md hunks are verbatim copies of the corresponding PKG-INFO hunks above (PKG-INFO embeds README.md as the package description); only the line offsets in the hunk headers differ.
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_query.py

@@ -1,8 +1,6 @@
 from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 
-from sentence_transformers import SentenceTransformer, util
-
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_core.documents import Document
@@ -10,8 +8,9 @@ from langchain_core.retrievers import BaseRetriever
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda
 from langchain_core.vectorstores import InMemoryVectorStore
+import numpy as np
 
-from typing import
+from typing import List
 
 
 MESSAGE_SYSTEM_TEMPLATE = """
@@ -73,7 +72,7 @@ def format_docs(docs):
 def input_only(inpt):
     return inpt["question"]
 
-def followup_question(user_question, question_results, question_name, question_def, embedding_model:
+def followup_question(user_question, question_results, question_name, question_def, embedding_model: Embeddings, chat_model: BaseChatModel, use_inmem_vectors=False, k=50):
 
     row_texts = process_question_results(question_def, question_results)
     question_documents = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]
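The rewritten `followup_question` takes LangChain `Embeddings` and `BaseChatModel` objects directly (plus a `use_inmem_vectors` switch and a retrieval depth `k`). Judging only from the module's imports (`InMemoryVectorStore`, `Document`), the in-memory path presumably follows the standard LangChain retrieval pattern; a minimal, self-contained sketch of that pattern, not the package's actual code:

```python
# Minimal sketch of RAG-style retrieval over question-result rows, assuming
# only public LangChain APIs; kobai's followup_question may differ in detail.
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding  # stand-in model
from langchain_core.vectorstores import InMemoryVectorStore

row_texts = ["owner Alice owns set Castle", "owner Bob owns set Ship"]
docs = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]

store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=256))
store.add_documents(docs)

# Retrieve the k most relevant rows to ground the chat model's answer.
retriever = store.as_retriever(search_kwargs={"k": 2})
context_docs = retriever.invoke("Which owner owns the most sets?")
print([d.page_content for d in context_docs])
```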
@@ -118,22 +117,13 @@ def init_question_search_index(tenant_questions, emb_model):
 
     q_ids = [q["id"] for q in tenant_questions]
     q_descs = [q["description"] for q in tenant_questions]
-
-    if isinstance(emb_model, SentenceTransformer):
-        q_vectors = emb_model.encode(q_descs)
-    else:
-        q_vectors = emb_model.embed_documents(q_descs)
-
+    q_vectors = emb_model.embed_documents(q_descs)
     return {"ids": q_ids, "descs": q_descs, "vectors": q_vectors}
 
 
 def question_search(search_text: str, search_index, emb_model, k: int):
-    if isinstance(emb_model, SentenceTransformer):
-        search_vec = emb_model.encode(search_text)
-    else:
-        search_vec = emb_model.embed_query(search_text)
+    search_vec = emb_model.embed_query(search_text)
     #search_vec = emb_model.encode(search_text)
-
     matches = __top_vector_matches(search_vec, search_index["vectors"], top=k)
 
     for mi, m in enumerate(matches):
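With the SentenceTransformer branch gone, both helpers assume the LangChain `Embeddings` interface (`embed_documents` for the corpus, `embed_query` for the probe). A sketch of the index structure `init_question_search_index` returns, using a deterministic fake embedding model as a stand-in for a real one:

```python
# Sketch: the {"ids", "descs", "vectors"} index shape, with a stand-in model.
from langchain_core.embeddings import DeterministicFakeEmbedding

emb_model = DeterministicFakeEmbedding(size=8)

tenant_questions = [
    {"id": 1, "description": "Set ownership by owner"},
    {"id": 2, "description": "Sets grouped by theme"},
]

q_descs = [q["description"] for q in tenant_questions]
index = {
    "ids": [q["id"] for q in tenant_questions],
    "descs": q_descs,
    "vectors": emb_model.embed_documents(q_descs),  # one vector per description
}
print(len(index["vectors"]), len(index["vectors"][0]))  # 2 vectors of size 8
```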
@@ -142,13 +132,25 @@ def question_search(search_text: str, search_index, emb_model, k: int):
     return matches
 
 def __top_vector_matches(test_vec, options_list_vec, top=1):
-
-
-
-
-
+    # Normalize the test vector
+    test_vec_norm = test_vec / np.linalg.norm(test_vec)
+    # Normalize the option vectors
+    options_norm = options_list_vec / np.linalg.norm(options_list_vec, axis=1, keepdims=True)
+
+    # Compute cosine similarity (dot product of normalized vectors)
+    cosine_similarities = np.dot(options_norm, test_vec_norm)
+
+    # Get indexes and similarity scores as dict
+    scores_d = [{"index": i, "value": float(v)} for i, v in enumerate(cosine_similarities)]
+
+    # Sort dict by similarity score descending
+    sorted_d = sorted(scores_d, key=lambda x: x["value"], reverse=True)
+
+    # Return top results
+    top_d = sorted_d[:top]
     return top_d
 
+
 def process_question_results(question_def, question_results):
 
     """
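The rewritten `__top_vector_matches` replaces the old sentence-transformers ranking (the removed lines are blank in this view) with plain NumPy: normalize both sides, take dot products, sort. A self-contained check of that logic, with the function copied under a public name since double-underscore names are module-private:

```python
# Verify the normalized-dot-product ranking on a toy example.
import numpy as np

def top_vector_matches(test_vec, options_list_vec, top=1):
    test_vec = np.asarray(test_vec, dtype=float)
    options = np.asarray(options_list_vec, dtype=float)
    test_norm = test_vec / np.linalg.norm(test_vec)
    options_norm = options / np.linalg.norm(options, axis=1, keepdims=True)
    sims = np.dot(options_norm, test_norm)  # cosine similarity per row
    scores = [{"index": i, "value": float(v)} for i, v in enumerate(sims)]
    return sorted(scores, key=lambda x: x["value"], reverse=True)[:top]

query = [1.0, 0.0]
options = [[0.9, 0.1], [0.0, 1.0], [0.7, 0.7]]
print(top_vector_matches(query, options, top=2))
# The near-parallel vector [0.9, 0.1] ranks first, [0.7, 0.7] second.
```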
@@ -211,8 +213,9 @@ def process_question_results(question_def, question_results):
 
 
     concept_order = [max_src]
-    for t in concept_rels[max_src]["edges"]:
-        concept_order.append(t["dst"])
+    if max_src != "":
+        for t in concept_rels[max_src]["edges"]:
+            concept_order.append(t["dst"])
 
     for c in concept_props:
         if c not in concept_order:
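The added `max_src != ""` guard matters because an empty source key would raise a `KeyError` when indexing `concept_rels`. A tiny illustration of the failure mode the guard avoids, on hypothetical data (not from the package):

```python
# Hypothetical illustration of the guard added above.
concept_rels = {"owner": {"edges": [{"dst": "set"}]}}

max_src = ""  # no relationship source was found
concept_order = [max_src]
if max_src != "":                             # without this check ...
    for t in concept_rels[max_src]["edges"]:  # ... this lookup raises KeyError('')
        concept_order.append(t["dst"])
print(concept_order)  # [''] — falls through safely
```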
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_rag.py

@@ -3,13 +3,11 @@ from pyspark.sql import SparkSession
 
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
 from pyspark.sql import functions as F
-from sentence_transformers import SentenceTransformer
 from delta import DeltaTable
-from typing import Union
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_community.document_loaders import PySparkDataFrameLoader
-from
+from langchain_classic import hub
 from langchain_core.output_parsers import StrOutputParser
 
 import urllib
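The removed import is truncated in this view, but it presumably pointed at the old `hub` location (`from langchain import hub` in pre-1.0 LangChain); in LangChain 1.x the legacy hub client ships in the langchain-classic package, which matches the new `Requires-Dist: langchain-classic` above. Typical usage of the new import, with a well-known public prompt as a stand-in for whatever ai_rag.py actually pulls:

```python
# Pull a prompt from the LangChain Hub via the langchain-classic package.
# "rlm/rag-prompt" is a public community prompt, used here only as a stand-in.
from langchain_classic import hub

prompt = hub.pull("rlm/rag-prompt")
print(prompt)  # a chat prompt template taking "context" and "question" inputs
```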
@@ -69,6 +67,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No
 
     print("Dropping and Recreating the RAG Table")
     ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
+    ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))
 
     print("Generating Extraction SQL")
     sql_statements = []
@@ -89,6 +88,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No
     if replica_schema is not None:
         print("Replicating Schema")
         ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
+        ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))
         ss.sql(__replicate_to_catalog_sql(
             tc.schema, replica_schema, tc.model_id))
 
@@ -143,13 +143,13 @@ def __generate_sentences_from_questions(tc: AIContext, debug):
     ss.sql(full_sql)
 
 
-def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Embeddings], replica_schema=None, batch_size=100000):
+def encode_to_delta_local(tc: AIContext, st_model: Embeddings, replica_schema=None, batch_size=100000):
     """
     Encode Semantic Data to Vectors in Delta Table
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    st_model (
+    st_model (Embeddings): A langchain embedding model to use for encoding.
     replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
     """
 
@@ -172,12 +172,8 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     content_list = [r["content"] for r in sentences_df.collect()]
     id_list = [r["id"] for r in sentences_df.collect()]
 
-    if isinstance(st_model, SentenceTransformer):
-        vector_list = st_model.encode(
-            content_list, normalize_embeddings=True, show_progress_bar=True).tolist()
-    else:
-        vector_list = st_model.embed_documents(content_list)
-    for i, v in enumerate(vector_list):
+    vector_list = st_model.embed_documents(content_list)
+    for i, v in enumerate(vector_list):
         vector_list[i] = [float(x) for x in v]
     #vector_list = st_model.encode(
     #    content_list, normalize_embeddings=True, show_progress_bar=True)
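The `[float(x) for x in v]` pass survives the refactor because many embedding backends return NumPy scalar types, and some PySpark versions reject those when building an `ArrayType(FloatType())` column; converting to built-in `float` first keeps `createDataFrame` happy. A standalone illustration, assuming a local PySpark session:

```python
# Why the float() conversion matters when writing vectors through Spark.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()

vec = np.array([0.1, 0.2], dtype=np.float32)
rows = [([float(x) for x in vec],)]  # np.float32 -> Python float

schema = StructType([StructField("vector", ArrayType(FloatType()), False)])
spark.createDataFrame(rows, schema).show(truncate=False)
```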
@@ -212,13 +208,13 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     # """)
 
 
-def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+def rag_delta(tc: AIContext, emb_model: Embeddings, chat_model: BaseChatModel, question, k=5, replica_schema=None):
     """
     Run a RAG query using vectors in Delta table.
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    emb_model (
+    emb_model (Embeddings): A langchain embedding model to use for encoding the query.
     chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
     question (str): The user's query.
     k (int) OPTIONAL: The number of RAG documents to retrieve.
@@ -231,10 +227,7 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
 
     ss = tc.spark_session
 
-    if isinstance(emb_model, SentenceTransformer):
-        vector_list = emb_model.encode(
-            question, normalize_embeddings=True).tolist()
-    elif isinstance(emb_model, Embeddings):
+    if isinstance(emb_model, Embeddings):
         vector_list = emb_model.embed_query(question)
     else:
         print("Invalid Embedding Model Type")
@@ -274,8 +267,14 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
     return response
 
 
+#def __create_rag_table_sql(schema, model_id):
+#    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
 def __create_rag_table_sql(schema, model_id):
-    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+    return f"CREATE TABLE IF NOT EXISTS {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
+def __clear_rag_table_sql(schema, model_id):
+    return f"DELETE FROM {schema}.rag_{model_id}"
 
 
 def __replicate_to_catalog_sql(base_schema, target_schema, model_id):