kobai-sdk 0.2.8rc13__tar.gz → 0.3.5rc6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
  Files changed (23)
  1. {kobai_sdk-0.2.8rc13/kobai_sdk.egg-info → kobai_sdk-0.3.5rc6}/PKG-INFO +62 -56
  2. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/README.md +57 -52
  3. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_query.py +25 -22
  4. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_rag.py +17 -18
  5. kobai_sdk-0.3.5rc6/kobai/genie.py +194 -0
  6. kobai_sdk-0.3.5rc6/kobai/mobi.py +733 -0
  7. kobai_sdk-0.3.5rc6/kobai/mobi_config.py +19 -0
  8. kobai_sdk-0.3.5rc6/kobai/ms_authenticate.py +66 -0
  9. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_api.py +5 -2
  10. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_client.py +213 -101
  11. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6/kobai_sdk.egg-info}/PKG-INFO +62 -56
  12. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/SOURCES.txt +4 -0
  13. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/requires.txt +2 -2
  14. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/pyproject.toml +3 -3
  15. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/LICENSE +0 -0
  16. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/MANIFEST.in +0 -0
  17. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/__init__.py +0 -0
  18. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/databricks_client.py +0 -0
  19. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/demo_tenant_client.py +0 -0
  20. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/spark_client.py +0 -0
  21. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/dependency_links.txt +0 -0
  22. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/top_level.txt +0 -0
  23. {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: kobai-sdk
- Version: 0.2.8rc13
+ Version: 0.3.5rc6
  Summary: A package that enables interaction with a Kobai tenant.
  Author-email: Ryan Oattes <ryan@kobai.io>
  License: Apache License
@@ -221,14 +221,15 @@ Requires-Dist: azure-identity
  Requires-Dist: azure-storage-blob
  Requires-Dist: langchain-core
  Requires-Dist: langchain-community
- Requires-Dist: langchain_openai
- Requires-Dist: databricks_langchain
+ Requires-Dist: langchain-classic
+ Requires-Dist: delta-spark
  Provides-Extra: dev
  Requires-Dist: black; extra == "dev"
  Requires-Dist: bumpver; extra == "dev"
  Requires-Dist: isort; extra == "dev"
  Requires-Dist: pip-tools; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
+ Dynamic: license-file

  # Kobai SDK for Python (Alpha)

@@ -247,21 +248,50 @@ from kobai import tenant_client, spark_client, databricks_client

  schema = 'main.demo'
  uri = 'https://demo.kobai.io'
- tenant_id = '1'
  tenant_name = 'My Demo Tenant'
-
- k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+ k = tenant_client.TenantClient(tenant_name, uri, schema)
  ```

  2. Authenticate with the Kobai instance:
+ Authentication can be performed using several methods: the device code flow, the on-behalf-of flow, or a browser-based token.
+
+ #### Authentication via device code
+ Step 1: Obtain an access token from IDM (Identity and Access Management).

  ```python
- client_id = 'your_Entra_app_id_here'
+ from kobai import ms_authenticate
+
  tenant_id = 'your_Entra_directory_id_here'
+ client_id = 'your_Entra_app_id_here'
+
+ access_token = ms_authenticate.device_code(tenant_id, client_id)
+ ```
+
+ Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known).

- k.authenticate(client_id, tenant_id)
+ ```python
+ tenants = k.get_tenants(id_token=access_token)
+ print(tenants)
  ```

+ Step 3: Authenticate with Kobai for the specific tenant using the IDM access token.
+
+ ```python
+ kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+ k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+ ```
+
+ At this point, authentication to the Kobai tenant is complete.
+
+ #### Authentication via browser token
+
+ ```python
+ k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")
+ ```
+
+ #### Authentication via on-behalf-of flow
+ Sample code demonstrating authentication via the on-behalf-of flow is available on request.
+
  3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:

  ```python
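
The body of the new `kobai/ms_authenticate.py` (file 8 above) is not shown in this diff. For orientation, `ms_authenticate.device_code(tenant_id, client_id)` matches the shape of MSAL's public-client device-code flow; the sketch below is an assumption about the mechanics, not the SDK's actual implementation, and the scope is illustrative.

```python
import msal

def device_code(tenant_id: str, client_id: str) -> str:
    # Public client application against the Entra tenant's authority.
    app = msal.PublicClientApplication(
        client_id, authority=f"https://login.microsoftonline.com/{tenant_id}"
    )
    # Start the flow: the user opens the verification URL and enters the code.
    flow = app.initiate_device_flow(scopes=["User.Read"])  # illustrative scope
    print(flow["message"])
    # Polls until the user completes sign-in, then returns the token response.
    result = app.acquire_token_by_device_flow(flow)
    return result["access_token"]
```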
@@ -303,68 +333,41 @@ kobai_query_name = "Set ownership"
  question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
  ```

- 3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using either Azure OpenAI, Databricks or a user-provided chat model.
-
- #### Using Azure OpenAI
-
- ###### Authentication Methods:
-
- 1. ApiKey
-
- ```python
- from kobai import ai_query, llm_config
- import json
-
- followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
- print(output)
- ```
-
- 2. Azure Active Directory Authentication
+ 3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using a user-provided chat and embedding model.

- Ensure that the logged-in tenant has access to Azure OpenAI.
- In case of databricks notebook, the logged in service principal should have access to Azure OpenAI.
+ #### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+ Initialize the AI components by specifying the embedding and chat models, then ask follow-up questions.

  ```python
- from kobai import ai_query, llm_config
+ from databricks_langchain import DatabricksEmbeddings
+ from langchain_community.chat_models import ChatDatabricks
  import json

- followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", deployment="gpt-4o-mini", llm_provider="azure_openai")
- llm_config.get_azure_ad_token()
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
- print(output)
- ```
-
- #### Using Databricks (Default Configuration)
-
- ```python
- from kobai import ai_query, llm_config
- import json
+ # Choose an embedding model and a chat model from Databricks model serving and initialize them.
+ embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+ chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+ k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)

  followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig()
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+ output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
  print(output)
  ```

- #### User Provided Chat Model
+ #### Using Azure OpenAI Embeddings and Chat Models

  ```python
- from kobai import ai_query, llm_config
- import json
  from langchain_openai import AzureChatOpenAI
+ from langchain_openai import AzureOpenAIEmbeddings
+ import json

  followup_question = "Which owner owns the most sets?"

- llm_config = llm_config.LLMConfig(debug=True)
+ embedding_model = AzureOpenAIEmbeddings(
+     model="text-embedding-3-small",
+     azure_endpoint="https://kobaipoc.openai.azure.com/",
+     api_key="YOUR_API_KEY",
+     openai_api_version="2023-05-15"
+ )

  chat_model = AzureChatOpenAI(
  azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",
@@ -373,7 +376,10 @@ openai_api_version="2024-02-15-preview",
  temperature=0.5,
  max_tokens=150,)

- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, override_model=chat_model, llm_config=llm_config)
+ k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+ followup_question = "Which theme has the most sets?"
+ output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
  print(output)
  ```

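The two provider examples above both flow through `k.init_ai_components`, which takes a LangChain `Embeddings` and `BaseChatModel` pair, so other providers should plug in the same way. A sketch using the standard OpenAI classes; the model names and key are placeholders, and this pairing is an assumption rather than an example shipped with the package.

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Any LangChain Embeddings/BaseChatModel pair should satisfy init_ai_components.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key="YOUR_API_KEY")
chat_model = ChatOpenAI(model="gpt-4o-mini", api_key="YOUR_API_KEY")

k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
output = k.followup_question("Which owner owns the most sets?",
                             question_id=k.get_question_id(kobai_query_name))
print(output)
```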
README.md
@@ -15,21 +15,50 @@ from kobai import tenant_client, spark_client, databricks_client

  schema = 'main.demo'
  uri = 'https://demo.kobai.io'
- tenant_id = '1'
  tenant_name = 'My Demo Tenant'
-
- k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+ k = tenant_client.TenantClient(tenant_name, uri, schema)
  ```

  2. Authenticate with the Kobai instance:
+ Authentication can be performed using several methods: the device code flow, the on-behalf-of flow, or a browser-based token.
+
+ #### Authentication via device code
+ Step 1: Obtain an access token from IDM (Identity and Access Management).

  ```python
- client_id = 'your_Entra_app_id_here'
+ from kobai import ms_authenticate
+
  tenant_id = 'your_Entra_directory_id_here'
+ client_id = 'your_Entra_app_id_here'
+
+ access_token = ms_authenticate.device_code(tenant_id, client_id)
+ ```
+
+ Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known).

- k.authenticate(client_id, tenant_id)
+ ```python
+ tenants = k.get_tenants(id_token=access_token)
+ print(tenants)
  ```

+ Step 3: Authenticate with Kobai for the specific tenant using the IDM access token.
+
+ ```python
+ kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+ k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+ ```
+
+ At this point, authentication to the Kobai tenant is complete.
+
+ #### Authentication via browser token
+
+ ```python
+ k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")
+ ```
+
+ #### Authentication via on-behalf-of flow
+ Sample code demonstrating authentication via the on-behalf-of flow is available on request.
+
  3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:

  ```python
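
The README above defers on-behalf-of sample code. For reference, the standard MSAL on-behalf-of exchange looks like the sketch below; this is generic MSAL usage, not the SDK's code, and the client secret, scope, and the subsequent handoff to `k.use_access_token` are assumptions.

```python
import msal

def on_behalf_of(tenant_id: str, client_id: str, client_secret: str,
                 incoming_user_token: str) -> str:
    # Confidential client application for the middle-tier service.
    app = msal.ConfidentialClientApplication(
        client_id,
        client_credential=client_secret,
        authority=f"https://login.microsoftonline.com/{tenant_id}",
    )
    # Exchange the caller's token for a downstream token on the user's behalf.
    result = app.acquire_token_on_behalf_of(
        user_assertion=incoming_user_token,
        scopes=["User.Read"],  # illustrative scope
    )
    return result["access_token"]
```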
@@ -71,68 +100,41 @@ kobai_query_name = "Set ownership"
  question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
  ```

- 3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using either Azure OpenAI, Databricks or a user-provided chat model.
-
- #### Using Azure OpenAI
-
- ###### Authentication Methods:
-
- 1. ApiKey
-
- ```python
- from kobai import ai_query, llm_config
- import json
-
- followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
- print(output)
- ```
-
- 2. Azure Active Directory Authentication
+ 3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using a user-provided chat and embedding model.

- Ensure that the logged-in tenant has access to Azure OpenAI.
- In case of databricks notebook, the logged in service principal should have access to Azure OpenAI.
+ #### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+ Initialize the AI components by specifying the embedding and chat models, then ask follow-up questions.

  ```python
- from kobai import ai_query, llm_config
+ from databricks_langchain import DatabricksEmbeddings
+ from langchain_community.chat_models import ChatDatabricks
  import json

- followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", deployment="gpt-4o-mini", llm_provider="azure_openai")
- llm_config.get_azure_ad_token()
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
- print(output)
- ```
-
- #### Using Databricks (Default Configuration)
-
- ```python
- from kobai import ai_query, llm_config
- import json
+ # Choose an embedding model and a chat model from Databricks model serving and initialize them.
+ embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+ chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+ k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)

  followup_question = "Which owner owns the most sets?"
-
- llm_config = llm_config.LLMConfig()
-
- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+ output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
  print(output)
  ```

- #### User Provided Chat Model
+ #### Using Azure OpenAI Embeddings and Chat Models

  ```python
- from kobai import ai_query, llm_config
- import json
  from langchain_openai import AzureChatOpenAI
+ from langchain_openai import AzureOpenAIEmbeddings
+ import json

  followup_question = "Which owner owns the most sets?"

- llm_config = llm_config.LLMConfig(debug=True)
+ embedding_model = AzureOpenAIEmbeddings(
+     model="text-embedding-3-small",
+     azure_endpoint="https://kobaipoc.openai.azure.com/",
+     api_key="YOUR_API_KEY",
+     openai_api_version="2023-05-15"
+ )

  chat_model = AzureChatOpenAI(
  azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",
@@ -141,7 +143,10 @@ openai_api_version="2024-02-15-preview",
  temperature=0.5,
  max_tokens=150,)

- output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, override_model=chat_model, llm_config=llm_config)
+ k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+ followup_question = "Which theme has the most sets?"
+ output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
  print(output)
  ```

kobai/ai_query.py
@@ -1,8 +1,6 @@
  from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
  from langchain_core.output_parsers import StrOutputParser

- from sentence_transformers import SentenceTransformer, util
-
  from langchain_core.language_models.chat_models import BaseChatModel
  from langchain_core.embeddings import Embeddings
  from langchain_core.documents import Document
@@ -10,8 +8,9 @@ from langchain_core.retrievers import BaseRetriever
  from langchain_core.callbacks import CallbackManagerForRetrieverRun
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
  from langchain_core.vectorstores import InMemoryVectorStore
+ import numpy as np

- from typing import Union, List
+ from typing import List


  MESSAGE_SYSTEM_TEMPLATE = """
@@ -73,7 +72,7 @@ def format_docs(docs):
  def input_only(inpt):
      return inpt["question"]

- def followup_question(user_question, question_results, question_name, question_def, embedding_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, use_inmem_vectors=False, k=50):
+ def followup_question(user_question, question_results, question_name, question_def, embedding_model: Embeddings, chat_model: BaseChatModel, use_inmem_vectors=False, k=50):

      row_texts = process_question_results(question_def, question_results)
      question_documents = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]
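
With the `Union[SentenceTransformer, Embeddings]` signatures removed here and in `ai_rag.py`, callers still holding a raw sentence-transformers model need a LangChain `Embeddings` wrapper to keep using these functions. A minimal adapter sketch; the class name and default model are illustrative, not part of the SDK:

```python
from typing import List

from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer

class SentenceTransformerEmbeddings(Embeddings):
    """Adapts a sentence-transformers model to the LangChain Embeddings interface."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self._model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._model.encode(texts, normalize_embeddings=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self._model.encode([text], normalize_embeddings=True)[0].tolist()
```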
@@ -118,22 +117,13 @@ def init_question_search_index(tenant_questions, emb_model):

      q_ids = [q["id"] for q in tenant_questions]
      q_descs = [q["description"] for q in tenant_questions]
-
-     if isinstance(emb_model, SentenceTransformer):
-         q_vectors = emb_model.encode(q_descs)
-     else:
-         q_vectors = emb_model.embed_documents(q_descs)
-
+     q_vectors = emb_model.embed_documents(q_descs)
      return {"ids": q_ids, "descs": q_descs, "vectors": q_vectors}


  def question_search(search_text: str, search_index, emb_model, k: int):
-     if isinstance(emb_model, SentenceTransformer):
-         search_vec = emb_model.encode(search_text)
-     else:
-         search_vec = emb_model.embed_query(search_text)
+     search_vec = emb_model.embed_query(search_text)
      #search_vec = emb_model.encode(search_text)
-
      matches = __top_vector_matches(search_vec, search_index["vectors"], top=k)

      for mi, m in enumerate(matches):
@@ -142,13 +132,25 @@ def question_search(search_text: str, search_index, emb_model, k: int):
      return matches

  def __top_vector_matches(test_vec, options_list_vec, top=1):
-     scores_t = util.cos_sim(test_vec, options_list_vec)[0]
-     scores_l = scores_t.tolist()
-     scores_d = [{"index": i, "value": v} for i, v in enumerate(scores_l)]
-     sorted_d = sorted(scores_d, key=lambda i: i["value"], reverse=True)
-     top_d = sorted_d[0:top]
+     # Normalize the test vector
+     test_vec_norm = test_vec / np.linalg.norm(test_vec)
+     # Normalize the option vectors
+     options_norm = options_list_vec / np.linalg.norm(options_list_vec, axis=1, keepdims=True)
+
+     # Compute cosine similarity (dot product of normalized vectors)
+     cosine_similarities = np.dot(options_norm, test_vec_norm)
+
+     # Get indexes and similarity scores as dict
+     scores_d = [{"index": i, "value": float(v)} for i, v in enumerate(cosine_similarities)]
+
+     # Sort dict by similarity score descending
+     sorted_d = sorted(scores_d, key=lambda x: x["value"], reverse=True)
+
+     # Return top results
+     top_d = sorted_d[:top]
      return top_d

+
  def process_question_results(question_def, question_results):

      """
@@ -211,8 +213,9 @@ def process_question_results(question_def, question_results):


      concept_order = [max_src]
-     for t in concept_rels[max_src]["edges"]:
-         concept_order.append(t["dst"])
+     if max_src != "":
+         for t in concept_rels[max_src]["edges"]:
+             concept_order.append(t["dst"])

      for c in concept_props:
          if c not in concept_order:
kobai/ai_rag.py
@@ -3,13 +3,11 @@ from pyspark.sql import SparkSession

  from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
  from pyspark.sql import functions as F
- from sentence_transformers import SentenceTransformer
  from delta import DeltaTable
- from typing import Union
  from langchain_core.language_models.chat_models import BaseChatModel
  from langchain_core.embeddings import Embeddings
  from langchain_community.document_loaders import PySparkDataFrameLoader
- from langchain import hub
+ from langchain_classic import hub
  from langchain_core.output_parsers import StrOutputParser

  import urllib
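
The `hub` import above moves from the legacy `langchain` package to `langchain-classic`, matching the Requires-Dist change in PKG-INFO. Assuming `langchain_classic` re-exports the old `hub` module unchanged, usage stays the same; the prompt name below is a well-known public Hub entry used for illustration.

```python
from langchain_classic import hub

# Pull a prompt from the LangChain Hub exactly as `from langchain import hub` did.
rag_prompt = hub.pull("rlm/rag-prompt")
print(rag_prompt)
```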
@@ -69,6 +67,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No

      print("Dropping and Recreating the RAG Table")
      ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
+     ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))

      print("Generating Extraction SQL")
      sql_statements = []
@@ -89,6 +88,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No
      if replica_schema is not None:
          print("Replicating Schema")
          ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
+         ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))
          ss.sql(__replicate_to_catalog_sql(
              tc.schema, replica_schema, tc.model_id))

@@ -143,13 +143,13 @@ def __generate_sentences_from_questions(tc: AIContext, debug):
      ss.sql(full_sql)


- def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Embeddings], replica_schema=None, batch_size=100000):
+ def encode_to_delta_local(tc: AIContext, st_model: Embeddings, replica_schema=None, batch_size=100000):
      """
      Encode Semantic Data to Vectors in Delta Table

      Parameters:
      tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-     st_model (SentenceTransformer): A sentence_transformers model to use for encoding.
+     st_model (Embeddings): A langchain embedding model to use for encoding.
      replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
      """

@@ -172,12 +172,8 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
      content_list = [r["content"] for r in sentences_df.collect()]
      id_list = [r["id"] for r in sentences_df.collect()]

-     if isinstance(st_model, SentenceTransformer):
-         vector_list = st_model.encode(
-             content_list, normalize_embeddings=True, show_progress_bar=True).tolist()
-     else:
-         vector_list = st_model.embed_documents(content_list)
-         for i, v in enumerate(vector_list):
+     vector_list = st_model.embed_documents(content_list)
+     for i, v in enumerate(vector_list):
          vector_list[i] = [float(x) for x in v]
      #vector_list = st_model.encode(
      #    content_list, normalize_embeddings=True, show_progress_bar=True)
@@ -212,13 +208,13 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
      # """)


- def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+ def rag_delta(tc: AIContext, emb_model: Embeddings, chat_model: BaseChatModel, question, k=5, replica_schema=None):
      """
      Run a RAG query using vectors in Delta table.

      Parameters:
      tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-     emb_model (UNION[SentenceTransformer, Embeddings]): A sentence_transformers or langchain embedding model to use for encoding the query.
+     emb_model (Embeddings): A langchain embedding model to use for encoding the query.
      chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
      question (str): The user's query.
      k (int) OPTIONAL: The number of RAG documents to retrieve.
@@ -231,10 +227,7 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],

      ss = tc.spark_session

-     if isinstance(emb_model, SentenceTransformer):
-         vector_list = emb_model.encode(
-             question, normalize_embeddings=True).tolist()
-     elif isinstance(emb_model, Embeddings):
+     if isinstance(emb_model, Embeddings):
          vector_list = emb_model.embed_query(question)
      else:
          print("Invalid Embedding Model Type")
@@ -274,8 +267,14 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
      return response


+ #def __create_rag_table_sql(schema, model_id):
+ #    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
  def __create_rag_table_sql(schema, model_id):
-     return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+     return f"CREATE TABLE IF NOT EXISTS {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
+ def __clear_rag_table_sql(schema, model_id):
+     return f"DELETE FROM {schema}.rag_{model_id}"


  def __replicate_to_catalog_sql(base_schema, target_schema, model_id):
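
The table-management change above replaces `CREATE OR REPLACE TABLE` with `CREATE TABLE IF NOT EXISTS` plus a `DELETE FROM`, so the Delta table object, its identity column, and properties such as `delta.enableChangeDataFeed` survive a refresh instead of being dropped and recreated. A minimal sketch of the sequence these helpers emit, with illustrative schema and model-id values and assuming a Delta-enabled `SparkSession`:

```python
from pyspark.sql import SparkSession

ss = SparkSession.builder.getOrCreate()  # assumes Delta Lake is configured

schema, model_id = "main.demo", "1"  # illustrative values

# Create once; later runs reuse the same table object instead of replacing it.
ss.sql(
    f"CREATE TABLE IF NOT EXISTS {schema}.rag_{model_id} "
    "(id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type STRING, "
    "concept_id STRING, vector ARRAY<FLOAT>) "
    "TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)

# Empty the table before re-inserting freshly generated sentences.
ss.sql(f"DELETE FROM {schema}.rag_{model_id}")
```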