kobai-sdk 0.2.8rc13__tar.gz → 0.3.5rc6__tar.gz
- {kobai_sdk-0.2.8rc13/kobai_sdk.egg-info → kobai_sdk-0.3.5rc6}/PKG-INFO +62 -56
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/README.md +57 -52
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_query.py +25 -22
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_rag.py +17 -18
- kobai_sdk-0.3.5rc6/kobai/genie.py +194 -0
- kobai_sdk-0.3.5rc6/kobai/mobi.py +733 -0
- kobai_sdk-0.3.5rc6/kobai/mobi_config.py +19 -0
- kobai_sdk-0.3.5rc6/kobai/ms_authenticate.py +66 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_api.py +5 -2
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/tenant_client.py +213 -101
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6/kobai_sdk.egg-info}/PKG-INFO +62 -56
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/SOURCES.txt +4 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/requires.txt +2 -2
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/pyproject.toml +3 -3
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/LICENSE +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/MANIFEST.in +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/__init__.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/databricks_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/demo_tenant_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/spark_client.py +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/dependency_links.txt +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai_sdk.egg-info/top_level.txt +0 -0
- {kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/setup.cfg +0 -0
{kobai_sdk-0.2.8rc13/kobai_sdk.egg-info → kobai_sdk-0.3.5rc6}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: kobai-sdk
-Version: 0.2.8rc13
+Version: 0.3.5rc6
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -221,14 +221,15 @@ Requires-Dist: azure-identity
 Requires-Dist: azure-storage-blob
 Requires-Dist: langchain-core
 Requires-Dist: langchain-community
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: langchain-classic
+Requires-Dist: delta-spark
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: isort; extra == "dev"
 Requires-Dist: pip-tools; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
+Dynamic: license-file
 
 # Kobai SDK for Python (Alpha)
 
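The runtime dependency set swaps two requirements (their old names are truncated in this view) for `langchain-classic` and `delta-spark`, and the metadata gains a `Dynamic: license-file` field. One way to confirm the declared dependencies of an installed copy, using only the standard library:

```python
# Inspect the installed package's declared dependencies (Requires-Dist).
# Assumes kobai-sdk 0.3.5rc6 is installed in the current environment.
from importlib.metadata import metadata, requires

print(metadata("kobai-sdk")["Version"])   # 0.3.5rc6
for req in requires("kobai-sdk") or []:
    print(req)  # should include langchain-classic and delta-spark in this release
```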
@@ -247,21 +248,50 @@ from kobai import tenant_client, spark_client, databricks_client
 
 schema = 'main.demo'
 uri = 'https://demo.kobai.io'
-tenant_id = '1'
 tenant_name = 'My Demo Tenant'
-
-k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+k = tenant_client.TenantClient(tenant_name, uri, schema)
 ```
 
 2. Authenticate with the Kobai instance:
+Authentication can be performed using several methods, such as the device code flow, the on-behalf-of flow, or a browser-based token.
+
+#### Authentication via device code
+Step 1: Obtain an access token from IDM (Identity and Access Management).
 
 ```python
-
+from kobai import ms_authenticate
+
 tenant_id = 'your_Entra_directory_id_here'
+client_id = 'your_Entra_app_id_here'
+
+access_token = ms_authenticate.device_code(tenant_id, client_id)
+```
+
+Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known).
 
-
+```python
+tenants = k.get_tenants(id_token=access_token)
+print(tenants)
 ```
 
+Step 3: Authenticate with Kobai for the specific tenant using the IDM access token.
+
+```python
+kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+```
+
+At this point, authentication to the Kobai tenant is complete.
+
+#### Authentication via browser token
+
+```python
+k.use_browser_token(access_token="KOBAI_ACESS_TOKEN_FROM_BROWSER")
+```
+
+#### Authentication via on-behalf-of flow
+Sample code demonstrating authentication via the on-behalf-of flow can be provided on request.
+
 3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:
 
 ```python
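The new `kobai/ms_authenticate.py` module (66 added lines) is not expanded in this diff. For orientation only, here is a minimal sketch of what an Entra device-code helper like `device_code(tenant_id, client_id)` typically wraps, assuming the MSAL library and an assumed default scope; the actual module may differ:

```python
# Hypothetical sketch of an Entra device-code helper, assuming the msal package.
# The real kobai/ms_authenticate.py is not shown in this diff and may differ.
import msal

def device_code(tenant_id: str, client_id: str, scopes=None) -> str:
    scopes = scopes or ["User.Read"]  # assumed default scope
    app = msal.PublicClientApplication(
        client_id,
        authority=f"https://login.microsoftonline.com/{tenant_id}",
    )
    flow = app.initiate_device_flow(scopes=scopes)
    if "user_code" not in flow:
        raise RuntimeError(f"Failed to start device flow: {flow}")
    print(flow["message"])  # tells the user which URL to open and which code to enter
    result = app.acquire_token_by_device_flow(flow)  # blocks until the user signs in
    if "access_token" not in result:
        raise RuntimeError(result.get("error_description", "authentication failed"))
    return result["access_token"]
```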
@@ -303,68 +333,41 @@ kobai_query_name = "Set ownership"
 question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
 ```
 
-3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using
-
-#### Using Azure OpenAI
-
-###### Authentication Methods:
-
-1. ApiKey
-
-```python
-from kobai import ai_query, llm_config
-import json
-
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-2. Azure Active Directory Authentication
+3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using the user-provided chat and embedding models.
 
-
-
+#### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+Initialize the AI components by specifying the embedding and chat models, then proceed with follow-up questions for interactive engagement.
 
 ```python
-from
+from databricks_langchain import DatabricksEmbeddings
+from langchain_community.chat_models import ChatDatabricks
 import json
 
-
-
-
-
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-#### Using Databricks (Default Configuration)
-
-```python
-from kobai import ai_query, llm_config
-import json
+# Choose the embedding and chat model of your choice from Databricks model serving and initialize.
+embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
 
 followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
-####
+#### Using Azure OpenAI Embeddings and Chat Models
 
 ```python
-from kobai import ai_query, llm_config
-import json
 from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+import json
 
 followup_question = "Which owner owns the most sets?"
 
-
+embedding_model = AzureOpenAIEmbeddings(
+    model="text-embedding-3-small",
+    azure_endpoint="https://kobaipoc.openai.azure.com/",
+    api_key="YOUR_API_KEY",
+    openai_api_version="2023-05-15"
+)
 
 chat_model = AzureChatOpenAI(
     azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",

@@ -373,7 +376,10 @@ openai_api_version="2024-02-15-preview",
     temperature=0.5,
     max_tokens=150,)
 
-
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+followup_question = "Which theme has the most sets?"
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/README.md

@@ -15,21 +15,50 @@ from kobai import tenant_client, spark_client, databricks_client
@@ -71,68 +100,41 @@ kobai_query_name = "Set ownership"
@@ -141,7 +143,10 @@ openai_api_version="2024-02-15-preview",

The three README.md hunks are verbatim copies of the corresponding PKG-INFO hunks above (PKG-INFO embeds README.md as the package description); only the line offsets in the hunk headers differ.
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_query.py

@@ -1,8 +1,6 @@
 from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 
-from sentence_transformers import SentenceTransformer, util
-
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_core.documents import Document
@@ -10,8 +8,9 @@ from langchain_core.retrievers import BaseRetriever
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda
 from langchain_core.vectorstores import InMemoryVectorStore
+import numpy as np
 
-from typing import
+from typing import List
 
 
 MESSAGE_SYSTEM_TEMPLATE = """
@@ -73,7 +72,7 @@ def format_docs(docs):
 def input_only(inpt):
     return inpt["question"]
 
-def followup_question(user_question, question_results, question_name, question_def, embedding_model:
+def followup_question(user_question, question_results, question_name, question_def, embedding_model: Embeddings, chat_model: BaseChatModel, use_inmem_vectors=False, k=50):
 
     row_texts = process_question_results(question_def, question_results)
     question_documents = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]
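The rewritten `followup_question` takes LangChain `Embeddings` and `BaseChatModel` objects directly (plus a `use_inmem_vectors` switch and a retrieval depth `k`). Judging only from the module's imports (`InMemoryVectorStore`, `Document`), the in-memory path presumably follows the standard LangChain retrieval pattern; a minimal, self-contained sketch of that pattern, not the package's actual code:

```python
# Minimal sketch of RAG-style retrieval over question-result rows, assuming
# only public LangChain APIs; kobai's followup_question may differ in detail.
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding  # stand-in model
from langchain_core.vectorstores import InMemoryVectorStore

row_texts = ["owner Alice owns set Castle", "owner Bob owns set Ship"]
docs = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]

store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=256))
store.add_documents(docs)

# Retrieve the k most relevant rows to ground the chat model's answer.
retriever = store.as_retriever(search_kwargs={"k": 2})
context_docs = retriever.invoke("Which owner owns the most sets?")
print([d.page_content for d in context_docs])
```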
@@ -118,22 +117,13 @@ def init_question_search_index(tenant_questions, emb_model):
 
     q_ids = [q["id"] for q in tenant_questions]
     q_descs = [q["description"] for q in tenant_questions]
-
-    if isinstance(emb_model, SentenceTransformer):
-        q_vectors = emb_model.encode(q_descs)
-    else:
-        q_vectors = emb_model.embed_documents(q_descs)
-
+    q_vectors = emb_model.embed_documents(q_descs)
     return {"ids": q_ids, "descs": q_descs, "vectors": q_vectors}
 
 
 def question_search(search_text: str, search_index, emb_model, k: int):
-    if isinstance(emb_model, SentenceTransformer):
-        search_vec = emb_model.encode(search_text)
-    else:
-        search_vec = emb_model.embed_query(search_text)
+    search_vec = emb_model.embed_query(search_text)
     #search_vec = emb_model.encode(search_text)
-
     matches = __top_vector_matches(search_vec, search_index["vectors"], top=k)
 
     for mi, m in enumerate(matches):
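With the SentenceTransformer branch gone, both helpers assume the LangChain `Embeddings` interface (`embed_documents` for the corpus, `embed_query` for the probe). A sketch of the index structure `init_question_search_index` returns, using a deterministic fake embedding model as a stand-in for a real one:

```python
# Sketch: the {"ids", "descs", "vectors"} index shape, with a stand-in model.
from langchain_core.embeddings import DeterministicFakeEmbedding

emb_model = DeterministicFakeEmbedding(size=8)

tenant_questions = [
    {"id": 1, "description": "Set ownership by owner"},
    {"id": 2, "description": "Sets grouped by theme"},
]

q_descs = [q["description"] for q in tenant_questions]
index = {
    "ids": [q["id"] for q in tenant_questions],
    "descs": q_descs,
    "vectors": emb_model.embed_documents(q_descs),  # one vector per description
}
print(len(index["vectors"]), len(index["vectors"][0]))  # 2 vectors of size 8
```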
@@ -142,13 +132,25 @@ def question_search(search_text: str, search_index, emb_model, k: int):
     return matches
 
 def __top_vector_matches(test_vec, options_list_vec, top=1):
-
-
-
-
-
+    # Normalize the test vector
+    test_vec_norm = test_vec / np.linalg.norm(test_vec)
+    # Normalize the option vectors
+    options_norm = options_list_vec / np.linalg.norm(options_list_vec, axis=1, keepdims=True)
+
+    # Compute cosine similarity (dot product of normalized vectors)
+    cosine_similarities = np.dot(options_norm, test_vec_norm)
+
+    # Get indexes and similarity scores as dict
+    scores_d = [{"index": i, "value": float(v)} for i, v in enumerate(cosine_similarities)]
+
+    # Sort dict by similarity score descending
+    sorted_d = sorted(scores_d, key=lambda x: x["value"], reverse=True)
+
+    # Return top results
+    top_d = sorted_d[:top]
     return top_d
 
+
 def process_question_results(question_def, question_results):
 
     """
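The rewritten `__top_vector_matches` replaces the old sentence-transformers ranking (the removed lines are blank in this view) with plain NumPy: normalize both sides, take dot products, sort. A self-contained check of that logic, with the function copied under a public name since double-underscore names are module-private:

```python
# Verify the normalized-dot-product ranking on a toy example.
import numpy as np

def top_vector_matches(test_vec, options_list_vec, top=1):
    test_vec = np.asarray(test_vec, dtype=float)
    options = np.asarray(options_list_vec, dtype=float)
    test_norm = test_vec / np.linalg.norm(test_vec)
    options_norm = options / np.linalg.norm(options, axis=1, keepdims=True)
    sims = np.dot(options_norm, test_norm)  # cosine similarity per row
    scores = [{"index": i, "value": float(v)} for i, v in enumerate(sims)]
    return sorted(scores, key=lambda x: x["value"], reverse=True)[:top]

query = [1.0, 0.0]
options = [[0.9, 0.1], [0.0, 1.0], [0.7, 0.7]]
print(top_vector_matches(query, options, top=2))
# The near-parallel vector [0.9, 0.1] ranks first, [0.7, 0.7] second.
```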
@@ -211,8 +213,9 @@ def process_question_results(question_def, question_results):
 
 
     concept_order = [max_src]
-    for t in concept_rels[max_src]["edges"]:
-        concept_order.append(t["dst"])
+    if max_src != "":
+        for t in concept_rels[max_src]["edges"]:
+            concept_order.append(t["dst"])
 
     for c in concept_props:
         if c not in concept_order:
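The added `max_src != ""` guard matters because an empty source key would raise a `KeyError` when indexing `concept_rels`. A tiny illustration of the failure mode the guard avoids, on hypothetical data (not from the package):

```python
# Hypothetical illustration of the guard added above.
concept_rels = {"owner": {"edges": [{"dst": "set"}]}}

max_src = ""  # no relationship source was found
concept_order = [max_src]
if max_src != "":                             # without this check ...
    for t in concept_rels[max_src]["edges"]:  # ... this lookup raises KeyError('')
        concept_order.append(t["dst"])
print(concept_order)  # [''] — falls through safely
```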
{kobai_sdk-0.2.8rc13 → kobai_sdk-0.3.5rc6}/kobai/ai_rag.py

@@ -3,13 +3,11 @@ from pyspark.sql import SparkSession
 
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
 from pyspark.sql import functions as F
-from sentence_transformers import SentenceTransformer
 from delta import DeltaTable
-from typing import Union
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_community.document_loaders import PySparkDataFrameLoader
-from
+from langchain_classic import hub
 from langchain_core.output_parsers import StrOutputParser
 
 import urllib
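The removed import is truncated in this view, but it presumably pointed at the old `hub` location (`from langchain import hub` in pre-1.0 LangChain); in LangChain 1.x the legacy hub client ships in the langchain-classic package, which matches the new `Requires-Dist: langchain-classic` above. Typical usage of the new import, with a well-known public prompt as a stand-in for whatever ai_rag.py actually pulls:

```python
# Pull a prompt from the LangChain Hub via the langchain-classic package.
# "rlm/rag-prompt" is a public community prompt, used here only as a stand-in.
from langchain_classic import hub

prompt = hub.pull("rlm/rag-prompt")
print(prompt)  # a chat prompt template taking "context" and "question" inputs
```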
@@ -69,6 +67,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No
 
     print("Dropping and Recreating the RAG Table")
     ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
+    ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))
 
     print("Generating Extraction SQL")
     sql_statements = []
@@ -89,6 +88,7 @@ def generate_sentences(tc: AIContext, replica_schema=None, concept_white_list=No
     if replica_schema is not None:
         print("Replicating Schema")
         ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
+        ss.sql(__clear_rag_table_sql(tc.schema, tc.model_id))
         ss.sql(__replicate_to_catalog_sql(
             tc.schema, replica_schema, tc.model_id))
 
@@ -143,13 +143,13 @@ def __generate_sentences_from_questions(tc: AIContext, debug):
     ss.sql(full_sql)
 
 
-def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Embeddings], replica_schema=None, batch_size=100000):
+def encode_to_delta_local(tc: AIContext, st_model: Embeddings, replica_schema=None, batch_size=100000):
     """
     Encode Semantic Data to Vectors in Delta Table
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    st_model (
+    st_model (Embeddings): A langchain embedding model to use for encoding.
     replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
     """
 
@@ -172,12 +172,8 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     content_list = [r["content"] for r in sentences_df.collect()]
     id_list = [r["id"] for r in sentences_df.collect()]
 
-    if isinstance(st_model, SentenceTransformer):
-        vector_list = st_model.encode(
-            content_list, normalize_embeddings=True, show_progress_bar=True).tolist()
-    else:
-        vector_list = st_model.embed_documents(content_list)
-    for i, v in enumerate(vector_list):
+    vector_list = st_model.embed_documents(content_list)
+    for i, v in enumerate(vector_list):
         vector_list[i] = [float(x) for x in v]
     #vector_list = st_model.encode(
     #    content_list, normalize_embeddings=True, show_progress_bar=True)
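The `[float(x) for x in v]` pass survives the refactor because many embedding backends return NumPy scalar types, and some PySpark versions reject those when building an `ArrayType(FloatType())` column; converting to built-in `float` first keeps `createDataFrame` happy. A standalone illustration, assuming a local PySpark session:

```python
# Why the float() conversion matters when writing vectors through Spark.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()

vec = np.array([0.1, 0.2], dtype=np.float32)
rows = [([float(x) for x in vec],)]  # np.float32 -> Python float

schema = StructType([StructField("vector", ArrayType(FloatType()), False)])
spark.createDataFrame(rows, schema).show(truncate=False)
```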
@@ -212,13 +208,13 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     # """)
 
 
-def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+def rag_delta(tc: AIContext, emb_model: Embeddings, chat_model: BaseChatModel, question, k=5, replica_schema=None):
     """
     Run a RAG query using vectors in Delta table.
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    emb_model (
+    emb_model (Embeddings): A langchain embedding model to use for encoding the query.
     chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
     question (str): The user's query.
     k (int) OPTIONAL: The number of RAG documents to retrieve.
@@ -231,10 +227,7 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
 
     ss = tc.spark_session
 
-    if isinstance(emb_model, SentenceTransformer):
-        vector_list = emb_model.encode(
-            question, normalize_embeddings=True).tolist()
-    elif isinstance(emb_model, Embeddings):
+    if isinstance(emb_model, Embeddings):
         vector_list = emb_model.embed_query(question)
     else:
         print("Invalid Embedding Model Type")
@@ -274,8 +267,14 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
     return response
 
 
+#def __create_rag_table_sql(schema, model_id):
+#    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
 def __create_rag_table_sql(schema, model_id):
-    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+    return f"CREATE TABLE IF NOT EXISTS {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
+
+def __clear_rag_table_sql(schema, model_id):
+    return f"DELETE FROM {schema}.rag_{model_id}"
 
 
 def __replicate_to_catalog_sql(base_schema, target_schema, model_id):