kobai-sdk 0.2.9__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kobai-sdk might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kobai-sdk
-Version: 0.2.9
+Version: 0.3.0
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -223,7 +223,6 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-community
 Requires-Dist: langchain_openai
 Requires-Dist: databricks_langchain
-Requires-Dist: sentence-transformers
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
@@ -249,21 +248,50 @@ from kobai import tenant_client, spark_client, databricks_client
 
 schema = 'main.demo'
 uri = 'https://demo.kobai.io'
-tenant_id = '1'
 tenant_name = 'My Demo Tenant'
-
-k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+k = tenant_client.TenantClient(tenant_name, uri, schema)
 ```
 
 2. Authenticate with the Kobai instance:
+Authentication can be performed using several methods, such as the device code flow, the on-behalf-of flow, or a browser-based token.
+
+#### Authentication via device code
+Step 1: Obtain an access token from IDM (Identity and Access Management):
 
 ```python
-client_id = 'your_Entra_app_id_here'
+from kobai import ms_authenticate
+
 tenant_id = 'your_Entra_directory_id_here'
+client_id = 'your_Entra_app_id_here'
+
+access_token = ms_authenticate.device_code(tenant_id, client_id)
+```
+
+Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known):
 
-k.authenticate(client_id, tenant_id)
+```python
+tenants = k.get_tenants(id_token=access_token)
+print(tenants)
 ```
 
+Step 3: Authenticate with Kobai for the specific tenant using the IDM access token:
+
+```python
+kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+```
+
+At this point, authentication to the Kobai tenant is complete.
+
+#### Authentication via browser token
+
+```python
+k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")
+```
+
+#### Authentication via on-behalf-of flow
+Sample code demonstrating authentication via the on-behalf-of flow is available on request.
+
 3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:
 
 ```python
@@ -305,68 +333,41 @@ kobai_query_name = "Set ownership"
 question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
 ```
 
-3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using either Azure OpenAI, Databricks or a user-provided chat model.
-
-#### Using Azure OpenAI
-
-###### Authentication Methods:
-
-1. ApiKey
-
-```python
-from kobai import ai_query, llm_config
-import json
-
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-2. Azure Active Directory Authentication
+3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using a user-provided chat and embedding model.
 
-Ensure that the logged-in tenant has access to Azure OpenAI.
-In case of databricks notebook, the logged in service principal should have access to Azure OpenAI.
+#### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+Initialize the AI components by specifying the embedding and chat models, then ask follow-up questions interactively.
 
 ```python
-from kobai import ai_query, llm_config
+from databricks_langchain import DatabricksEmbeddings
+from langchain_community.chat_models import ChatDatabricks
 import json
 
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", deployment="gpt-4o-mini", llm_provider="azure_openai")
-llm_config.get_azure_ad_token()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-#### Using Databricks (Default Configuration)
-
-```python
-from kobai import ai_query, llm_config
-import json
+# Choose an embedding and chat model from Databricks model serving and initialize.
+embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
 
 followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
-#### User Provided Chat Model
+#### Using Azure OpenAI Embeddings and Chat Models
 
 ```python
-from kobai import ai_query, llm_config
-import json
 from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+import json
 
 followup_question = "Which owner owns the most sets?"
 
-llm_config = llm_config.LLMConfig(debug=True)
+embedding_model = AzureOpenAIEmbeddings(
+model="text-embedding-3-small",
+azure_endpoint="https://kobaipoc.openai.azure.com/",
+api_key="YOUR_API_KEY",
+openai_api_version="2023-05-15"
+)
 
 chat_model = AzureChatOpenAI(
 azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",
@@ -375,7 +376,10 @@ openai_api_version="2024-02-15-preview",
 temperature=0.5,
 max_tokens=150,)
 
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, override_model=chat_model, llm_config=llm_config)
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+followup_question = "Which theme has the most sets?"
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
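Note: the README above defers sample code for the on-behalf-of flow. A minimal sketch, assuming the `ms_authenticate.onbehalf` helper added in this release (new file `kobai/ms_authenticate.py`, shown further down); the Entra IDs, client secret, and incoming user token are placeholders:

```python
from kobai import ms_authenticate, tenant_client

k = tenant_client.TenantClient('My Demo Tenant', 'https://demo.kobai.io', 'main.demo')

# Exchange an incoming user token for a Kobai-scoped token (all values are placeholders).
obo_token = ms_authenticate.onbehalf(
    tenant_id='your_Entra_directory_id_here',
    client_id='your_Entra_app_id_here',
    client_secret='your_Entra_client_secret_here',
    access_token='incoming_user_access_token_here',
)

k.use_access_token(access_token=obo_token, tenant_id='your_kobai_tenant_id_here')
```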
@@ -15,21 +15,50 @@ from kobai import tenant_client, spark_client, databricks_client
 
 schema = 'main.demo'
 uri = 'https://demo.kobai.io'
-tenant_id = '1'
 tenant_name = 'My Demo Tenant'
-
-k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+k = tenant_client.TenantClient(tenant_name, uri, schema)
 ```
 
 2. Authenticate with the Kobai instance:
+Authentication can be performed using several methods, such as the device code flow, the on-behalf-of flow, or a browser-based token.
+
+#### Authentication via device code
+Step 1: Obtain an access token from IDM (Identity and Access Management):
 
 ```python
-client_id = 'your_Entra_app_id_here'
+from kobai import ms_authenticate
+
 tenant_id = 'your_Entra_directory_id_here'
+client_id = 'your_Entra_app_id_here'
+
+access_token = ms_authenticate.device_code(tenant_id, client_id)
+```
+
+Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known):
 
-k.authenticate(client_id, tenant_id)
+```python
+tenants = k.get_tenants(id_token=access_token)
+print(tenants)
 ```
 
+Step 3: Authenticate with Kobai for the specific tenant using the IDM access token:
+
+```python
+kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+```
+
+At this point, authentication to the Kobai tenant is complete.
+
+#### Authentication via browser token
+
+```python
+k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")
+```
+
+#### Authentication via on-behalf-of flow
+Sample code demonstrating authentication via the on-behalf-of flow is available on request.
+
 3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:
 
 ```python
@@ -71,68 +100,41 @@ kobai_query_name = "Set ownership"
 question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
 ```
 
-3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using either Azure OpenAI, Databricks or a user-provided chat model.
-
-#### Using Azure OpenAI
-
-###### Authentication Methods:
-
-1. ApiKey
-
-```python
-from kobai import ai_query, llm_config
-import json
-
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-2. Azure Active Directory Authentication
+3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using a user-provided chat and embedding model.
 
-Ensure that the logged-in tenant has access to Azure OpenAI.
-In case of databricks notebook, the logged in service principal should have access to Azure OpenAI.
+#### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+Initialize the AI components by specifying the embedding and chat models, then ask follow-up questions interactively.
 
 ```python
-from kobai import ai_query, llm_config
+from databricks_langchain import DatabricksEmbeddings
+from langchain_community.chat_models import ChatDatabricks
 import json
 
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", deployment="gpt-4o-mini", llm_provider="azure_openai")
-llm_config.get_azure_ad_token()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-#### Using Databricks (Default Configuration)
-
-```python
-from kobai import ai_query, llm_config
-import json
+# Choose an embedding and chat model from Databricks model serving and initialize.
+embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
 
 followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
-#### User Provided Chat Model
+#### Using Azure OpenAI Embeddings and Chat Models
 
 ```python
-from kobai import ai_query, llm_config
-import json
 from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+import json
 
 followup_question = "Which owner owns the most sets?"
 
-llm_config = llm_config.LLMConfig(debug=True)
+embedding_model = AzureOpenAIEmbeddings(
+model="text-embedding-3-small",
+azure_endpoint="https://kobaipoc.openai.azure.com/",
+api_key="YOUR_API_KEY",
+openai_api_version="2023-05-15"
+)
 
 chat_model = AzureChatOpenAI(
 azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",
@@ -141,7 +143,10 @@ openai_api_version="2024-02-15-preview",
 temperature=0.5,
 max_tokens=150,)
 
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, override_model=chat_model, llm_config=llm_config)
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+followup_question = "Which theme has the most sets?"
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
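Note: `k.authenticate(client_id, tenant_id)` from 0.2.9 (removed above) has no single replacement call. A migration sketch, under the assumption that the tenant list returned by `get_tenants` keeps the `name` and `id` fields the removed name-matching loop relied on:

```python
from kobai import ms_authenticate

# 0.2.9 bundled sign-in, tenant lookup, and token exchange into k.authenticate(...).
access_token = ms_authenticate.device_code(tenant_id, client_id)

# 0.3.0 no longer matches the tenant by name for you; pick the ID explicitly.
tenants = k.get_tenants(id_token=access_token)
kobai_tenant_id = next(t["id"] for t in tenants if t["name"] == 'My Demo Tenant')

k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
```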
@@ -1,8 +1,6 @@
 from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 
-from sentence_transformers import SentenceTransformer, util
-
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_core.documents import Document
@@ -10,8 +8,9 @@ from langchain_core.retrievers import BaseRetriever
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda
 from langchain_core.vectorstores import InMemoryVectorStore
+import numpy as np
 
-from typing import Union, List
+from typing import List
 
 
 MESSAGE_SYSTEM_TEMPLATE = """
@@ -73,7 +72,7 @@ def format_docs(docs):
 def input_only(inpt):
     return inpt["question"]
 
-def followup_question(user_question, question_results, question_name, question_def, embedding_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, use_inmem_vectors=False, k=50):
+def followup_question(user_question, question_results, question_name, question_def, embedding_model: Embeddings, chat_model: BaseChatModel, use_inmem_vectors=False, k=50):
 
     row_texts = process_question_results(question_def, question_results)
     question_documents = [Document(page_content=r, metadata={"source": "kobai"}) for r in row_texts]
@@ -118,22 +117,13 @@ def init_question_search_index(tenant_questions, emb_model):
 
     q_ids = [q["id"] for q in tenant_questions]
     q_descs = [q["description"] for q in tenant_questions]
-
-    if isinstance(emb_model, SentenceTransformer):
-        q_vectors = emb_model.encode(q_descs)
-    else:
-        q_vectors = emb_model.embed_documents(q_descs)
-
+    q_vectors = emb_model.embed_documents(q_descs)
     return {"ids": q_ids, "descs": q_descs, "vectors": q_vectors}
 
 
 def question_search(search_text: str, search_index, emb_model, k: int):
-    if isinstance(emb_model, SentenceTransformer):
-        search_vec = emb_model.encode(search_text)
-    else:
-        search_vec = emb_model.embed_query(search_text)
+    search_vec = emb_model.embed_query(search_text)
     #search_vec = emb_model.encode(search_text)
-
     matches = __top_vector_matches(search_vec, search_index["vectors"], top=k)
 
     for mi, m in enumerate(matches):
@@ -142,13 +132,25 @@ def question_search(search_text: str, search_index, emb_model, k: int):
     return matches
 
 def __top_vector_matches(test_vec, options_list_vec, top=1):
-    scores_t = util.cos_sim(test_vec, options_list_vec)[0]
-    scores_l = scores_t.tolist()
-    scores_d = [{"index": i, "value": v} for i, v in enumerate(scores_l)]
-    sorted_d = sorted(scores_d, key=lambda i: i["value"], reverse=True)
-    top_d = sorted_d[0:top]
+    # Normalize the test vector
+    test_vec_norm = test_vec / np.linalg.norm(test_vec)
+    # Normalize the option vectors
+    options_norm = options_list_vec / np.linalg.norm(options_list_vec, axis=1, keepdims=True)
+
+    # Compute cosine similarity (dot product of normalized vectors)
+    cosine_similarities = np.dot(options_norm, test_vec_norm)
+
+    # Get indexes and similarity scores as dict
+    scores_d = [{"index": i, "value": float(v)} for i, v in enumerate(cosine_similarities)]
+
+    # Sort dict by similarity score descending
+    sorted_d = sorted(scores_d, key=lambda x: x["value"], reverse=True)
+
+    # Return top results
+    top_d = sorted_d[:top]
     return top_d
 
+
 def process_question_results(question_def, question_results):
 
     """
@@ -211,8 +213,9 @@ def process_question_results(question_def, question_results):
 
 
     concept_order = [max_src]
-    for t in concept_rels[max_src]["edges"]:
-        concept_order.append(t["dst"])
+    if max_src != "":
+        for t in concept_rels[max_src]["edges"]:
+            concept_order.append(t["dst"])
 
     for c in concept_props:
         if c not in concept_order:
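Note: `util.cos_sim` from sentence-transformers is replaced above by a plain NumPy computation in `__top_vector_matches`. A standalone check of the same ranking logic, using made-up 3-dimensional vectors in place of real embeddings:

```python
import numpy as np

test_vec = np.array([1.0, 0.0, 1.0])
options = np.array([
    [1.0, 0.0, 1.0],  # same direction as test_vec -> similarity 1.0
    [0.0, 1.0, 0.0],  # orthogonal -> similarity 0.0
    [1.0, 1.0, 1.0],  # partially aligned -> ~0.816
])

# Normalize, then cosine similarity reduces to a dot product, as in __top_vector_matches.
test_norm = test_vec / np.linalg.norm(test_vec)
options_norm = options / np.linalg.norm(options, axis=1, keepdims=True)
sims = np.dot(options_norm, test_norm)

ranked = sorted(({"index": i, "value": float(v)} for i, v in enumerate(sims)),
                key=lambda d: d["value"], reverse=True)
print(ranked[:2])  # [{'index': 0, 'value': 1.0}, {'index': 2, 'value': 0.816...}]
```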
@@ -3,9 +3,7 @@ from pyspark.sql import SparkSession
 
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
 from pyspark.sql import functions as F
-from sentence_transformers import SentenceTransformer
 from delta import DeltaTable
-from typing import Union
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
 from langchain_community.document_loaders import PySparkDataFrameLoader
@@ -145,13 +143,13 @@ def __generate_sentences_from_questions(tc: AIContext, debug):
     ss.sql(full_sql)
 
 
-def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Embeddings], replica_schema=None, batch_size=100000):
+def encode_to_delta_local(tc: AIContext, st_model: Embeddings, replica_schema=None, batch_size=100000):
     """
     Encode Semantic Data to Vectors in Delta Table
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    st_model (SentenceTransformer): A sentence_transformers model to use for encoding.
+    st_model (Embeddings): A langchain embedding model to use for encoding.
     replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
     """
@@ -174,12 +172,8 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     content_list = [r["content"] for r in sentences_df.collect()]
     id_list = [r["id"] for r in sentences_df.collect()]
 
-    if isinstance(st_model, SentenceTransformer):
-        vector_list = st_model.encode(
-            content_list, normalize_embeddings=True, show_progress_bar=True).tolist()
-    else:
-        vector_list = st_model.embed_documents(content_list)
-        for i, v in enumerate(vector_list):
+    vector_list = st_model.embed_documents(content_list)
+    for i, v in enumerate(vector_list):
         vector_list[i] = [float(x) for x in v]
     #vector_list = st_model.encode(
     #    content_list, normalize_embeddings=True)
@@ -214,13 +208,13 @@ def encode_to_delta_local(tc: AIContext, st_model: Union[SentenceTransformer, Em
     # """)
 
 
-def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+def rag_delta(tc: AIContext, emb_model: Embeddings, chat_model: BaseChatModel, question, k=5, replica_schema=None):
     """
     Run a RAG query using vectors in Delta table.
 
     Parameters:
     tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-    emb_model (UNION[SentenceTransformer, Embeddings]): A sentence_transformers or langchain embedding model to use for encoding the query.
+    emb_model (Embeddings): A langchain embedding model to use for encoding the query.
     chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
     question (str): The user's query.
     k (int) OPTIONAL: The number of RAG documents to retrieve.
@@ -233,10 +227,7 @@ def rag_delta(tc: AIContext, emb_model: Union[SentenceTransformer, Embeddings],
 
     ss = tc.spark_session
 
-    if isinstance(emb_model, SentenceTransformer):
-        vector_list = emb_model.encode(
-            question, normalize_embeddings=True).tolist()
-    elif isinstance(emb_model, Embeddings):
+    if isinstance(emb_model, Embeddings):
         vector_list = emb_model.embed_query(question)
     else:
         print("Invalid Embedding Model Type")
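Note: with the SentenceTransformer branches removed, `encode_to_delta_local` and `rag_delta` accept only LangChain `Embeddings`. A sketch via the `TenantClient` wrappers, reusing the Databricks endpoints from the README (any served embedding/chat pair should work):

```python
from databricks_langchain import DatabricksEmbeddings
from langchain_community.chat_models import ChatDatabricks

emb = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
chat = ChatDatabricks(endpoint="databricks-gpt-oss-20b")

# Encode tenant sentences into a Delta table of vectors...
k.rag_encode_to_delta_local(st_model=emb)

# ...then answer a question over those vectors.
print(k.rag_delta(emb_model=emb, chat_model=chat, question="Which owner owns the most sets?"))
```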
@@ -0,0 +1,66 @@
+from azure.identity import DeviceCodeCredential
+from azure.identity import OnBehalfOfCredential
+from azure.core.exceptions import AzureError
+
+def get_scope(client_id: str = None, target_client_id: str = None, scope: str = None):
+
+    """
+    Get the default scopes
+
+    Parameters:
+    client_id (str): Client ID or Application ID from app registration with IDM.
+    target_client_id (str): Kobai IDM client ID.
+    scope (str): Scope to be passed.
+    """
+    if scope is not None:
+        return scope
+
+    if target_client_id is None:
+        target_client_id = client_id
+
+    return f"openid profile offline_access api://{target_client_id}/Kobai.Access"
+
+def device_code(tenant_id: str, client_id: str, target_client_id: str = None, scope: str = None):
+
+    """
+    Authenticate using the device code flow and get the access token
+
+    Parameters:
+    tenant_id (str): Tenant ID or Directory ID for IDM.
+    client_id (str): Client ID or Application ID from app registration with IDM.
+    target_client_id (str): Kobai IDM client ID.
+    scope (str): Scope to be passed.
+    """
+    credential = DeviceCodeCredential(client_id=client_id, tenant_id=tenant_id)
+
+    try:
+        token = credential.get_token(get_scope(client_id, target_client_id, scope))
+        return token.token
+    except AzureError as e:
+        return e
+
+def onbehalf(tenant_id: str, client_id: str, client_secret: str, access_token: str, target_client_id: str = None, scope: str = None):
+
+    """
+    Authenticate using the on-behalf-of flow and get the access token
+
+    Parameters:
+    tenant_id (str): Tenant ID or Directory ID for IDM.
+    client_id (str): Client ID or Application ID from app registration with IDM.
+    client_secret (str): Client secret from app registration with IDM.
+    access_token (str): Access token to be exchanged.
+    target_client_id (str): Kobai IDM client ID.
+    scope (str): Scope to be passed.
+    """
+    credential = OnBehalfOfCredential(
+        tenant_id=tenant_id,
+        client_id=client_id,
+        client_secret=client_secret,
+        user_assertion=access_token
+    )
+
+    try:
+        token = credential.get_token(get_scope(client_id, target_client_id, scope))
+        return token.token
+    except AzureError as e:
+        return e
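Note: both helpers above return the token string on success but return (rather than raise) the `AzureError` on failure, so callers may want a type check. A usage sketch with placeholder IDs:

```python
from azure.core.exceptions import AzureError
from kobai import ms_authenticate

token = ms_authenticate.device_code(
    tenant_id="your_Entra_directory_id_here",
    client_id="your_Entra_app_id_here",
)
if isinstance(token, AzureError):
    raise token  # surface the failure instead of passing the exception around

# The default scope targets the app's own Kobai.Access permission.
print(ms_authenticate.get_scope(client_id="your_Entra_app_id_here"))
# openid profile offline_access api://your_Entra_app_id_here/Kobai.Access
```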
@@ -19,7 +19,10 @@ class TenantAPI:
         self.session = requests.Session()
 
         if token is not None:
-            self.session.headers.update({'Authorization': 'Bearer %s' % self.token})
+            if token.startswith('Bearer'):
+                self.session.headers.update({'Authorization': '%s' % self.token})
+            else:
+                self.session.headers.update({'Authorization': 'Bearer %s' % self.token})
 
         self.ssl_verify = verify
         self.session.verify = verify
@@ -112,7 +115,7 @@ class TenantAPI:
 
         if op_desc is None:
             op_desc = "operation"
-
+
         response = self.session.get(
             self.base_uri + uri,
             params=params,
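Note: the first hunk lets the constructor accept tokens that already carry the `Bearer ` prefix, which is how `use_access_token` now passes the `Authorization` response header straight through. Assuming the remaining constructor parameters keep their defaults, both forms below should produce the same session header:

```python
from kobai import tenant_api

# Either a bare token or a prefixed one ends up as 'Authorization: Bearer <token>'.
api_a = tenant_api.TenantAPI("abc123", "https://demo.kobai.io")
api_b = tenant_api.TenantAPI("Bearer abc123", "https://demo.kobai.io")
```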
@@ -3,15 +3,12 @@ import json
 import urllib
 import urllib.parse
 
-from azure.identity import DeviceCodeCredential
 from pyspark.sql import SparkSession
 
 from langchain_community.chat_models import ChatDatabricks
 from databricks_langchain import DatabricksEmbeddings
-from sentence_transformers import SentenceTransformer
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
-from typing import Union
 
 from . import spark_client, databricks_client, ai_query, tenant_api, ai_rag
 
@@ -64,83 +61,73 @@ class TenantClient:
     # MS Entra Auth
     ########################################
 
-    def authenticate(self, client_id: str, tenant_id: str, run_ai_init: bool = True, override_username: str = None):
+    def use_browser_token(self, access_token):
 
         """
         Authenticate the TenantClient with the Kobai instance. Returns nothing, but stores bearer token in client.
-
-        Limitations:
-        Currently supports only authentication via Microsoft Entra (AzureAD) using DecideCode OAuth flow.
+        This is a fall-back method for instances not using OAuth. It is inconvenient as a Kobai Bearer Token must be retrieved from the user's browser.
 
         Parameters:
-        client_id (str): Client ID or Application ID from app registration with IDM.
-        tenant_id (str): Tenant ID or Directory ID for IDM.
+        access_token (str): Bearer token for Kobai app session.
         """
+        self._init_post_auth_success(access_token)
 
-        credential = DeviceCodeCredential(client_id=client_id, tenant_id=tenant_id)
-
-        access = credential.authenticate()
-
-        oauth_token = access.serialize()
-        print(oauth_token)
-        user_name = json.loads(access.serialize())["username"]
-
-        if override_username is not None:
-            user_name = override_username
+    def use_access_token(self, access_token: str, id_token: str = None, tenant_id: str = None):
 
-        user_name_query_params={ 'userName' : user_name}
-        tenants_response = self.api_client._TenantAPI__run_get('/user-mgmt-svcs/auth/tenants?'+urllib.parse.urlencode(user_name_query_params))
-
-
-        tenant_list = json.loads(tenants_response.content.decode("utf-8"))
+        """
+        Authenticate the TenantClient with the Kobai instance. Returns nothing, but stores bearer token in client.
 
-        tenant_id = ""
-        for t in tenant_list:
-            if t["name"] == self.tenant_name:
-                tenant_id = t["id"]
+        Parameters:
+        access_token (str): Access token from the IDM server, used to obtain the Kobai access token.
+        id_token (str): ID token from the IDM server, used to obtain the on-behalf-of access token.
+        tenant_id (str): Kobai tenant ID.
+        """
 
         token_request_payload={
+            "tenantName" : self.tenant_name,
             "tenantId" : tenant_id,
-            "oauthToken" : oauth_token,
-            "userName" : user_name
+            "idToken" : id_token,
+            "accessToken" : access_token
         }
-        token_response = self.api_client._TenantAPI__run_post(
-            '/user-mgmt-svcs/auth/oauth/devicecode',
+
+        response = self.api_client._TenantAPI__run_post(
+            '/user-mgmt-svcs/auth/oauth/external/onbehalf/token',
             token_request_payload
         )
-        access_token = token_response.content.decode()
-        self.token = access_token
-
-        self.__api_init_session()
-        self.__set_tenant_solutionid()
-        if run_ai_init:
-            self.init_ai_components()
-
-        print("Authentication Successful.")
 
-    def authenticate_browser_token(self, access_token, run_ai_init: bool = True):
+        kb_access_token = response.headers.get('Authorization')
+        self.use_browser_token(kb_access_token)
+
+    def get_tenants(self, id_token: str = None):
 
         """
-        Authenticate the TenantClient with the Kobai instance. Returns nothing, but stores bearer token in client.
-        This is a fall-back method for instances not using OAuth. It is inconvenient as a Kobai Bearer Token must be retrieved from the users browser.
+        Get the tenants associated with the given IDM ID token. Returns the tenant list.
 
         Parameters:
-        access_token (str): Bearer token for Kobai app session.
+        id_token (str): ID token from the IDM server, used to obtain the user's tenants.
         """
 
-        self.token = access_token
-
-        self.__api_init_session()
-        self.__set_tenant_solutionid()
-        if run_ai_init:
-            self.init_ai_components()
+        if id_token is not None:
+            token_request_payload={
+                "idToken" : id_token
+            }
 
+            response = self.api_client._TenantAPI__run_post(
+                '/user-mgmt-svcs/auth/oauth/external/token/tenants',
+                token_request_payload
+            )
 
-        print("Authentication Successful.")
+            self.tenant_list = response.json()
+            return self.tenant_list
 
     def __api_init_session(self):
         self.api_client = tenant_api.TenantAPI(self.token, self.uri, verify=self.ssl_verify, proxies=self.proxies )
-
+
+    def _init_post_auth_success(self, access_token):
+        self.token = access_token
+        self.__api_init_session()
+        self.__set_tenant_solutionid()
+        print("Authentication Successful.")
 
     ########################################
     # Basic Config
@@ -452,7 +439,7 @@ class TenantClient:
         """
         ai_rag.generate_sentences(self.get_ai_context(), replica_schema=replica_schema, concept_white_list=concept_white_list, use_questions=use_questions, debug=debug)
 
-    def rag_encode_to_delta_local(self, st_model: Union[SentenceTransformer, Embeddings], replica_schema=None, batch_size=100000):
+    def rag_encode_to_delta_local(self, st_model: Embeddings, replica_schema=None, batch_size=100000):
         """
         Encode Semantic Data to Vectors in Delta Table
 
@@ -462,7 +449,7 @@ class TenantClient:
         """
         ai_rag.encode_to_delta_local(self.get_ai_context(), st_model=st_model, replica_schema=replica_schema, batch_size=batch_size)
 
-    def rag_delta(self, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
+    def rag_delta(self, emb_model: Embeddings, chat_model: BaseChatModel, question, k=5, replica_schema=None):
         """
         Run a RAG query using vectors in Delta table.
 
@@ -490,9 +477,7 @@ class TenantClient:
         """
 
         if question_id is None:
-
            suggestions = self.question_search(user_question, k=1)
-
            question_id = suggestions[0]["id"]
 
        question_results = self.run_question_remote(question_id, dynamic_filters=dynamic_filters)
@@ -502,26 +487,16 @@ class TenantClient:
 
         return ai_query.followup_question(user_question, question_results, question_name, question_def, self.embedding_model, self.chat_model, use_inmem_vectors=use_inmem_vectors, k=k)
 
-    def init_ai_components(self, embedding_model: Union[SentenceTransformer, Embeddings] = None, chat_model: BaseChatModel = None):
+    def init_ai_components(self, embedding_model: Embeddings, chat_model: BaseChatModel):
         """
         Set Chat and Embedding models for AI functions to use. If no arguments provided, Databricks hosted services are used.
 
         Parameters:
-        embedding_model (Union[SentenceTransformer, Embeddings]) OPTIONAL: A sentence_transformer or Langchain Embedding model.
-        chat_model (BaseChatModel) OPTIONAL: A Langchain BaseChatModel chat model.
+        embedding_model (Embeddings): A Langchain Embedding model.
+        chat_model (BaseChatModel): A Langchain BaseChatModel chat model.
         """
-
-        if embedding_model is not None:
-            self.embedding_model = embedding_model
-        else:
-            #self.embedding_model = SentenceTransformer("baai/bge-large-en-v1.5")
-            self.embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
-
-        if chat_model is not None:
-            self.chat_model = chat_model
-        else:
-            self.chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct")
-
+        self.embedding_model = embedding_model
+        self.chat_model = chat_model
         self.question_search_index = ai_query.init_question_search_index(self.list_questions(), self.embedding_model)
 
     def question_search(self, search_text, k: int = 1):
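Note: `init_ai_components` no longer falls back to Databricks-hosted models, so the AI entry points (`question_search`, `followup_question`) appear to require it to be called explicitly after authentication. A sketch of the minimal order of operations, reusing the `emb` and `chat` models from the sketches above:

```python
k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")

# Required before any AI calls; also builds the question search index.
k.init_ai_components(embedding_model=emb, chat_model=chat)

print(k.question_search("set ownership"))                      # best-matching saved question
print(k.followup_question("Which owner owns the most sets?"))  # runs it and answers over results
```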
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kobai-sdk
-Version: 0.2.9
+Version: 0.3.0
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -223,7 +223,6 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-community
 Requires-Dist: langchain_openai
 Requires-Dist: databricks_langchain
-Requires-Dist: sentence-transformers
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
@@ -249,21 +248,50 @@ from kobai import tenant_client, spark_client, databricks_client
 
 schema = 'main.demo'
 uri = 'https://demo.kobai.io'
-tenant_id = '1'
 tenant_name = 'My Demo Tenant'
-
-k = tenant_client.TenantClient(tenant_name, tenant_id, uri, schema)
+k = tenant_client.TenantClient(tenant_name, uri, schema)
 ```
 
 2. Authenticate with the Kobai instance:
+Authentication can be performed using several methods, such as the device code flow, the on-behalf-of flow, or a browser-based token.
+
+#### Authentication via device code
+Step 1: Obtain an access token from IDM (Identity and Access Management):
 
 ```python
-client_id = 'your_Entra_app_id_here'
+from kobai import ms_authenticate
+
 tenant_id = 'your_Entra_directory_id_here'
+client_id = 'your_Entra_app_id_here'
+
+access_token = ms_authenticate.device_code(tenant_id, client_id)
+```
+
+Step 2: Use the token to retrieve the list of Kobai tenants (unless the tenant ID is already known):
 
-k.authenticate(client_id, tenant_id)
+```python
+tenants = k.get_tenants(id_token=access_token)
+print(tenants)
 ```
 
+Step 3: Authenticate with Kobai for the specific tenant using the IDM access token:
+
+```python
+kobai_tenant_id = "5c1ba715-3961-4835-8a10-6f6f963b53ff"
+k.use_access_token(access_token=access_token, tenant_id=kobai_tenant_id)
+```
+
+At this point, authentication to the Kobai tenant is complete.
+
+#### Authentication via browser token
+
+```python
+k.use_browser_token(access_token="KOBAI_ACCESS_TOKEN_FROM_BROWSER")
+```
+
+#### Authentication via on-behalf-of flow
+Sample code demonstrating authentication via the on-behalf-of flow is available on request.
+
 3. Initialize a Spark client using your current `SparkSession`, and generate semantically-rich SQL views describing this Kobai tenant:
 
 ```python
@@ -305,68 +333,41 @@ kobai_query_name = "Set ownership"
 question_json = k.run_question_remote(k.get_question_id(kobai_query_name)) # By questionName
 ```
 
-3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using either Azure OpenAI, Databricks or a user-provided chat model.
-
-#### Using Azure OpenAI
-
-###### Authentication Methods:
-
-1. ApiKey
-
-```python
-from kobai import ai_query, llm_config
-import json
-
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", api_key="YOUR_API_KEY", deployment="gpt-4o-mini", llm_provider="azure_openai")
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-2. Azure Active Directory Authentication
+3. Ask a Follow-Up Question: Based on the initial results, you can ask a follow-up question using a user-provided chat and embedding model.
 
-Ensure that the logged-in tenant has access to Azure OpenAI.
-In case of databricks notebook, the logged in service principal should have access to Azure OpenAI.
+#### Using Databricks Embeddings and Chat Models in a Databricks Notebook
+Initialize the AI components by specifying the embedding and chat models, then ask follow-up questions interactively.
 
 ```python
-from kobai import ai_query, llm_config
+from databricks_langchain import DatabricksEmbeddings
+from langchain_community.chat_models import ChatDatabricks
 import json
 
-followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig(endpoint="https://kobaipoc.openai.azure.com/", deployment="gpt-4o-mini", llm_provider="azure_openai")
-llm_config.get_azure_ad_token()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
-print(output)
-```
-
-#### Using Databricks (Default Configuration)
-
-```python
-from kobai import ai_query, llm_config
-import json
+# Choose an embedding and chat model from Databricks model serving and initialize.
+embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
+chat_model = ChatDatabricks(endpoint="databricks-gpt-oss-20b")
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
 
 followup_question = "Which owner owns the most sets?"
-
-llm_config = llm_config.LLMConfig()
-
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, llm_config=llm_config)
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
-#### User Provided Chat Model
+#### Using Azure OpenAI Embeddings and Chat Models
 
 ```python
-from kobai import ai_query, llm_config
-import json
 from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+import json
 
 followup_question = "Which owner owns the most sets?"
 
-llm_config = llm_config.LLMConfig(debug=True)
+embedding_model = AzureOpenAIEmbeddings(
+model="text-embedding-3-small",
+azure_endpoint="https://kobaipoc.openai.azure.com/",
+api_key="YOUR_API_KEY",
+openai_api_version="2023-05-15"
+)
 
 chat_model = AzureChatOpenAI(
 azure_endpoint="https://kobaipoc.openai.azure.com/", azure_deployment="gpt-4o-mini",
@@ -375,7 +376,10 @@ openai_api_version="2024-02-15-preview",
 temperature=0.5,
 max_tokens=150,)
 
-output = ai_query.followup_question(followup_question, json.dumps(question_json), kobai_query_name, override_model=chat_model, llm_config=llm_config)
+k.init_ai_components(embedding_model=embedding_model, chat_model=chat_model)
+
+followup_question = "Which theme has the most sets?"
+output = k.followup_question(followup_question, question_id=k.get_question_id(kobai_query_name))
 print(output)
 ```
 
@@ -7,10 +7,10 @@ kobai/ai_query.py
 kobai/ai_rag.py
 kobai/databricks_client.py
 kobai/demo_tenant_client.py
+kobai/ms_authenticate.py
 kobai/spark_client.py
 kobai/tenant_api.py
 kobai/tenant_client.py
-kobai/test.py
 kobai_sdk.egg-info/PKG-INFO
 kobai_sdk.egg-info/SOURCES.txt
 kobai_sdk.egg-info/dependency_links.txt
@@ -7,7 +7,6 @@ langchain-core
 langchain-community
 langchain_openai
 databricks_langchain
-sentence-transformers
 
 [dev]
 black
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "kobai-sdk"
-version = "0.2.9"
+version = "0.3.0"
 description = "A package that enables interaction with a Kobai tenant."
 readme = "README.md"
 authors = [{ name = "Ryan Oattes", email = "ryan@kobai.io" }]
@@ -26,8 +26,7 @@ dependencies = [
     "langchain-core",
     "langchain-community",
     "langchain_openai",
-    "databricks_langchain",
-    "sentence-transformers"
+    "databricks_langchain"
 ]
 requires-python = ">=3.11"
 
@@ -1,5 +0,0 @@
-import llm_config, ai_query
-
-llm_config = llm_config.LLMConfig(api_key="sV9LuoA5n0PwqggMXOYMhhZlt56FpgnMXFohimPhD7Ug3CnBLbO8JQQJ99ALACYeBjFXJ3w3AAABACOGZm8X", llm_provider="azure_openai")
-llm_config.get_azure_ad_token()
-ai_query.followup_question_1(question="abc", data={}, question_name="sample", llm_config=llm_config)