kobai-sdk 0.2.8rc2__py3-none-any.whl → 0.2.8rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kobai-sdk might be problematic.

kobai/ai_rag.py CHANGED
@@ -1,15 +1,12 @@
 from kobai import tenant_client
 
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
-#from delta.tables import *
-from sentence_transformers import SentenceTransformer, util
-#from deltalake import DeltaTable
+from pyspark.sql import functions as F
+from sentence_transformers import SentenceTransformer
 from delta import DeltaTable
 from typing import Union
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.embeddings import Embeddings
-from langchain_core.documents import Document
-#from databricks_langchain import DatabricksEmbeddings, ChatDatabricks
 from langchain_community.document_loaders import PySparkDataFrameLoader
 from langchain import hub
 from langchain_core.output_parsers import StrOutputParser
@@ -18,14 +15,16 @@ from langchain_core.output_parsers import StrOutputParser
 
 
 
-def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, concept_white_list=None):
+def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, concept_white_list=None, use_questions=False):
 
     """
    Extract Semantic Data from Graph to Delta Table
 
    Parameters:
    tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
-   replica_schema (str): An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+   replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+   concept_white_list ([str]) OPTIONAL: A list of Domain and Concept names for extraction.
+   use_questions (bool) OPTIONAL: Extract facts from published Kobai questions.
    """
 
    if tc.spark_client is None:
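
The new use_questions flag is additive: concept extraction runs as before, and question-derived facts are appended afterward. A minimal usage sketch, assuming an authenticated TenantClient with an attached Spark client (the schema and concept names below are hypothetical):

# Hypothetical usage of the updated signature.
from kobai import ai_rag

tc = ...  # an authenticated TenantClient with a Spark client attached

ai_rag.generate_sentences(
    tc,
    replica_schema="main.kobai_replica",                   # hypothetical Unity Catalog schema
    concept_white_list=["Sales Customer", "Sales Order"],  # hypothetical "Domain Concept" names
    use_questions=True,                                    # also extract facts from published questions
)
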
@@ -37,8 +36,6 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, conc
    tenant_json = tc.get_tenant_config()
 
    concepts = __get_concept_metadata(tenant_json, tc.schema, tc.model_id, concept_white_list)
-   print(concepts)
-   print("")
 
    print("Dropping and Recreating the RAG Table")
    ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
@@ -50,17 +47,77 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, conc
 
    print("Running the Extraction")
    for sql_statement in sql_statements:
-       print(sql_statement)
-       print("")
        ss.sql(sql_statement)
 
+   if use_questions:
+       __generate_sentences_from_questions(tc)
+
    if replica_schema is not None:
        print("Replicating Schema")
        ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
        ss.sql(__replicate_to_catalog_sql(tc.schema, replica_schema, tc.model_id))
 
+def __generate_sentences_from_questions(tc: tenant_client.TenantClient):
+   ss = tc.spark_client.spark_session
+
+   print("Getting Question Data")
+
+   tenant_json = tc.get_tenant_config()
+
+   published_queries = []
+   for p in tenant_json["publishedAPIs"]:
+       published_queries.append(p["queryId"])
+
+   question_names = {}
+   for q in tenant_json["queries"]:
+       if q["id"] in published_queries:
+           question_names[q["id"]] = q["description"]
+
+   schemaV = StructType([
+       StructField("sentence",StringType(),True),
+       StructField("query_id", StringType(), True)
+   ])
+
+   sentences = []
+   for p in published_queries:
+       output = tc.run_question_remote(p)
+       for r in output:
+           sentence = f"For {question_names[p]}: "
+           for c in r:
+               sentence += f"The {c.replace('_', ' ')} is {r[c]}. "
+           sentences.append([sentence, p])
+
+
+   sentences_df = ss.createDataFrame(sentences, schemaV)
+   sentences_df = sentences_df.select(
+       F.col("sentence").alias("sentence"),
+       F.col("query_id").alias("concept_id"),
+       F.lit("q").alias("type"),
+   )
+
+   schema = tc.schema
+
+   view_name = f"rag_{tc.model_id}_question_sentences"
+   sentences_df.createOrReplaceTempView(view_name)
+
+   full_sql = f"INSERT INTO {schema}.rag_{tc.model_id} (content, concept_id, type)"
+   full_sql += f" SELECT sentence, concept_id, type FROM {view_name}"
+
+   ss.sql(full_sql)
+
+
+
 def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTransformer, replica_schema=None):
 
+   """
+   Encode Semantic Data to Vectors in Delta Table
+
+   Parameters:
+   tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
+   st_model (SentenceTransformer): A sentence_transformers model to use for encoding.
+   replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+   """
+
    if tc.spark_client is None:
        return None
 
@@ -76,7 +133,6 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran
    num_records = sentences_df.count()
    query_batch_size = 100000
 
-   #pool = model.start_multi_process_pool()
 
    for x in range(0, num_records, query_batch_size):
        print(f"Running Batch Starting at {x}")
@@ -84,14 +140,8 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran
        sentences_df = ss.sql(sentences_sql)
        content_list = [r["content"] for r in sentences_df.collect()]
        id_list = [r["id"] for r in sentences_df.collect()]
-       #num_records_batch = len(content_list)
-       #print("Done Getting Data")
 
-
        vector_list = st_model.encode(content_list, normalize_embeddings=True, show_progress_bar=True)
-       #vector_list = model.encode_multi_process(content_list, pool)
-
-       #print("Done Encoding")
 
        schemaV = StructType([
            StructField("id",IntegerType(),True),
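
Note that normalize_embeddings=True L2-normalizes each vector, which is what allows the retrieval SQL later in this file to rank by a plain dot product. A quick check of that property, assuming numpy and sentence_transformers are available:

# Sanity check (sketch): normalized embeddings have unit length.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical model choice
vec = model.encode(["a sample sentence"], normalize_embeddings=True)[0]
print(np.linalg.norm(vec))  # ~1.0, so dot product equals cosine similarity
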
@@ -123,6 +173,18 @@ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTran
 
 def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
 
+   """
+   Run a RAG query using vectors in Delta table.
+
+   Parameters:
+   tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
+   emb_model (UNION[SentenceTransformer, Embeddings]): A sentence_transformers or langchain embedding model to use for encoding the query.
+   chat_model (BaseChatModel): A langchain chat model to use in the RAG pipeline.
+   question (str): The user's query.
+   k (int) OPTIONAL: The number of RAG documents to retrieve.
+   replica_schema (str) OPTIONAL: An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
+   """
+
    schema = tc.schema
    if replica_schema is not None:
        schema = replica_schema
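
A sketch of wiring rag_delta together; the embedding and chat model classes below are illustrative assumptions (any SentenceTransformer/Embeddings and BaseChatModel implementations should fit the signature):

# Illustrative wiring; the model classes are assumptions, not SDK requirements.
from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI
from kobai import ai_rag

emb_model = SentenceTransformer("all-MiniLM-L6-v2")
chat_model = ChatOpenAI(model="gpt-4o-mini")

answer = ai_rag.rag_delta(tc, emb_model, chat_model, "Which customers placed orders?", k=5)
print(answer)
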
@@ -145,11 +207,8 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform
        print("Invalid Chat Model Type")
        return None
 
-   #print(vector_list)
    vector_list = [str(x) for x in vector_list]
-   #print(vector_list)
    vector_sql = ", ".join(vector_list)
-   #print(vector_sql)
 
    results = ss.sql(f"""
    SELECT content, reduce(zip_with(vector, cast(array({vector_sql}) as array<float>), (x,y) -> x*y), float(0.0), (acc,x) -> acc + x) score
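
Because the stored vectors and the query vector are both normalized, the reduce(zip_with(...)) expression is an elementwise multiply-and-sum, i.e. the dot product, which on unit vectors equals cosine similarity. The same computation in plain Python, as a sketch:

# Python equivalent of the SQL score expression above (sketch).
def score(stored_vector, query_vector):
    # zip_with(..., (x, y) -> x * y) then reduce(..., 0.0, (acc, x) -> acc + x)
    return sum(x * y for x, y in zip(stored_vector, query_vector))

print(score([0.6, 0.8], [0.8, 0.6]))  # 0.96 for two unit vectors
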
@@ -162,8 +221,6 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform
    documents = loader.load()
    docs_content = "\n\n".join(doc.page_content for doc in documents)
 
-   #print(docs_content)
-
    prompt = hub.pull("rlm/rag-prompt")
 
    output_parser = StrOutputParser()
@@ -179,34 +236,6 @@ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransform
 
    return response
 
-def dep_rag_delta(tc: tenant_client.TenantClient, st_model: SentenceTransformer, question, k=5, replica_schema=None):
-
-   schema = tc.schema
-   if replica_schema is not None:
-       schema = replica_schema
-
-   if tc.spark_client is None:
-       return None
-
-   ss = tc.spark_client.spark_session
-
-   vector_list = st_model.encode(question, normalize_embeddings=True).tolist()
-
-   #print(vector_list)
-   vector_list = [str(x) for x in vector_list]
-   #print(vector_list)
-   vector_sql = ", ".join(vector_list)
-   #print(vector_sql)
-
-   results = ss.sql(f"""
-   SELECT content, reduce(zip_with(vector, cast(array({vector_sql}) as array<float>), (x,y) -> x*y), float(0.0), (acc,x) -> acc + x) score
-   FROM {schema}.rag_{tc.model_id}
-   ORDER BY score DESC
-   LIMIT {k}
-   """)
-
-   return results
-
 def __create_rag_table_sql(schema, model_id):
    return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
 
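For concreteness, a sketch of what that DDL template renders to, using a hypothetical schema "main.kobai" and model id "42":

# Rendered output of the DDL template above (inputs hypothetical).
sql = __create_rag_table_sql("main.kobai", "42")
# CREATE OR REPLACE TABLE main.kobai.rag_42 (id BIGINT GENERATED BY DEFAULT AS IDENTITY,
#   content STRING, type string, concept_id string, vector ARRAY<FLOAT>)
#   TBLPROPERTIES (delta.enableChangeDataFeed = true)
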
@@ -219,10 +248,8 @@ def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
    statements = []
    for con in concepts:
        sql = f"'This is a {con['label']}. '"
-       #sql += " || 'It is identified by ' || split(cid._conceptid,'#')[1] || '. '"
        sql += " || 'It is identified by ' || cid._plain_conceptid || '. '"
 
-       #sql_from = f"{con['con_table_name']} cid"
        sql_from = f"(SELECT _conceptid, _plain_conceptid FROM {con['prop_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
        for prop in con["properties"]:
 
@@ -237,7 +264,6 @@ def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
        full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid, cid._plain_conceptid"
 
        statements.append(full_sql)
-       #test_df = spark.sql(full_sql)
    return statements
 
 def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
  def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
@@ -252,7 +278,6 @@ def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
252
278
 
253
279
  sql = f"'The {con['label']} identified by ' || rel._plain_conceptid"
254
280
  sql += f" || ' has a relationship called {rel['label']} that connects it to one or more {rel['target_con_label']} identified by '"
255
- #sql += " || concat_ws(', ', array_agg(split(value, '#')[1])) || '. '"
256
281
  sql += " || concat_ws(', ', array_agg(cid._plain_conceptid)) || '. '"
257
282
 
258
283
 
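The two generators above produce prose-like rows: one sentence per concept instance and one per relationship. A sketch of the resulting shapes for a hypothetical Sales domain:

# Hypothetical sentence shapes emitted by the literal and relation SQL above.
literal_sentence = "This is a Sales Customer. It is identified by C-100. "
relation_sentence = (
    "The Sales Customer identified by C-100 has a relationship called places "
    "that connects it to one or more Sales Order identified by O-1, O-2. "
)
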
@@ -274,15 +299,11 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
    }
 
    concepts = []
-   #parents = {}
    for d in tenant_json["domains"]:
        for c in d["concepts"]:
-           #if whitelist is not None and d["name"] + " " + c["label"] not in whitelist:
-           #    continue
            con_props = []
            for col in c["properties"]:
                con_props.append({
-                   #"col_name": d["name"] + "_" + c["label"] + "_" + col["label"],
                    "label": col["label"],
                    "name": f"{model_id}/{d['name']}/{c['label']}#{col['label']}"
                })
@@ -302,13 +323,9 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
            concepts.append({
                "uri": c["uri"],
                "label": d["name"] + " " + c["label"],
-               #"id_column": d["name"] + "_" + c["label"],
                "relations": con_rels,
                "properties": con_props,
                "parents": con_parents,
-               #"table_name": "data_" + k.model_id + "_" + d["name"] + "_" + c["label"] + "_w",
-               #"prop_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
-               #"con_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c",
                "prop_table_name": target_table_names[c["uri"]]["prop"],
                "con_table_name": target_table_names[c["uri"]]["con"]
            })
@@ -319,7 +336,6 @@ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
    for a in concepts:
        if a["uri"] == p:
            concepts[ci]["properties"].extend(a["properties"])
-           #concepts[ci]["properties"] = list(set(concepts[ci]["properties"]))
 
    out_concepts = []
    for c in concepts:
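
For reference, each record returned by __get_concept_metadata carries the keys visible above; a sketch with hypothetical values:

# Shape of one concept metadata record (values hypothetical, keys from the code above).
example_concept = {
    "uri": "...",                              # concept URI from the tenant config
    "label": "Sales Customer",                 # domain name + concept label
    "relations": [],                           # built from the concept's relations
    "properties": [{"label": "name", "name": "42/Sales/Customer#name"}],
    "parents": [],                             # parent URIs; their properties get merged in
    "prop_table_name": "main.kobai.data_42_Sales_Customer_np",  # hypothetical; looked up per URI
    "con_table_name": "main.kobai.data_42_Sales_Customer_c",    # hypothetical; looked up per URI
}
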
kobai_sdk-0.2.8rc3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kobai-sdk
-Version: 0.2.8rc2
+Version: 0.2.8rc3
 Summary: A package that enables interaction with a Kobai tenant.
 Author-email: Ryan Oattes <ryan@kobai.io>
 License: Apache License
@@ -211,7 +211,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.9
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pyspark
kobai_sdk-0.2.8rc3.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
 kobai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kobai/ai_query.py,sha256=fMTcfj-6Ma3FRB08VYEDj8PwOEOtFGsJHyQrha5yvPg,4512
-kobai/ai_rag.py,sha256=KbIlrbOX-0hbt7HaOh7nyzIrROGotGt0ghQSlzN6ZUA,13096
+kobai/ai_rag.py,sha256=TtUbUcSN9mIsauGyS_nw8j58T9jEd4OFiAwNvzo-rr8,13593
 kobai/databricks_client.py,sha256=fyqqMly2Qm0r1AHWsQjkYeNsDdH0G1JSgTkF9KJ55qA,2118
 kobai/demo_tenant_client.py,sha256=wlNc-bdI2wotRXo8ppUOalv4hYdBlek_WzJNARZV-AE,9293
 kobai/llm_config.py,sha256=ZFx81cUAOHYZgRoTkTY-utQYaWYlmR8773ZJpj74C1A,1900
 kobai/spark_client.py,sha256=opM_F-4Ut5Hq5zZjWMuLvUps9sDULvyPNZHXGL8dW1k,776
 kobai/tenant_api.py,sha256=9U6UbxpaAb-kpbuADXx3kbkNKaOzYy0I-GGwbpiCCOk,4212
 kobai/tenant_client.py,sha256=AyJ5R2oukEv3q1dcItpojvTUVp5-gwUKvyGjofjBKyc,41821
-kobai_sdk-0.2.8rc2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-kobai_sdk-0.2.8rc2.dist-info/METADATA,sha256=FiYuYjOjY5Hf5X58Cgv3Qhu_KnIWkeeCNmyzD8k-r4A,19204
-kobai_sdk-0.2.8rc2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kobai_sdk-0.2.8rc2.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
-kobai_sdk-0.2.8rc2.dist-info/RECORD,,
+kobai_sdk-0.2.8rc3.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+kobai_sdk-0.2.8rc3.dist-info/METADATA,sha256=f75oEdxRWLrr0bVmH1OvIlvc0KS9TrpNTh65eTlKX6k,19205
+kobai_sdk-0.2.8rc3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kobai_sdk-0.2.8rc3.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
+kobai_sdk-0.2.8rc3.dist-info/RECORD,,