kobai-sdk 0.2.8rc1__py3-none-any.whl → 0.2.8rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kobai-sdk might be problematic. Click here for more details.

kobai/ai_rag.py CHANGED
@@ -1,6 +1,33 @@
1
1
  from kobai import tenant_client
2
2
 
3
- def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
3
+ from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
4
+ #from delta.tables import *
5
+ from sentence_transformers import SentenceTransformer, util
6
+ #from deltalake import DeltaTable
7
+ from delta import DeltaTable
8
+ from typing import Union
9
+ from langchain_core.language_models.chat_models import BaseChatModel
10
+ from langchain_core.embeddings import Embeddings
11
+ from langchain_core.documents import Document
12
+ #from databricks_langchain import DatabricksEmbeddings, ChatDatabricks
13
+ from langchain_community.document_loaders import PySparkDataFrameLoader
14
+ from langchain import hub
15
+ from langchain_core.output_parsers import StrOutputParser
16
+
17
+
18
+
19
+
20
+
21
+ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None, concept_white_list=None):
22
+
23
+ """
24
+ Extract Semantic Data from Graph to Delta Table
25
+
26
+ Parameters:
27
+ tc (TenantClient): The Kobai tenant_client instance instantiated via the SDK.
28
+ replica_schema (str): An alternate schema (catalog.database) to create the Delta table. Useful when the base Kobai schema is not on a Unity Catalog.
29
+ """
30
+
4
31
  if tc.spark_client is None:
5
32
  return None
6
33
 
@@ -9,7 +36,9 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
9
36
  print("Getting Tenant Config")
10
37
  tenant_json = tc.get_tenant_config()
11
38
 
12
- concepts = __get_concept_metadata(tenant_json, tc.schema, tc.model_id)
39
+ concepts = __get_concept_metadata(tenant_json, tc.schema, tc.model_id, concept_white_list)
40
+ print(concepts)
41
+ print("")
13
42
 
14
43
  print("Dropping and Recreating the RAG Table")
15
44
  ss.sql(__create_rag_table_sql(tc.schema, tc.model_id))
@@ -21,6 +50,8 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
21
50
 
22
51
  print("Running the Extraction")
23
52
  for sql_statement in sql_statements:
53
+ print(sql_statement)
54
+ print("")
24
55
  ss.sql(sql_statement)
25
56
 
26
57
  if replica_schema is not None:
@@ -28,6 +59,154 @@ def generate_sentences(tc: tenant_client.TenantClient, replica_schema=None):
28
59
  ss.sql(__create_rag_table_sql(replica_schema, tc.model_id))
29
60
  ss.sql(__replicate_to_catalog_sql(tc.schema, replica_schema, tc.model_id))
30
61
 
62
+ def encode_to_delta_local(tc: tenant_client.TenantClient, st_model: SentenceTransformer, replica_schema=None):
63
+
64
+ if tc.spark_client is None:
65
+ return None
66
+
67
+ ss = tc.spark_client.spark_session
68
+
69
+ schema = tc.schema
70
+ if replica_schema is not None:
71
+ schema = replica_schema
72
+
73
+ sentences_sql = f"SELECT content FROM {schema}.rag_{tc.model_id}"
74
+ sentences_df = ss.sql(sentences_sql)
75
+
76
+ num_records = sentences_df.count()
77
+ query_batch_size = 100000
78
+
79
+ #pool = model.start_multi_process_pool()
80
+
81
+ for x in range(0, num_records, query_batch_size):
82
+ print(f"Running Batch Starting at {x}")
83
+ sentences_sql = f" SELECT id, content FROM {schema}.rag_{tc.model_id} ORDER BY id LIMIT {str(query_batch_size)} OFFSET {str(x)}"
84
+ sentences_df = ss.sql(sentences_sql)
85
+ content_list = [r["content"] for r in sentences_df.collect()]
86
+ id_list = [r["id"] for r in sentences_df.collect()]
87
+ #num_records_batch = len(content_list)
88
+ #print("Done Getting Data")
89
+
90
+
91
+ vector_list = st_model.encode(content_list, normalize_embeddings=True, show_progress_bar=True)
92
+ #vector_list = model.encode_multi_process(content_list, pool)
93
+
94
+ #print("Done Encoding")
95
+
96
+ schemaV = StructType([
97
+ StructField("id",IntegerType(),True),
98
+ StructField("vector", ArrayType(FloatType()), False)
99
+ ])
100
+
101
+ updated_list = [[r[0], r[1].tolist()] for r in zip(id_list, vector_list)]
102
+ updated_df = ss.createDataFrame(updated_list, schemaV)
103
+
104
+ target_table = DeltaTable.forName(ss, f"{schema}.rag_{tc.model_id}")
105
+
106
+ target_table.alias("t") \
107
+ .merge(
108
+ updated_df.alias("s"),
109
+ 't.id = s.id'
110
+ ) \
111
+ .whenMatchedUpdate(set = {"vector": "s.vector"}) \
112
+ .execute()
113
+
114
+ ss.sql(f"""
115
+ CREATE FUNCTION IF NOT EXISTS {schema}.cos_sim(a ARRAY<FLOAT>, b ARRAY<FLOAT>)
116
+ RETURNS FLOAT
117
+ LANGUAGE PYTHON
118
+ AS $$
119
+ import numpy as np
120
+ return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
121
+ $$
122
+ """)
123
+
124
+ def rag_delta(tc: tenant_client.TenantClient, emb_model: Union[SentenceTransformer, Embeddings], chat_model: BaseChatModel, question, k=5, replica_schema=None):
125
+
126
+ schema = tc.schema
127
+ if replica_schema is not None:
128
+ schema = replica_schema
129
+
130
+ if tc.spark_client is None:
131
+ print("Instantiate Spark Client First")
132
+ return None
133
+
134
+ ss = tc.spark_client.spark_session
135
+
136
+ if isinstance(emb_model, SentenceTransformer):
137
+ vector_list = emb_model.encode(question, normalize_embeddings=True).tolist()
138
+ elif isinstance(emb_model, Embeddings):
139
+ vector_list = emb_model.embed_query(question)
140
+ else:
141
+ print("Invalid Embedding Model Type")
142
+ return None
143
+
144
+ if not isinstance(chat_model, BaseChatModel):
145
+ print("Invalid Chat Model Type")
146
+ return None
147
+
148
+ #print(vector_list)
149
+ vector_list = [str(x) for x in vector_list]
150
+ #print(vector_list)
151
+ vector_sql = ", ".join(vector_list)
152
+ #print(vector_sql)
153
+
154
+ results = ss.sql(f"""
155
+ SELECT content, reduce(zip_with(vector, cast(array({vector_sql}) as array<float>), (x,y) -> x*y), float(0.0), (acc,x) -> acc + x) score
156
+ FROM {schema}.rag_{tc.model_id}
157
+ ORDER BY score DESC
158
+ LIMIT {k}
159
+ """)
160
+
161
+ loader = PySparkDataFrameLoader(ss, results, page_content_column="content")
162
+ documents = loader.load()
163
+ docs_content = "\n\n".join(doc.page_content for doc in documents)
164
+
165
+ #print(docs_content)
166
+
167
+ prompt = hub.pull("rlm/rag-prompt")
168
+
169
+ output_parser = StrOutputParser()
170
+
171
+ chain = prompt | chat_model | output_parser
172
+
173
+ response = chain.invoke(
174
+ {
175
+ "context": docs_content,
176
+ "question": question
177
+ }
178
+ )
179
+
180
+ return response
181
+
182
+ def dep_rag_delta(tc: tenant_client.TenantClient, st_model: SentenceTransformer, question, k=5, replica_schema=None):
183
+
184
+ schema = tc.schema
185
+ if replica_schema is not None:
186
+ schema = replica_schema
187
+
188
+ if tc.spark_client is None:
189
+ return None
190
+
191
+ ss = tc.spark_client.spark_session
192
+
193
+ vector_list = st_model.encode(question, normalize_embeddings=True).tolist()
194
+
195
+ #print(vector_list)
196
+ vector_list = [str(x) for x in vector_list]
197
+ #print(vector_list)
198
+ vector_sql = ", ".join(vector_list)
199
+ #print(vector_sql)
200
+
201
+ results = ss.sql(f"""
202
+ SELECT content, reduce(zip_with(vector, cast(array({vector_sql}) as array<float>), (x,y) -> x*y), float(0.0), (acc,x) -> acc + x) score
203
+ FROM {schema}.rag_{tc.model_id}
204
+ ORDER BY score DESC
205
+ LIMIT {k}
206
+ """)
207
+
208
+ return results
209
+
31
210
  def __create_rag_table_sql(schema, model_id):
32
211
  return f"CREATE OR REPLACE TABLE {schema}.rag_{model_id} (id BIGINT GENERATED BY DEFAULT AS IDENTITY, content STRING, type string, concept_id string, vector ARRAY<FLOAT>) TBLPROPERTIES (delta.enableChangeDataFeed = true)"
33
212
 
@@ -40,53 +219,66 @@ def __generate_sentence_sql_concept_literals(concepts, schema, model_id):
40
219
  statements = []
41
220
  for con in concepts:
42
221
  sql = f"'This is a {con['label']}. '"
43
- sql += " || 'It is identified by ' || split(cid._conceptid,'#')[1] || '. '"
222
+ #sql += " || 'It is identified by ' || split(cid._conceptid,'#')[1] || '. '"
223
+ sql += " || 'It is identified by ' || cid._plain_conceptid || '. '"
44
224
 
45
- sql_from = f"{con['con_table_name']} cid"
225
+ #sql_from = f"{con['con_table_name']} cid"
226
+ sql_from = f"(SELECT _conceptid, _plain_conceptid FROM {con['prop_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
46
227
  for prop in con["properties"]:
47
228
 
48
- sql_from += f" INNER JOIN {con['prop_table_name']} AS {prop['label']}"
229
+ sql_from += f" LEFT JOIN {con['prop_table_name']} AS {prop['label']}"
49
230
  sql_from += f" ON cid._conceptid = {prop['label']}._conceptid"
50
231
  sql_from += f" AND {prop['label']}.type = 'l'"
51
232
  sql_from += f" AND {prop['label']}.name = '{prop['name']}'"
52
233
 
53
- sql += f" || 'The {prop['label']} is ' || any_value({prop['label']}.value) IGNORE NULLS || '. '"
54
-
55
- full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
56
- full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid"
57
-
58
- statements.append(full_sql)
234
+ sql += f" || 'The {prop['label']} is ' || ifnull(any_value({prop['label']}.value) IGNORE NULLS, 'unknown') || '. '"
235
+
236
+ full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
237
+ full_sql += f" SELECT {sql} content, cid._conceptid concept_id, 'c' type FROM {sql_from} GROUP BY cid._conceptid, cid._plain_conceptid"
238
+
239
+ statements.append(full_sql)
59
240
  #test_df = spark.sql(full_sql)
60
241
  return statements
61
242
 
62
243
  def __generate_sentence_sql_concept_relations(concepts, schema, model_id):
63
244
  statements = []
64
245
  for con in concepts:
65
-
66
- sql_from = f"{con['prop_table_name']} "
67
246
  for rel in con["relations"]:
247
+ sql_from = f"{con['prop_table_name']} rel"
248
+ sql_from += f" INNER JOIN (SELECT _conceptid, _plain_conceptid FROM {rel['target_table_name']} GROUP BY _conceptid, _plain_conceptid) cid"
249
+ sql_from += f" ON rel.value = cid._conceptid"
250
+ sql_from += f" AND rel.type = 'r'"
251
+ sql_from += f" AND rel.name = '{rel['name']}'"
68
252
 
69
- sql = f"'The {con['label']} identified by ' || split(_conceptid,'#')[1]"
253
+ sql = f"'The {con['label']} identified by ' || rel._plain_conceptid"
70
254
  sql += f" || ' has a relationship called {rel['label']} that connects it to one or more {rel['target_con_label']} identified by '"
71
- sql += " || concat_ws(', ', array_agg(split(value, '#')[1])) || '. '"
255
+ #sql += " || concat_ws(', ', array_agg(split(value, '#')[1])) || '. '"
256
+ sql += " || concat_ws(', ', array_agg(cid._plain_conceptid)) || '. '"
72
257
 
73
258
 
74
259
  full_sql = f"INSERT INTO {schema}.rag_{model_id} (content, concept_id, type)"
75
- full_sql += f" SELECT {sql} content, _conceptid concept_id, 'e' type FROM {sql_from} GROUP BY _conceptid"
260
+ full_sql += f" SELECT {sql} content, rel._conceptid concept_id, 'e' type FROM {sql_from} GROUP BY rel._conceptid, rel._plain_conceptid"
76
261
 
77
262
  statements.append(full_sql)
78
263
  return statements
79
264
 
80
- def __get_concept_metadata(tenant_json, schema, model_id):
265
+ def __get_concept_metadata(tenant_json, schema, model_id, whitelist):
81
266
  target_concept_labels = {}
267
+ target_table_names = {}
82
268
  for d in tenant_json["domains"]:
83
269
  for c in d["concepts"]:
84
270
  target_concept_labels[c["uri"]] = d["name"] + " " + c["label"]
85
-
271
+ target_table_names[c["uri"]] = {
272
+ "prop": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
273
+ "con": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c"
274
+ }
275
+
86
276
  concepts = []
87
-
277
+ #parents = {}
88
278
  for d in tenant_json["domains"]:
89
279
  for c in d["concepts"]:
280
+ #if whitelist is not None and d["name"] + " " + c["label"] not in whitelist:
281
+ # continue
90
282
  con_props = []
91
283
  for col in c["properties"]:
92
284
  con_props.append({
@@ -96,18 +288,43 @@ def __get_concept_metadata(tenant_json, schema, model_id):
96
288
  })
97
289
  con_rels = []
98
290
  for rel in c["relations"]:
291
+ if whitelist is not None and target_concept_labels[rel["relationTypeUri"]] not in whitelist:
292
+ continue
99
293
  con_rels.append({
100
294
  "label": rel["label"],
101
295
  "name": f"{model_id}/{d['name']}/{c['label']}#{rel['label']}",
102
- "target_con_label": target_concept_labels[rel["relationTypeUri"]]
296
+ "target_con_label": target_concept_labels[rel["relationTypeUri"]],
297
+ "target_table_name": target_table_names[rel["relationTypeUri"]]["prop"]
103
298
  })
299
+ con_parents = []
300
+ for p in c["inheritedConcepts"]:
301
+ con_parents.append(p)
104
302
  concepts.append({
303
+ "uri": c["uri"],
105
304
  "label": d["name"] + " " + c["label"],
106
305
  #"id_column": d["name"] + "_" + c["label"],
107
306
  "relations": con_rels,
108
307
  "properties": con_props,
308
+ "parents": con_parents,
109
309
  #"table_name": "data_" + k.model_id + "_" + d["name"] + "_" + c["label"] + "_w",
110
- "prop_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
111
- "con_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c",
310
+ #"prop_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_np",
311
+ #"con_table_name": f"{schema}.data_{model_id}_{d['name']}_{c['label']}_c",
312
+ "prop_table_name": target_table_names[c["uri"]]["prop"],
313
+ "con_table_name": target_table_names[c["uri"]]["con"]
112
314
  })
113
- return concepts
315
+
316
+ for ci, c in enumerate(concepts):
317
+ if len(c["parents"]) > 0:
318
+ for p in c["parents"]:
319
+ for a in concepts:
320
+ if a["uri"] == p:
321
+ concepts[ci]["properties"].extend(a["properties"])
322
+ #concepts[ci]["properties"] = list(set(concepts[ci]["properties"]))
323
+
324
+ out_concepts = []
325
+ for c in concepts:
326
+ if whitelist is not None and c["label"] not in whitelist:
327
+ continue
328
+ out_concepts.append(c)
329
+
330
+ return out_concepts
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kobai-sdk
3
- Version: 0.2.8rc1
3
+ Version: 0.2.8rc2
4
4
  Summary: A package that enables interaction with a Kobai tenant.
5
5
  Author-email: Ryan Oattes <ryan@kobai.io>
6
6
  License: Apache License
@@ -222,6 +222,7 @@ Requires-Dist: azure-storage-blob
222
222
  Requires-Dist: langchain-core
223
223
  Requires-Dist: langchain-community
224
224
  Requires-Dist: langchain_openai
225
+ Requires-Dist: sentence_transformers
225
226
  Provides-Extra: dev
226
227
  Requires-Dist: black; extra == "dev"
227
228
  Requires-Dist: bumpver; extra == "dev"
@@ -1,14 +1,14 @@
1
1
  kobai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  kobai/ai_query.py,sha256=fMTcfj-6Ma3FRB08VYEDj8PwOEOtFGsJHyQrha5yvPg,4512
3
- kobai/ai_rag.py,sha256=y_N7qVu8HfUHHZPIyQSO7L995RBeNtDhva7U5HBHSfY,5063
3
+ kobai/ai_rag.py,sha256=KbIlrbOX-0hbt7HaOh7nyzIrROGotGt0ghQSlzN6ZUA,13096
4
4
  kobai/databricks_client.py,sha256=fyqqMly2Qm0r1AHWsQjkYeNsDdH0G1JSgTkF9KJ55qA,2118
5
5
  kobai/demo_tenant_client.py,sha256=wlNc-bdI2wotRXo8ppUOalv4hYdBlek_WzJNARZV-AE,9293
6
6
  kobai/llm_config.py,sha256=ZFx81cUAOHYZgRoTkTY-utQYaWYlmR8773ZJpj74C1A,1900
7
7
  kobai/spark_client.py,sha256=opM_F-4Ut5Hq5zZjWMuLvUps9sDULvyPNZHXGL8dW1k,776
8
8
  kobai/tenant_api.py,sha256=9U6UbxpaAb-kpbuADXx3kbkNKaOzYy0I-GGwbpiCCOk,4212
9
9
  kobai/tenant_client.py,sha256=AyJ5R2oukEv3q1dcItpojvTUVp5-gwUKvyGjofjBKyc,41821
10
- kobai_sdk-0.2.8rc1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
11
- kobai_sdk-0.2.8rc1.dist-info/METADATA,sha256=nZTb2svQk01wT32zBZDPKgeYnSAx22YER5YLHEIjoAQ,19167
12
- kobai_sdk-0.2.8rc1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
- kobai_sdk-0.2.8rc1.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
14
- kobai_sdk-0.2.8rc1.dist-info/RECORD,,
10
+ kobai_sdk-0.2.8rc2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
11
+ kobai_sdk-0.2.8rc2.dist-info/METADATA,sha256=FiYuYjOjY5Hf5X58Cgv3Qhu_KnIWkeeCNmyzD8k-r4A,19204
12
+ kobai_sdk-0.2.8rc2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
+ kobai_sdk-0.2.8rc2.dist-info/top_level.txt,sha256=ns1El3BrTTHKvoAgU1XtiSaVIudYeCXbEEUVY8HFDZ4,6
14
+ kobai_sdk-0.2.8rc2.dist-info/RECORD,,