MindsDB 25.5.4.2__py3-none-any.whl → 25.6.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB has been flagged as possibly problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +50 -26
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/a2a/task_manager.py +68 -6
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +29 -33
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +145 -40
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +352 -83
- mindsdb/integrations/libs/api_handler.py +279 -57
- mindsdb/integrations/libs/base.py +185 -30
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +226 -196
- mindsdb/interfaces/agents/constants.py +8 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +375 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +38 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +222 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +66 -25
- mindsdb/interfaces/knowledge_base/evaluate.py +516 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +31 -36
- mindsdb/interfaces/skills/sql_agent.py +113 -86
- mindsdb/interfaces/storage/db.py +242 -82
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +13 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +257 -257
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +76 -68
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/mindsdb/interfaces/knowledge_base/evaluate.py
@@ -0,0 +1,516 @@
+import json
+import math
+import time
+from typing import List
+
+import pandas as pd
+import datetime as dt
+
+from mindsdb.api.executor.sql_query.result_set import ResultSet
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb.utilities import log
+
+from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
+
+logger = log.getLogger(__name__)
+
+
+GENERATE_QA_SYSTEM_PROMPT = """
+Your task is to generate question and answer pairs for a search engine.
+The search engine will take your query and return a list of documents.
+You will be given a text and you need to generate a question that can be answered using the information in the text.
+Your questions will be used to evaluate the search engine.
+Question should always have enough clues to identify the specific text that this question is generated from.
+Never ask questions like "What license number is associated with Amend 6" because Amend 6 could be found in many documents and the question is not specific enough.
+Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
+Example output 2: {\"query\": \"What is the name of the river in Paris?\", \"reference_answer\": \"Seine\"}
+Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
+The question should be answerable without the text, but the answer should be present in the text.
+Return ONLY a json response. No other text.
+"""
+
+
+def calc_entropy(values: List[float]) -> float:
+    """
+    Alternative of scipy.stats.entropy, to not add `scipy` dependency
+    :param values: Input distribution
+    :return: The calculated entropy.
+    """
+    # normalize & filter
+    total = sum(values)
+    values = [i / total for i in values if i > 0]
+    # calc
+    return -sum([pk * math.log(pk) for pk in values])
+
+
+class EvaluateBase:
+    DEFAULT_QUESTION_COUNT = 20
+    DEFAULT_SAMPLE_SIZE = 10000
+
+    def __init__(self, session, knowledge_base):
+        self.kb = knowledge_base
+        self.name = knowledge_base._kb.name
+        self.session = session
+
+        self._llm_client = None
+
+    def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
+        # generate test data from sample
+        raise NotImplementedError
+
+    def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
+        # create evaluate metric from test data
+        raise NotImplementedError
+
+    def _set_llm_client(self, llm_params: dict):
+        """
+        Logic to get LLM setting:
+        - first get `llm` setting of ‘evaluate’ command
+        - if not defined, look at the knowledge base reranker config
+        """
+        if llm_params is None:
+            llm_params = self.kb._kb.params.get("reranking_model")
+
+        self.llm_client = LLMClient(llm_params)
+
+    def generate_test_data(self, gen_params: dict) -> pd.DataFrame:
+        # Extract source data (from users query or from KB itself) and call `generate` to get test data
+
+        if "from_sql" in gen_params:
+            # get data from sql
+            query = parse_sql(gen_params["from_sql"])
+            if not isinstance(query, Select) or query.from_table is None:
+                raise ValueError(f"Query not supported {gen_params['from_sql']}")
+
+            dn, table_name = self._get_dn_table(query.from_table)
+            query.from_table = table_name
+            query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
+
+            response = dn.query(query=query, session=self.session)
+            df = response.data_frame
+
+            if "content" not in df.columns:
+                raise ValueError("`content` column isn't found in source data")
+
+            df.rename(columns={"content": "chunk_content"}, inplace=True)
+        else:
+            # get data from knowledge base
+            df = self.kb.select_query(
+                Select(
+                    targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.DEFAULT_SAMPLE_SIZE)
+                )
+            )
+
+        if "count" in gen_params:
+            number_of_questions = gen_params["count"]
+        else:
+            number_of_questions = self.DEFAULT_QUESTION_COUNT
+
+        number_of_questions = min(number_of_questions, len(df))
+        sampled_df = df.sample(n=number_of_questions)
+
+        return self.generate(sampled_df)
+
+    def read_from_table(self, test_table: Identifier) -> pd.DataFrame:
+        # read data from table
+
+        dn, table_name = self._get_dn_table(test_table)
+
+        query = Select(
+            targets=[Star()],
+            from_table=table_name,
+        )
+        response = dn.query(query=query, session=self.session)
+        return response.data_frame
+
+    def _get_dn_table(self, table_name: Identifier):
+        if len(table_name.parts) < 2:
+            raise ValueError(f"Can't find database, table name must have at least 2 parts: {table_name}")
+
+        integration_name = table_name.parts[0]
+        table_name = Identifier(parts=table_name.parts[1:])
+        dn = self.session.datahub.get(integration_name)
+        return dn, table_name
+
+    def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
+        # save data to table
+
+        dn, table_name = self._get_dn_table(table_name)
+
+        data = ResultSet.from_df(df)
+
+        dn.create_table(
+            table_name=table_name,
+            result_set=data,
+            is_replace=is_replace,
+            is_create=True,
+            raise_if_exists=False,
+        )
+
+    def run_evaluate(self, params: dict) -> pd.DataFrame:
+        # evaluate function entry point
+
+        self._set_llm_client(params.get("llm"))
+
+        if "test_table" not in params:
+            raise ValueError('The table with has to be defined in "test_table" parameter')
+
+        test_table = params["test_table"]
+
+        if isinstance(test_table, str):
+            test_table = Identifier(test_table)
+
+        if "generate_data" in params:
+            # generate question / answers using llm
+            gen_params = params["generate_data"]
+            if not isinstance(gen_params, dict):
+                gen_params = {}
+            test_data = self.generate_test_data(gen_params)
+
+            self.save_to_table(test_table, test_data, is_replace=True)
+        else:
+            test_data = self.read_from_table(test_table)
+
+        if params.get("evaluate", True) is False:
+            # no evaluate is required
+            return pd.DataFrame()
+
+        scores = self.evaluate(test_data)
+        scores["name"] = self.name
+        scores["created_at"] = dt.datetime.now()
+
+        # save scores
+        if "save_to" in params:
+            to_table = params["save_to"]
+            if isinstance(to_table, str):
+                to_table = Identifier(to_table)
+            self.save_to_table(to_table, scores)
+
+        return scores
+
+    @staticmethod
+    def run(session, kb_table, params) -> pd.DataFrame:
+        # choose the evaluator version according to the 'version' parameter in config
+
+        evaluate_version = params.get("version", "doc_id")
+
+        if evaluate_version == "llm_relevancy":
+            cls = EvaluateRerank
+        elif evaluate_version == "doc_id":
+            cls = EvaluateDocID
+        else:
+            raise NotImplementedError(f"Version of evaluator is not implemented: {evaluate_version}")
+
+        return cls(session, kb_table).run_evaluate(params)
+
+
+class EvaluateRerank(EvaluateBase):
+    """
+    Rank responses from KB using LLM (by calling KB reranker function)
+    """
+
+    TOP_K = 10
+
+    def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
+        qa_data = []
+        count_errors = 0
+        for chunk_content in sampled_df["chunk_content"]:
+            try:
+                question, answer = self.generate_question_answer(chunk_content)
+            except ValueError as e:
+                # allow some numbers of error
+                count_errors += 1
+                if count_errors > 5:
+                    raise e
+                continue
+
+            qa_data.append({"text": chunk_content, "question": question, "answer": answer})
+
+        df = pd.DataFrame(qa_data)
+        df["id"] = df.index
+        return df
+
+    def generate_question_answer(self, text: str) -> (str, str):
+        messages = [
+            {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
+            {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
+        ]
+        answer = self.llm_client.completion(messages)
+        try:
+            output = json.loads(answer)
+        except json.JSONDecodeError:
+            raise ValueError(f"Could not parse response from LLM: {answer}")
+
+        if "query" not in output or "reference_answer" not in output:
+            raise ValueError("Cant find question/answer in LLM response")
+
+        return output.get("query"), output.get("reference_answer")
+
+    def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
+        json_to_log_list = []
+        questions = test_data.to_dict("records")
+
+        for i, item in enumerate(questions):
+            question = item["question"]
+            ground_truth = item["answer"]
+
+            start_time = time.time()
+            logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
+            df_answers = self.kb.select_query(Select(targets=[Identifier("chunk_content")], limit=Constant(self.TOP_K)))
+            query_time = time.time() - start_time
+
+            proposed_responses = list(df_answers["chunk_content"])
+
+            # generate answer using llm
+            relevance_score_list = self.kb.score_documents(question, proposed_responses, self.llm_client.params)
+
+            # set binary relevancy
+            binary_relevancy_list = [1 if score >= 0.5 else 0 for score in relevance_score_list]
+
+            # calculate first relevant position
+            first_relevant_position = next((i for i, x in enumerate(binary_relevancy_list) if x == 1), None)
+            json_to_log = {
+                "question": question,
+                "ground_truth": ground_truth,
+                # "relevancy_at_k": relevancy_at_k,
+                "binary_relevancy_list": binary_relevancy_list,
+                "relevance_score_list": relevance_score_list,
+                "first_relevant_position": first_relevant_position,
+                "query_time": query_time,
+            }
+            json_to_log_list.append(json_to_log)
+
+        evaluation_results = self.evaluate_retrieval_metrics(json_to_log_list)
+        return pd.DataFrame([evaluation_results])
+
+    def evaluate_retrieval_metrics(self, json_to_log_list):
+        """
+        Computes retrieval evaluation metrics from the result log.
+
+        Metrics computed:
+        - Average Relevancy (mean soft relevance score)
+        - Average Relevancy@k (soft score)
+        - Average First Relevant Position
+        - Mean Reciprocal Rank (MRR)
+        - Hit@k
+        - Binary Precision@k
+        - Average Entropy of Relevance Scores
+        - Average nDCG
+
+        Args:
+            json_to_log_list (list): List of evaluation logs per query.
+
+        Returns:
+            dict: A dictionary containing all computed metrics.
+        """
+
+        mrr_list = []
+        hit_at_k_matrix = []
+        binary_precision_at_k_matrix = []
+        ndcg_list = []
+        entropy_list = []
+
+        total_relevancy = 0
+        relevance_score_matrix = []
+        first_relevant_positions = []
+
+        for item in json_to_log_list:
+            binary_relevancy = item["binary_relevancy_list"]
+            relevance_scores = item["relevance_score_list"]
+
+            # Skip if empty
+            if not relevance_scores:
+                continue
+
+            # Mean relevancy per query
+            query_relevancy = sum(relevance_scores) / len(relevance_scores)
+            total_relevancy += query_relevancy
+
+            # Build score matrix for later average@k
+            relevance_score_matrix.append(relevance_scores)
+
+            # First relevant position
+            pos = item["first_relevant_position"]
+            if pos is not None:
+                first_relevant_positions.append(pos)
+
+            # MRR
+            reciprocal_rank = 1 / (pos + 1) if pos is not None else 0
+            mrr_list.append(reciprocal_rank)
+
+            # Hit@k and Binary Precision@k
+            hit_row = []
+            precision_row = []
+            for k in range(1, len(binary_relevancy) + 1):
+                hit = int(any(binary_relevancy[:k]))
+                precision = sum(binary_relevancy[:k]) / k
+                hit_row.append(hit)
+                precision_row.append(precision)
+            hit_at_k_matrix.append(hit_row)
+            binary_precision_at_k_matrix.append(precision_row)
+
+            # Entropy
+
+            entropy = calc_entropy(relevance_scores) if len(relevance_scores) > 1 else 0
+            entropy_list.append(entropy)
+
+            # nDCG
+            def dcg(scores):
+                return sum(score / math.log2(idx + 2) for idx, score in enumerate(scores))
+
+            ideal = sorted(relevance_scores, reverse=True)
+            actual_dcg = dcg(relevance_scores)
+            ideal_dcg = dcg(ideal)
+            ndcg = actual_dcg / ideal_dcg if ideal_dcg > 0 else 0
+            ndcg_list.append(ndcg)
+
+        # Aggregated metrics
+        num_queries = len(json_to_log_list)
+        average_relevancy = total_relevancy / num_queries if num_queries else 0
+
+        # Relevancy@k
+        average_relevance_score_by_k = []
+        if relevance_score_matrix:
+            relevance_score_matrix = list(zip(*relevance_score_matrix))
+            for col in relevance_score_matrix:
+                avg_k = sum(col) / len(col)
+                average_relevance_score_by_k.append(round(avg_k, 2))
+
+        average_first_relevant_position = (
+            sum(first_relevant_positions) / len(first_relevant_positions) if first_relevant_positions else None
+        )
+
+        mean_mrr = sum(mrr_list) / len(mrr_list) if mrr_list else 0
+        hit_at_k_avg = [round(sum(col) / len(col), 2) for col in zip(*hit_at_k_matrix)] if hit_at_k_matrix else []
+        binary_precision_at_k_avg = (
+            [round(sum(col) / len(col), 2) for col in zip(*binary_precision_at_k_matrix)]
+            if binary_precision_at_k_matrix
+            else []
+        )
+        avg_entropy = sum(entropy_list) / len(entropy_list) if entropy_list else 0
+        avg_ndcg = sum(ndcg_list) / len(ndcg_list) if ndcg_list else 0
+
+        avg_query_time = sum(item["query_time"] for item in json_to_log_list) / num_queries
+
+        return {
+            "avg_relevancy": average_relevancy,
+            "avg_relevance_score_by_k": average_relevance_score_by_k,
+            "avg_first_relevant_position": average_first_relevant_position,
+            "mean_mrr": mean_mrr,
+            "hit_at_k": hit_at_k_avg,
+            "bin_precision_at_k": binary_precision_at_k_avg,
+            "avg_entropy": avg_entropy,
+            "avg_ndcg": avg_ndcg,
+            "avg_query_time": avg_query_time,
+        }
+
+
+class EvaluateDocID(EvaluateBase):
+    """
+    Checks if ID in response from KB is matched with doc ID in test dataset
+    """
+
+    TOP_K = 100
+
+    def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
+        if "id" not in sampled_df.columns:
+            raise ValueError("'id' column is required for generating test dataset")
+
+        qa_data = []
+        count_errors = 0
+        for _, item in sampled_df.iterrows():
+            chunk_content = item["chunk_content"]
+            try:
+                question, answer = self.generate_question_answer(chunk_content)
+            except ValueError as e:
+                # allow some numbers of error
+                count_errors += 1
+                if count_errors > 5:
+                    raise e
+                continue
+
+            qa_data.append({"text": chunk_content, "question": question, "answer": answer, "doc_id": item["id"]})
+        if len(qa_data) == 0:
+            raise ValueError("No data in generated test dataset")
+        df = pd.DataFrame(qa_data)
+        return df
+
+    def generate_question_answer(self, text: str) -> (str, str):
+        messages = [
+            {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
+            {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
+        ]
+        answer = self.llm_client.completion(messages)
+        try:
+            output = json.loads(answer)
+        except json.JSONDecodeError:
+            raise ValueError(f"Could not parse response from LLM: {answer}")
+
+        if "query" not in output or "reference_answer" not in output:
+            raise ValueError("Cant find question/answer in LLM response")
+
+        return output.get("query"), output.get("reference_answer")
+
+    def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
+        stats = []
+        questions = test_data.to_dict("records")
+
+        for i, item in enumerate(questions):
+            question = item["question"]
+            doc_id = item["doc_id"]
+
+            start_time = time.time()
+            logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
+            df_answers = self.kb.select_query(
+                Select(targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.TOP_K))
+            )
+            query_time = time.time() - start_time
+
+            retrieved_doc_ids = list(df_answers["id"])
+
+            if doc_id in retrieved_doc_ids:
+                doc_found = True
+                doc_position = retrieved_doc_ids.index(doc_id)
+            else:
+                doc_found = False
+                doc_position = -1
+
+            stats.append(
+                {
+                    "question": question,
+                    "doc_id": doc_id,
+                    "doc_found": doc_found,
+                    "doc_position": doc_position,
+                    "query_time": query_time,
+                }
+            )
+
+        evaluation_results = self.summarize_results(stats)
+        return pd.DataFrame([evaluation_results])
+
+    def summarize_results(self, stats):
+        total_questions = len(stats)
+        total_found = sum([1 for stat in stats if stat["doc_found"]])
+
+        accurate_in_top_10 = sum([1 for stat in stats if stat["doc_found"] and stat["doc_position"] < 10])
+
+        # calculate recall curve by position
+        recall_curve = {}
+        for i in range(self.TOP_K):
+            recall_curve[i] = sum([1 for stat in stats if stat["doc_found"] and stat["doc_position"] == i])
+        # convert to proportion of total questions
+        for i in range(self.TOP_K):
+            recall_curve[i] = recall_curve[i] / total_questions
+        # calculate cumulative recall
+        cumulative_recall = {}
+        for i in range(self.TOP_K):
+            cumulative_recall[i] = sum([recall_curve[j] for j in range(i + 1)])
+
+        avg_query_time = sum(item["query_time"] for item in stats) / total_questions
+        return {
+            "total": total_questions,
+            "total_found": total_found,
+            "retrieved_in_top_10": accurate_in_top_10,
+            "cumulative_recall": cumulative_recall,
+            "avg_query_time": avg_query_time,
+        }
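The new `evaluate.py` module is driven entirely by the `params` dict handed to `EvaluateBase.run`: `version` selects `EvaluateDocID` (the default) or `EvaluateRerank`, `generate_data` builds a question/answer test set with the LLM, `test_table` is where that set is written or read, and `evaluate`/`save_to` control scoring and persistence. Below is a minimal sketch of how a caller might drive it; the `session` and `kb_table` arguments, the database/table names, and the commented-out LLM settings are illustrative assumptions, not values taken from this diff.

```python
# Hypothetical usage sketch for the new knowledge-base evaluator.
# `session` and `kb_table` are assumed to be a MindsDB session controller and a
# knowledge-base table object obtained elsewhere; they are not constructed here.
from mindsdb_sql_parser import Identifier

from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase


def evaluate_kb(session, kb_table):
    params = {
        "version": "doc_id",  # or "llm_relevancy" to score with EvaluateRerank
        "generate_data": {"count": 20},  # or {"from_sql": "SELECT id, content FROM my_db.docs"}
        # table names are placeholders; _get_dn_table() requires "<integration>.<table>"
        "test_table": Identifier(parts=["my_db", "kb_test_data"]),
        "evaluate": True,  # False would only generate and store the test set
        "save_to": Identifier(parts=["my_db", "kb_eval_scores"]),  # optional
        # "llm": {"provider": "openai", "model_name": "gpt-4o"},  # else the KB reranker config is used
    }
    scores = EvaluateBase.run(session, kb_table, params)
    # one-row DataFrame: recall/position stats for "doc_id", relevancy metrics for "llm_relevancy"
    return scores
```

In the released package this is presumably wired to a SQL-level evaluate command rather than called directly; the sketch only illustrates the parameter names the module actually reads.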
--- /dev/null
+++ b/mindsdb/interfaces/knowledge_base/llm_client.py
@@ -0,0 +1,75 @@
+import copy
+import os
+from typing import List
+
+from openai import OpenAI, AzureOpenAI
+
+from mindsdb.integrations.utilities.handler_utils import get_api_key
+from mindsdb.utilities.config import config
+
+
+class LLMClient:
+    """
+    Class for accession to LLM.
+    It chooses openai client or litellm handler depending on the config
+    """
+
+    def __init__(self, llm_params: dict = None):
+        params = copy.deepcopy(config.get("default_llm", {}))
+
+        if llm_params:
+            params.update(llm_params)
+
+        self.params = params
+
+        self.provider = params.get("provider", "openai")
+
+        if "api_key" not in params:
+            params["api_key"] = get_api_key(self.provider, params, strict=False)
+
+        if self.provider == "azure_openai":
+            azure_api_key = params.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY")
+            azure_api_endpoint = params.get("base_url") or os.environ.get("AZURE_OPENAI_ENDPOINT")
+            azure_api_version = params.get("api_version") or os.environ.get("AZURE_OPENAI_API_VERSION")
+            self._llm_client = AzureOpenAI(
+                api_key=azure_api_key, azure_endpoint=azure_api_endpoint, api_version=azure_api_version, max_retries=2
+            )
+        elif self.provider == "openai":
+            openai_api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY")
+            base_url = params.get("base_url")
+            self.client = OpenAI(api_key=openai_api_key, base_url=base_url, max_retries=2)
+
+        else:
+            # try to use litellm
+            from mindsdb.api.executor.controllers.session_controller import SessionController
+
+            session = SessionController()
+            module = session.integration_controller.get_handler_module("litellm")
+
+            if module is None or module.Handler is None:
+                raise ValueError(f'Unable to use "{self.provider}" provider. Litellm handler is not installed')
+
+            self.client = module.Handler
+
+    def completion(self, messages: List[dict]) -> str:
+        """
+        Call LLM completion and get response
+        """
+        params = self.params
+
+        if self.provider in ("azure_openai", "openai"):
+            response = self.client.chat.completions.create(
+                model=params["model_name"],
+                messages=messages,
+            )
+            return response.choices[0].message.content
+        else:
+            kwargs = params.copy()
+            model = kwargs.pop("model_name")
+
+            base_url = params.pop("base_url", None)
+            if base_url is not None:
+                kwargs["api_base"] = base_url
+
+            response = self.client.completion(model=f"{self.provider}/{model}", messages=messages, args=kwargs)
+            return response.choices[0].message.content
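The companion `llm_client.py` keeps the evaluator decoupled from any one provider: settings come from the `default_llm` config section, overridden by whatever dict is passed in, and any provider other than `openai`/`azure_openai` falls through to the litellm handler. A rough sketch of direct use follows; the provider, model name, and key are placeholders, not defaults shipped in this diff.

```python
# Hypothetical sketch of calling the new LLMClient directly.
from mindsdb.interfaces.knowledge_base.llm_client import LLMClient

client = LLMClient(
    {
        "provider": "openai",  # "azure_openai" or a litellm-routed provider also works
        "model_name": "gpt-4o",  # placeholder; completion() passes it as the model argument
        "api_key": "sk-...",  # placeholder; otherwise resolved via get_api_key()/environment
    }
)

reply = client.completion(
    [
        {"role": "system", "content": "You are a terse assistant."},
        {"role": "user", "content": "Say hello."},
    ]
)
print(reply)
```

This mirrors how the evaluator builds its question/answer pairs: `generate_question_answer()` sends `GENERATE_QA_SYSTEM_PROMPT` plus the chunk text through the same `completion()` call.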