bisheng_langchain-0.3.0rc0-py3-none-any.whl → bisheng_langchain-0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. bisheng_langchain/chat_models/host_llm.py +1 -1
  2. bisheng_langchain/document_loaders/elem_unstrcutured_loader.py +5 -3
  3. bisheng_langchain/gpts/agent_types/llm_functions_agent.py +7 -1
  4. bisheng_langchain/gpts/assistant.py +8 -5
  5. bisheng_langchain/gpts/auto_optimization.py +28 -27
  6. bisheng_langchain/gpts/auto_tool_selected.py +14 -15
  7. bisheng_langchain/gpts/load_tools.py +53 -1
  8. bisheng_langchain/gpts/prompts/__init__.py +4 -2
  9. bisheng_langchain/gpts/prompts/assistant_prompt_base.py +1 -0
  10. bisheng_langchain/gpts/prompts/assistant_prompt_cohere.py +19 -0
  11. bisheng_langchain/gpts/prompts/opening_dialog_prompt.py +1 -1
  12. bisheng_langchain/gpts/tools/api_tools/__init__.py +1 -1
  13. bisheng_langchain/gpts/tools/api_tools/base.py +3 -3
  14. bisheng_langchain/gpts/tools/api_tools/flow.py +19 -7
  15. bisheng_langchain/gpts/tools/api_tools/macro_data.py +175 -4
  16. bisheng_langchain/gpts/tools/api_tools/openapi.py +101 -0
  17. bisheng_langchain/gpts/tools/api_tools/sina.py +2 -2
  18. bisheng_langchain/gpts/tools/code_interpreter/tool.py +118 -39
  19. bisheng_langchain/rag/__init__.py +5 -0
  20. bisheng_langchain/rag/bisheng_rag_pipeline.py +320 -0
  21. bisheng_langchain/rag/bisheng_rag_pipeline_v2.py +359 -0
  22. bisheng_langchain/rag/bisheng_rag_pipeline_v2_cohere_raw_prompting.py +376 -0
  23. bisheng_langchain/rag/bisheng_rag_tool.py +288 -0
  24. bisheng_langchain/rag/config/baseline.yaml +86 -0
  25. bisheng_langchain/rag/config/baseline_caibao.yaml +82 -0
  26. bisheng_langchain/rag/config/baseline_caibao_knowledge_v2.yaml +110 -0
  27. bisheng_langchain/rag/config/baseline_caibao_v2.yaml +112 -0
  28. bisheng_langchain/rag/config/baseline_demo_v2.yaml +92 -0
  29. bisheng_langchain/rag/config/baseline_s2b_mix.yaml +88 -0
  30. bisheng_langchain/rag/config/baseline_v2.yaml +90 -0
  31. bisheng_langchain/rag/extract_info.py +38 -0
  32. bisheng_langchain/rag/init_retrievers/__init__.py +4 -0
  33. bisheng_langchain/rag/init_retrievers/baseline_vector_retriever.py +61 -0
  34. bisheng_langchain/rag/init_retrievers/keyword_retriever.py +65 -0
  35. bisheng_langchain/rag/init_retrievers/mix_retriever.py +103 -0
  36. bisheng_langchain/rag/init_retrievers/smaller_chunks_retriever.py +92 -0
  37. bisheng_langchain/rag/prompts/__init__.py +9 -0
  38. bisheng_langchain/rag/prompts/extract_key_prompt.py +34 -0
  39. bisheng_langchain/rag/prompts/prompt.py +47 -0
  40. bisheng_langchain/rag/prompts/prompt_cohere.py +111 -0
  41. bisheng_langchain/rag/qa_corpus/__init__.py +0 -0
  42. bisheng_langchain/rag/qa_corpus/qa_generator.py +143 -0
  43. bisheng_langchain/rag/rerank/__init__.py +5 -0
  44. bisheng_langchain/rag/rerank/rerank.py +48 -0
  45. bisheng_langchain/rag/rerank/rerank_benchmark.py +139 -0
  46. bisheng_langchain/rag/run_qa_gen_web.py +47 -0
  47. bisheng_langchain/rag/run_rag_evaluate_web.py +55 -0
  48. bisheng_langchain/rag/scoring/__init__.py +0 -0
  49. bisheng_langchain/rag/scoring/llama_index_score.py +91 -0
  50. bisheng_langchain/rag/scoring/ragas_score.py +183 -0
  51. bisheng_langchain/rag/utils.py +181 -0
  52. bisheng_langchain/retrievers/ensemble.py +2 -1
  53. bisheng_langchain/vectorstores/elastic_keywords_search.py +2 -1
  54. {bisheng_langchain-0.3.0rc0.dist-info → bisheng_langchain-0.3.1.dist-info}/METADATA +1 -1
  55. {bisheng_langchain-0.3.0rc0.dist-info → bisheng_langchain-0.3.1.dist-info}/RECORD +57 -22
  56. bisheng_langchain/gpts/prompts/base_prompt.py +0 -1
  57. {bisheng_langchain-0.3.0rc0.dist-info → bisheng_langchain-0.3.1.dist-info}/WHEEL +0 -0
  58. {bisheng_langchain-0.3.0rc0.dist-info → bisheng_langchain-0.3.1.dist-info}/top_level.txt +0 -0
bisheng_langchain/rag/qa_corpus/qa_generator.py
@@ -0,0 +1,143 @@
+ import os
+ import random
+ import json
+ import copy
+ import pandas as pd
+ from loguru import logger
+ from tqdm import tqdm
+ from langchain.document_loaders import PyPDFLoader
+ from langchain_core.prompts import PromptTemplate
+ from bisheng_langchain.document_loaders import ElemUnstructuredLoader
+ from ragas.trainset import TrainsetGenerator
+
+
+ prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+ {context}
+
+ Question: {question}
+ Helpful Answer:"""
+ PROMPT = PromptTemplate(
+     template=prompt_template, input_variables=["context", "question"]
+ )
+
+
+ class RagQAGenerator(object):
+
+     def __init__(self,
+                  corpus_folder,
+                  qa_gen_folder,
+                  unstructured_api_url="https://bisheng.dataelem.com/api/v1/etl4llm/predict",
+                  model_name="gpt-4-0125-preview"):
+         self.unstructured_api_url = unstructured_api_url
+         self.corpus_folder = corpus_folder
+         self.qa_gen_folder = qa_gen_folder
+         self.model_name = model_name
+         if not os.path.exists(self.qa_gen_folder):
+             os.makedirs(self.qa_gen_folder)
+
+     def generate(self):
+         for file_name in tqdm(os.listdir(self.corpus_folder)):
+             file_path = os.path.join(self.corpus_folder, file_name)
+             logger.info(f'{file_name} generate qa start ...')
+             # only consider pdf file
+             if file_name.endswith('.pdf'):
+                 self.generate_qa_each_file(file_path)
+             else:
+                 continue
+
+     def generate_qa_each_file(self, file_path, train_size=100):
+         file_name = os.path.basename(file_path)
+         loader = ElemUnstructuredLoader(file_name=file_name,
+                                         file_path=file_path,
+                                         unstructured_api_url=self.unstructured_api_url)
+         documents = loader.load()
+         for doc in documents:
+             doc.metadata = dict()
+         logger.info(f'documents: {len(documents)}')
+
+         trainsetgenerator = TrainsetGenerator.from_default(
+             openai_generator_llm=self.model_name,
+             openai_filter_llm=self.model_name)
+         trainset = trainsetgenerator.generate(documents, train_size=train_size)
+
+         save_path = os.path.join(self.qa_gen_folder, os.path.splitext(file_name)[0] + '_qa_gen.xlsx')
+         df = trainset.to_pandas()
+         df.to_excel(save_path, index=False)
+         return save_path
+
+     def statistic_qa(self):
+         total_qa_num = 0
+         all_qa_info = dict()
+         for file_name in os.listdir(self.qa_gen_folder):
+             file_path = os.path.join(self.qa_gen_folder, file_name)
+             if file_name.endswith('.xlsx'):
+                 df = pd.read_excel(file_path)
+                 qa_info = df.to_dict('records')
+                 logger.info(f'{file_name} qa num: {len(qa_info)}')
+                 total_qa_num += len(qa_info)
+                 all_qa_info[file_name] = qa_info
+         logger.info(f'total_file_num: {len(list(all_qa_info.keys()))}, total_qa_num: {total_qa_num}')
+         return all_qa_info
+
+     def format_qa_for_sft(self, min_context_num=3, max_context_num=7):
+         random.seed(123)
+         all_qa_info = self.statistic_qa()
+         train_samples = []
+         test_samples = []
+         for file_name in all_qa_info:
+             # each file qa
+             qa_info = all_qa_info[file_name]
+             if len(qa_info) == 0:
+                 continue
+             contexts = []
+             for qa in qa_info:
+                 ground_truth_context = str(eval(qa['ground_truth_context'])[0])
+                 contexts.append(ground_truth_context)
+
+             random.shuffle(qa_info)
+             for i, qa in enumerate(qa_info):
+                 question = qa['question']
+                 ground_truth_context = str(eval(qa['ground_truth_context'])[0])
+                 ground_truth = str(eval(qa['ground_truth'])[0])
+
+                 # add other distractor contexts
+                 random_number = random.randint(
+                     min(min_context_num, len(contexts)),
+                     min(max_context_num, len(contexts))
+                 )
+                 random_context = random.sample(contexts, random_number)
+                 if ground_truth_context in random_context:
+                     random_context.remove(ground_truth_context)
+                 # insert the ground-truth context at a random position among the other contexts
+                 insert_position = random.randint(0, len(random_context))
+                 random_context.insert(insert_position, ground_truth_context)
+
+                 random_context = '\n\n'.join(random_context)
+                 prompt = PROMPT.format(context=random_context, question=question)
+                 each_sample = {
+                     'instruction': '',
+                     'input': prompt,
+                     'output': ground_truth,
+                     'history': []
+                 }
+                 if i < 0.9 * len(qa_info):
+                     train_samples.append(each_sample)
+                 else:
+                     test_samples.append(each_sample)
+
+         logger.info(f'train_samples: {len(train_samples)} test_samples: {len(test_samples)}')
+         save_folder = os.path.dirname(self.qa_gen_folder)
+         with open(os.path.join(save_folder, f'train_samples_ganrao_chunk{max_context_num+1}.json'), 'w') as f:
+             json.dump(train_samples, f, indent=2, ensure_ascii=False)
+         with open(os.path.join(save_folder, f'test_samples_ganrao_chunk{max_context_num+1}.json'), 'w') as f:
+             json.dump(test_samples, f, indent=2, ensure_ascii=False)
+
+
+ if __name__ == '__main__':
+     corpus_folder = '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
+     qa_gen_folder = '/home/public/rag_benchmark_v1.0/rag_qa_gen_filter'
+     generator = RagQAGenerator(corpus_folder=corpus_folder, qa_gen_folder=qa_gen_folder)
+     # generator.generate()
+     # generator.statistic_qa()
+     generator.format_qa_for_sft(min_context_num=5, max_context_num=11)
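For orientation, a minimal usage sketch of the new RagQAGenerator (not part of the diff; the folder paths are hypothetical placeholders, and it assumes an OpenAI key is configured for the ragas generator):

from bisheng_langchain.rag.qa_corpus.qa_generator import RagQAGenerator

# Hypothetical folders: PDFs to mine for QA pairs and an output folder for the generated Excel files.
generator = RagQAGenerator(corpus_folder='./corpus_pdfs', qa_gen_folder='./qa_gen_output')
generator.generate()       # writes <name>_qa_gen.xlsx for every PDF in corpus_folder
generator.statistic_qa()   # logs per-file and total QA counts
generator.format_qa_for_sft(min_context_num=3, max_context_num=7)  # dumps train/test JSON into the parent of qa_gen_folder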
bisheng_langchain/rag/rerank/__init__.py
@@ -0,0 +1,5 @@
+ from .rerank import CustomReranker
+
+ __all__ = [
+     'CustomReranker',
+ ]
bisheng_langchain/rag/rerank/rerank.py
@@ -0,0 +1,48 @@
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+
+ class CustomReranker:
+
+     def __init__(self, model_path, device_id='cuda:0', threshold=0.0):
+         self.device_id = device_id
+         self.threshold = threshold
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+         self.rank_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device_id)
+         self.rank_model.eval()
+
+     def match_score(self, chunk, query):
+         """
+         Use the rerank model to score the similarity between the query and a chunk.
+         """
+         pairs = [[query, chunk]]
+
+         with torch.no_grad():
+             inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to(self.device_id)
+             scores = self.rank_model(**inputs, return_dict=True).logits.view(-1, ).float()
+             scores = torch.sigmoid(scores)
+             scores = scores.cpu().numpy()
+
+         return scores[0]
+
+     def sort_and_filter(self, query, all_chunks):
+         """
+         Use the rerank model to sort (and threshold-filter) all chunks.
+         """
+         chunk_match_score = []
+         for index, chunk in enumerate(all_chunks):
+             chunk_text = chunk.page_content
+             chunk_match_score.append(self.match_score(chunk_text, query))
+
+         sorted_res = sorted(enumerate(chunk_match_score), key=lambda x: -x[1])
+         remain_chunks = [all_chunks[elem[0]] for elem in sorted_res if elem[1] >= self.threshold]
+         if not remain_chunks:
+             remain_chunks = [all_chunks[sorted_res[0][0]]]
+
+         # for index, chunk in enumerate(remain_chunks):
+         #     print('query:', query)
+         #     print('chunk_text:', chunk.page_content)
+         #     print('socre:', sorted_res[index][1])
+         #     print('***********')
+
+         return remain_chunks
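A minimal sketch of how the new CustomReranker could be plugged into a retrieval step (not part of the diff; the model path, device, threshold and documents are hypothetical, and it assumes a locally downloaded BGE-style reranker checkpoint and a CUDA device):

from langchain_core.documents import Document
from bisheng_langchain.rag.rerank import CustomReranker

# Hypothetical local checkpoint, e.g. bge-reranker-large downloaded from the Hub.
reranker = CustomReranker(model_path='./models/bge-reranker-large', device_id='cuda:0', threshold=0.3)

docs = [Document(page_content='2023 revenue was 1.2B RMB.'),
        Document(page_content='The company has 3,000 employees.')]
# Chunks scoring below the sigmoid threshold are dropped, but at least one chunk is always kept.
top_docs = reranker.sort_and_filter('What was the revenue in 2023?', docs)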
bisheng_langchain/rag/rerank/rerank_benchmark.py
@@ -0,0 +1,139 @@
+ import json
+ import os
+ import numpy as np
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained('/home/public/llm/bge-reranker-large')
+ model = AutoModelForSequenceClassification.from_pretrained('/home/public/llm/bge-reranker-large').to('cuda:2')
+ model.eval()
+
+
+ def min_edit_distance(a, b):
+     dp = [[0 for i in range(len(b) + 1)] for j in range(len(a) + 1)]
+     for i in range(len(a) + 1):
+         dp[i][0] = i
+     for j in range(len(b) + 1):
+         dp[0][j] = j
+     for i in range(1, len(a) + 1):
+         for j in range(1, len(b) + 1):
+             if a[i - 1] == b[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1]
+             else:
+                 dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + 1)
+     return dp[-1][-1]
+
+
+ def is_matched(text0, text1, thrd=10):
+     text0.replace(" ", "").replace("\n", "")
+     text1.replace(" ", "").replace("\n", "")
+     dist = min_edit_distance(text0, text1)
+     if dist < thrd:
+         return True
+     return False
+
+
+ def match_score(chunk, query):
+     """
+     Use the rerank model to score the similarity between the query and a chunk.
+     """
+     pairs = [[query, chunk]]
+
+     with torch.no_grad():
+         inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to('cuda:2')
+         scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
+         scores = torch.sigmoid(scores)
+         scores = scores.cpu().numpy()
+
+     return scores[0]
+
+
+ def sort_filter_all_chunks_method1(d, th=0.0):
+     """
+     Use the rerank model to sort all chunks.
+     """
+     # the answer keywords have already been extracted
+     query = d['question']
+     all_chunks = d['all_chunks']
+
+     chunk_match_score = []
+     for index, chunk in enumerate(all_chunks):
+         chunk_text = chunk['text']
+         chunk_match_score.append(match_score(chunk_text, query))
+
+     sorted_res = sorted(enumerate(chunk_match_score), key=lambda x: -x[1])
+     print('-----------')
+     print(sorted_res)
+     remain_chunks = [all_chunks[elem[0]] for elem in sorted_res if elem[1] >= th]
+     if not remain_chunks:
+         remain_chunks = [all_chunks[sorted_res[0][0]]]
+
+     # for index, chunk in enumerate(remain_chunks):
+     #     print('query:', query)
+     #     print('chunk_text:', chunk['text'])
+     #     print('socre:', sorted_res[index][1])
+     #     print('***********')
+
+     d['all_chunks'] = remain_chunks
+
+
+ def calc_precision_recall(d):
+     """
+     Compute the scores.
+     """
+     d_ves = d["all_chunks"][:10]
+     d_es = d["all_chunks"][10:]
+     all_chunks = []
+     for i in range(len(d_es)):
+         all_chunks.append(d_es[i])
+         all_chunks.append(d_ves[i])
+     all_chunks.extend(d_ves[i+1:])
+     d["all_chunks"] = all_chunks
+
+     sort_filter_all_chunks_method1(d)
+     NCHUNK = len(d["chunks"])
+     NCHUNK_ALL = len(d["all_chunks"])
+     print('chunks:', NCHUNK, 'all_chunks:', NCHUNK_ALL)
+
+     scores = np.zeros((NCHUNK, NCHUNK_ALL))
+     for j in range(NCHUNK):
+         for i in range(NCHUNK):
+             if d["chunks"][j]["text"] == d["all_chunks"][i]['text']:
+                 scores[j][i] = 1
+             elif abs(len(d["chunks"][j]["text"]) - len(d["all_chunks"][i]['text'])) > 10:
+                 scores[j][i] = 0
+             elif is_matched(d["chunks"][j]["text"], d["all_chunks"][i]['text']):
+                 scores[j][i] = 1
+
+     N_gt = NCHUNK
+     N_all = NCHUNK_ALL
+     if NCHUNK != 0:
+         N_right = sum(scores.max(axis=1))
+     else:
+         N_right = 0
+     recall = 0 if N_gt == 0 else N_right / N_gt
+     return recall, N_gt, N_right, N_all
+
+
+ with open('zhaogushu_retriever_gt_convert_key.json', 'r') as f:
+     retriever_gt_list = json.load(f)
+ mFieldRecall = 0
+ total_N_gt, total_N_right, total_N_all = 0, 0, 0
+ D_SCORES = {}
+ nquestion = 0
+ for d in retriever_gt_list:
+     recall, N_gt, N_right, N_all = calc_precision_recall(d)
+     mFieldRecall += recall
+     D_SCORES[d["question"]] = {'recall': recall}
+     total_N_gt += N_gt
+     total_N_right += N_right
+     total_N_all += N_all
+     nquestion += 1
+
+ mFieldRecall = 0 if nquestion == 0 else mFieldRecall / nquestion
+ mMethodRecall = total_N_right / total_N_gt
+
+ print(f'mFieldRecall: {mFieldRecall * 100:.2f} %')
+ print(f'mMethodRecall: {mMethodRecall * 100:.2f} %')
+ print(f'total_N_right: {total_N_right}, total_N_gt: {total_N_gt}, total_N_all: {total_N_all}')
+ print(f'nquestion: {nquestion}, mean_N_right: {total_N_right / nquestion}, mean_N_gt: {total_N_gt / nquestion}, mean_N_all: {total_N_all / nquestion}')
bisheng_langchain/rag/run_qa_gen_web.py
@@ -0,0 +1,47 @@
+ import os
+ import tempfile
+ import gradio as gr
+ from bisheng_langchain.rag.qa_corpus.qa_generator import RagQAGenerator
+
+
+ tmpdir = '/home/public/rag_benchmark_v1.0/tmp'
+ if not os.path.exists(tmpdir):
+     os.makedirs(tmpdir)
+
+ qa_gen_folder = '/home/public/rag_benchmark_v1.0/rag_qa_gen_demo'
+ if not os.path.exists(qa_gen_folder):
+     os.makedirs(qa_gen_folder)
+ model_name = "gpt-4-0125-preview"
+ unstructured_api_url = "https://bisheng.dataelem.com/api/v1/etl4llm/predict"
+ generator = RagQAGenerator(corpus_folder='',
+                            qa_gen_folder=qa_gen_folder,
+                            unstructured_api_url=unstructured_api_url,
+                            model_name=model_name)
+
+
+ def qa_gen_run(intput_file, gen_qa_num):
+     gen_qa_num = int(gen_qa_num)
+     file_path = intput_file.name
+     output_file = generator.generate_qa_each_file(file_path, train_size=gen_qa_num)
+     return output_file
+
+
+ with tempfile.TemporaryDirectory(dir=tmpdir) as tmpdir:
+     with gr.Blocks(css='#margin-top {margin-top: 15px} #center {text-align: center;} #description {text-align: center}') as demo:
+         with gr.Row(elem_id='center'):
+             gr.Markdown('# Bisheng qa auto generation Demo')
+
+         with gr.Row(elem_id='description'):
+             gr.Markdown("""Qa generation for anything.""")
+
+         with gr.Row():
+             intput_file = gr.components.File(label='FlowFile')
+             gen_qa_num = gr.Textbox(label='生成的问题数量', value=10, interactive=True, lines=2)
+
+         with gr.Row():
+             with gr.Column():
+                 btn0 = gr.Button('Run Qa Gen')
+                 out0 = gr.components.File(label='FlowFile')
+                 btn0.click(fn=qa_gen_run, inputs=[intput_file, gen_qa_num], outputs=out0)
+
+     demo.launch(server_name='192.168.106.20', server_port=9118, share=True)
bisheng_langchain/rag/run_rag_evaluate_web.py
@@ -0,0 +1,55 @@
+ import os
+ import gradio as gr
+ import pandas as pd
+ from pathlib import Path
+ from gradio import components
+ from bisheng_langchain.rag.scoring.ragas_score import RagScore
+
+
+ save_folder = '/home/public/rag_benchmark_v1.0/rag_score_demo'
+ if not os.path.exists(save_folder):
+     os.makedirs(save_folder)
+
+
+ def rag_evaluate(excel_file, metric='answer_recall_bisheng', batch_size=5):
+     excel_path = excel_file.name
+     df = pd.read_excel(excel_path)
+     if '问题类型' not in df.columns:
+         df['问题类型'] = len(df['问题'].tolist()) * ['QA']
+         df.to_excel(excel_path, index=False)
+     params = {
+         'excel_path': excel_path,
+         'save_path': save_folder,
+         'question_column': '问题',
+         'query_type_column': '问题类型',
+         'gt_column': '人工标注',
+         'answer_column': '模型回答',
+         'metrics': [metric],
+         'batch_size': int(batch_size),
+     }
+     rag_score = RagScore(**params)
+     rag_score.score()
+
+     output_path = Path(save_folder) / f"{Path(excel_path).stem}_score.xlsx"
+     return str(output_path)
+
+
+ if __name__ == '__main__':
+     title = """毕昇QA问答自动评估系统"""
+     with gr.Blocks() as demo:
+         gr.Markdown(title)
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 with gr.Row():
+                     eval_intput_file = gr.components.File(label='FlowFile')
+                 with gr.Column():
+                     metric_options = ["answer_recall_bisheng", "answer_correctness_bisheng"]
+                     metric = gr.components.Dropdown(label="评估方法", choices=metric_options, default=metric_options[0], interactive=True)
+                     # metric = gr.Textbox(label='评估方法', value='answer_recall_bisheng', interactive=True, lines=2)
+                     batch_size = gr.Textbox(label='批评估大小', value=5, interactive=True, lines=2)
+                     btn0 = gr.Button('Run Evaluation')
+                     eval_out_file = gr.components.File(label='FlowFile')
+
+         btn0.click(fn=rag_evaluate, inputs=[eval_intput_file, metric, batch_size], outputs=[eval_out_file])
+     demo.queue().launch(share=False, inbrowser=True, server_name="0.0.0.0", server_port=8218)
bisheng_langchain/rag/scoring/__init__.py
File without changes
bisheng_langchain/rag/scoring/llama_index_score.py
@@ -0,0 +1,91 @@
+ import os
+ import shutil
+
+ import httpx
+ import nest_asyncio
+ import pandas as pd
+
+ nest_asyncio.apply()
+ from collections import defaultdict
+
+ from llama_index import ServiceContext
+ from llama_index.embeddings.openai import OpenAIEmbedding
+ from llama_index.evaluation import CorrectnessEvaluator
+ from llama_index.llms import OpenAI
+ from tqdm import tqdm
+
+ openai_api_key = os.environ.get('OPENAI_API_KEY', '')
+ openai_proxy = os.environ.get('OPENAI_PROXY', '')
+
+
+ def llama_index_answer_correctness(querys, responses, references):
+     embed = OpenAIEmbedding(api_key=openai_api_key, http_client=httpx.AsyncClient(proxies=openai_proxy))
+
+     model_name = "gpt-3.5-turbo-16k"
+     service_context = ServiceContext.from_defaults(
+         llm=OpenAI(model=model_name, api_key=openai_api_key, http_client=httpx.AsyncClient(proxies=openai_proxy)),
+         embed_model=embed,
+     )
+     evaluator = CorrectnessEvaluator(service_context=service_context)
+
+     correctness_scores = []
+     correctness_feedbacks = []
+     for i in tqdm(range(len(querys))):
+         result = evaluator.evaluate(
+             query=querys[i],
+             response=responses[i],
+             reference=references[i],
+         )
+         correctness = result.score
+         feedback = result.feedback
+         correctness_scores.append(correctness)
+         correctness_feedbacks.append(feedback)
+     return correctness_scores, correctness_feedbacks
+
+
+ def rag_benchmark_scoring(excel_file):
+     if not os.path.exists(excel_file + '.bak'):
+         shutil.copy(excel_file, excel_file + '.bak')
+
+     df = pd.read_excel(excel_file)
+     df.dropna(subset=['问题', 'GT', 'rag_answer'], inplace=True)
+     all_questions_info = df.to_dict('records')
+
+     questions = []
+     ground_truths = []
+     answers = []
+     for question_info in all_questions_info:
+         question = question_info['问题']
+         gt = question_info['GT']
+         pred = question_info['rag_answer']
+
+         questions.append(question)
+         answers.append(pred)
+         ground_truths.append(gt)
+
+     correctness_scores, correctness_feedbacks = llama_index_answer_correctness(questions, answers, ground_truths)
+
+     score_map = {
+         'llama_index_correctness': correctness_scores,
+     }
+
+     for metric, scores in score_map.items():
+         df[metric] = df.index.map({i: score for i, score in enumerate(scores)})
+     df.to_excel(excel_file, index=False)
+
+     if '问题类型' in df.columns:
+         grouped_df = (
+             df.groupby('问题类型')
+             .agg({'问题': 'count', **{metric: 'mean' for metric in score_map}})
+             .rename(columns={'问题': '问题数量'})
+         )
+         total_question = grouped_df['问题数量'].sum()
+         grouped_df.loc['all', '问题数量'] = total_question
+         for metric in score_map:
+             grouped_df.loc['all', metric] = df[metric].sum() / total_question
+         return grouped_df
+
+
+ if __name__ == '__main__':
+     excel_file = './data/benchmark_v1.0.xlsx'
+     print(rag_benchmark_scoring(excel_file))
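For reference, a sketch of the Excel layout that rag_benchmark_scoring expects (hypothetical rows; the 问题, GT and rag_answer columns are required, and 问题类型 is optional but enables the grouped summary):

import pandas as pd

# Hypothetical benchmark rows; rag_benchmark_scoring looks these columns up by name.
df = pd.DataFrame({
    '问题': ['What is the reporting currency?'],
    'GT': ['The reporting currency is RMB.'],
    'rag_answer': ['RMB'],
    '问题类型': ['QA'],
})
df.to_excel('./data/benchmark_v1.0.xlsx', index=False)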