bisheng-langchain 0.3.0rc1__py3-none-any.whl → 0.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bisheng_langchain/document_loaders/elem_unstrcutured_loader.py +5 -3
- bisheng_langchain/gpts/agent_types/llm_functions_agent.py +7 -1
- bisheng_langchain/gpts/assistant.py +8 -5
- bisheng_langchain/gpts/load_tools.py +2 -0
- bisheng_langchain/gpts/prompts/__init__.py +4 -2
- bisheng_langchain/gpts/prompts/assistant_prompt_base.py +1 -0
- bisheng_langchain/gpts/prompts/assistant_prompt_cohere.py +19 -0
- bisheng_langchain/gpts/tools/api_tools/flow.py +3 -3
- bisheng_langchain/gpts/tools/api_tools/openapi.py +101 -0
- bisheng_langchain/rag/__init__.py +5 -0
- bisheng_langchain/rag/bisheng_rag_pipeline.py +320 -0
- bisheng_langchain/rag/bisheng_rag_pipeline_v2.py +359 -0
- bisheng_langchain/rag/bisheng_rag_pipeline_v2_cohere_raw_prompting.py +376 -0
- bisheng_langchain/rag/bisheng_rag_tool.py +288 -0
- bisheng_langchain/rag/config/baseline.yaml +86 -0
- bisheng_langchain/rag/config/baseline_caibao.yaml +82 -0
- bisheng_langchain/rag/config/baseline_caibao_knowledge_v2.yaml +110 -0
- bisheng_langchain/rag/config/baseline_caibao_v2.yaml +112 -0
- bisheng_langchain/rag/config/baseline_demo_v2.yaml +92 -0
- bisheng_langchain/rag/config/baseline_s2b_mix.yaml +88 -0
- bisheng_langchain/rag/config/baseline_v2.yaml +90 -0
- bisheng_langchain/rag/extract_info.py +38 -0
- bisheng_langchain/rag/init_retrievers/__init__.py +4 -0
- bisheng_langchain/rag/init_retrievers/baseline_vector_retriever.py +61 -0
- bisheng_langchain/rag/init_retrievers/keyword_retriever.py +65 -0
- bisheng_langchain/rag/init_retrievers/mix_retriever.py +103 -0
- bisheng_langchain/rag/init_retrievers/smaller_chunks_retriever.py +92 -0
- bisheng_langchain/rag/prompts/__init__.py +9 -0
- bisheng_langchain/rag/prompts/extract_key_prompt.py +34 -0
- bisheng_langchain/rag/prompts/prompt.py +47 -0
- bisheng_langchain/rag/prompts/prompt_cohere.py +111 -0
- bisheng_langchain/rag/qa_corpus/__init__.py +0 -0
- bisheng_langchain/rag/qa_corpus/qa_generator.py +143 -0
- bisheng_langchain/rag/rerank/__init__.py +5 -0
- bisheng_langchain/rag/rerank/rerank.py +48 -0
- bisheng_langchain/rag/rerank/rerank_benchmark.py +139 -0
- bisheng_langchain/rag/run_qa_gen_web.py +47 -0
- bisheng_langchain/rag/run_rag_evaluate_web.py +55 -0
- bisheng_langchain/rag/scoring/__init__.py +0 -0
- bisheng_langchain/rag/scoring/llama_index_score.py +91 -0
- bisheng_langchain/rag/scoring/ragas_score.py +183 -0
- bisheng_langchain/rag/utils.py +181 -0
- bisheng_langchain/retrievers/ensemble.py +2 -1
- bisheng_langchain/vectorstores/elastic_keywords_search.py +2 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.1.dist-info}/METADATA +1 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.1.dist-info}/RECORD +48 -13
- bisheng_langchain/gpts/prompts/base_prompt.py +0 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.1.dist-info}/WHEEL +0 -0
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.1.dist-info}/top_level.txt +0 -0
bisheng_langchain/rag/qa_corpus/qa_generator.py
@@ -0,0 +1,143 @@
import os
import random
import json
import copy
import pandas as pd
from loguru import logger
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from bisheng_langchain.document_loaders import ElemUnstructuredLoader
from ragas.trainset import TrainsetGenerator


prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


class RagQAGenerator(object):

    def __init__(self,
                 corpus_folder,
                 qa_gen_folder,
                 unstructured_api_url="https://bisheng.dataelem.com/api/v1/etl4llm/predict",
                 model_name="gpt-4-0125-preview"):
        self.unstructured_api_url = unstructured_api_url
        self.corpus_folder = corpus_folder
        self.qa_gen_folder = qa_gen_folder
        self.model_name = model_name
        if not os.path.exists(self.qa_gen_folder):
            os.makedirs(self.qa_gen_folder)

    def generate(self):
        for file_name in tqdm(os.listdir(self.corpus_folder)):
            file_path = os.path.join(self.corpus_folder, file_name)
            logger.info(f'{file_name} generate qa start ...')
            # only consider pdf files
            if file_name.endswith('.pdf'):
                self.generate_qa_each_file(file_path)
            else:
                continue

    def generate_qa_each_file(self, file_path, train_size=100):
        file_name = os.path.basename(file_path)
        loader = ElemUnstructuredLoader(file_name=file_name,
                                        file_path=file_path,
                                        unstructured_api_url=self.unstructured_api_url)
        documents = loader.load()
        for doc in documents:
            doc.metadata = dict()
        logger.info(f'documents: {len(documents)}')

        trainsetgenerator = TrainsetGenerator.from_default(
            openai_generator_llm=self.model_name,
            openai_filter_llm=self.model_name)
        trainset = trainsetgenerator.generate(documents, train_size=train_size)

        save_path = os.path.join(self.qa_gen_folder, os.path.splitext(file_name)[0] + '_qa_gen.xlsx')
        df = trainset.to_pandas()
        df.to_excel(save_path, index=False)
        return save_path

    def statistic_qa(self):
        total_qa_num = 0
        all_qa_info = dict()
        for file_name in os.listdir(self.qa_gen_folder):
            file_path = os.path.join(self.qa_gen_folder, file_name)
            if file_name.endswith('.xlsx'):
                df = pd.read_excel(file_path)
                qa_info = df.to_dict('records')
                logger.info(f'{file_name} qa num: {len(qa_info)}')
                total_qa_num += len(qa_info)
                all_qa_info[file_name] = qa_info
        logger.info(f'total_file_num: {len(list(all_qa_info.keys()))}, total_qa_num: {total_qa_num}')
        return all_qa_info

    def format_qa_for_sft(self, min_context_num=3, max_context_num=7):
        random.seed(123)
        all_qa_info = self.statistic_qa()
        train_samples = []
        test_samples = []
        for file_name in all_qa_info:
            # QA pairs of each file
            qa_info = all_qa_info[file_name]
            if len(qa_info) == 0:
                continue
            contexts = []
            for qa in qa_info:
                ground_truth_context = str(eval(qa['ground_truth_context'])[0])
                contexts.append(ground_truth_context)

            random.shuffle(qa_info)
            for i, qa in enumerate(qa_info):
                question = qa['question']
                ground_truth_context = str(eval(qa['ground_truth_context'])[0])
                ground_truth = str(eval(qa['ground_truth'])[0])

                # add other distractor contexts
                random_number = random.randint(
                    min(min_context_num, len(contexts)),
                    min(max_context_num, len(contexts))
                )
                random_context = random.sample(contexts, random_number)
                if ground_truth_context in random_context:
                    random_context.remove(ground_truth_context)
                # insert the ground-truth context at a random position among the distractors
                insert_position = random.randint(0, len(random_context))
                random_context.insert(insert_position, ground_truth_context)

                random_context = '\n\n'.join(random_context)
                prompt = PROMPT.format(context=random_context, question=question)
                each_sample = {
                    'instruction': '',
                    'input': prompt,
                    'output': ground_truth,
                    'history': []
                }
                if i < 0.9 * len(qa_info):
                    train_samples.append(each_sample)
                else:
                    test_samples.append(each_sample)

        logger.info(f'train_samples: {len(train_samples)} test_samples: {len(test_samples)}')
        save_folder = os.path.dirname(self.qa_gen_folder)
        with open(os.path.join(save_folder, f'train_samples_ganrao_chunk{max_context_num+1}.json'), 'w') as f:
            json.dump(train_samples, f, indent=2, ensure_ascii=False)
        with open(os.path.join(save_folder, f'test_samples_ganrao_chunk{max_context_num+1}.json'), 'w') as f:
            json.dump(test_samples, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    corpus_folder = '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
    qa_gen_folder = '/home/public/rag_benchmark_v1.0/rag_qa_gen_filter'
    generator = RagQAGenerator(corpus_folder=corpus_folder, qa_gen_folder=qa_gen_folder)
    # generator.generate()
    # generator.statistic_qa()
    generator.format_qa_for_sft(min_context_num=5, max_context_num=11)
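For reference, each SFT sample that format_qa_for_sft writes to the train/test JSON files has the shape sketched below; the prompt and answer text are placeholders, not real generated data.

# Illustrative only: one sample as emitted by format_qa_for_sft above; texts are placeholders.
example_sample = {
    'instruction': '',
    # the PROMPT template filled with the shuffled distractor contexts and the question
    'input': ('Use the following pieces of context to answer the question at the end. ...\n\n'
              '<context 1>\n\n<context 2>\n\nQuestion: <question>\nHelpful Answer:'),
    'output': '<ground-truth answer>',
    'history': []
}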
bisheng_langchain/rag/rerank/rerank.py
@@ -0,0 +1,48 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class CustomReranker:

    def __init__(self, model_path, device_id='cuda:0', threshold=0.0):
        self.device_id = device_id
        self.threshold = threshold
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.rank_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device_id)
        self.rank_model.eval()

    def match_score(self, chunk, query):
        """
        Compute the similarity between the query and a chunk with the rerank model.
        """
        pairs = [[query, chunk]]

        with torch.no_grad():
            inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to(self.device_id)
            scores = self.rank_model(**inputs, return_dict=True).logits.view(-1, ).float()
            scores = torch.sigmoid(scores)
            scores = scores.cpu().numpy()

        return scores[0]

    def sort_and_filter(self, query, all_chunks):
        """
        Sort all chunks with the rerank model and keep those above the threshold.
        """
        chunk_match_score = []
        for index, chunk in enumerate(all_chunks):
            chunk_text = chunk.page_content
            chunk_match_score.append(self.match_score(chunk_text, query))

        sorted_res = sorted(enumerate(chunk_match_score), key=lambda x: -x[1])
        remain_chunks = [all_chunks[elem[0]] for elem in sorted_res if elem[1] >= self.threshold]
        if not remain_chunks:
            remain_chunks = [all_chunks[sorted_res[0][0]]]

        # for index, chunk in enumerate(remain_chunks):
        #     print('query:', query)
        #     print('chunk_text:', chunk.page_content)
        #     print('score:', sorted_res[index][1])
        #     print('***********')

        return remain_chunks
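A minimal usage sketch for the new CustomReranker above; the model path, threshold, query, and document texts are placeholders, and Document from langchain_core is used only as a convenient object with a page_content attribute.

# Hypothetical example: paths and texts are placeholders.
from langchain_core.documents import Document

from bisheng_langchain.rag.rerank.rerank import CustomReranker

reranker = CustomReranker(model_path='/path/to/bge-reranker-large',
                          device_id='cuda:0',
                          threshold=0.35)
chunks = [
    Document(page_content='Bisheng is an open-source platform for building LLM applications.'),
    Document(page_content='Completely unrelated text about the weather.'),
]
# Chunks are scored against the query, sorted by score, and filtered by the threshold.
for chunk in reranker.sort_and_filter('What is Bisheng?', chunks):
    print(chunk.page_content)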
bisheng_langchain/rag/rerank/rerank_benchmark.py
@@ -0,0 +1,139 @@
import json
import os

import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('/home/public/llm/bge-reranker-large')
model = AutoModelForSequenceClassification.from_pretrained('/home/public/llm/bge-reranker-large').to('cuda:2')
model.eval()


def min_edit_distance(a, b):
    dp = [[0 for i in range(len(b) + 1)] for j in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + 1)
    return dp[-1][-1]


def is_matched(text0, text1, thrd=10):
    text0 = text0.replace(" ", "").replace("\n", "")
    text1 = text1.replace(" ", "").replace("\n", "")
    dist = min_edit_distance(text0, text1)
    if dist < thrd:
        return True
    return False


def match_score(chunk, query):
    """
    Compute the similarity between the query and a chunk with the rerank model.
    """
    pairs = [[query, chunk]]

    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to('cuda:2')
        scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
        scores = torch.sigmoid(scores)
        scores = scores.cpu().numpy()

    return scores[0]


def sort_filter_all_chunks_method1(d, th=0.0):
    """
    Sort all chunks with the rerank model and keep those above the threshold.
    """
    # answer keywords have already been extracted
    query = d['question']
    all_chunks = d['all_chunks']

    chunk_match_score = []
    for index, chunk in enumerate(all_chunks):
        chunk_text = chunk['text']
        chunk_match_score.append(match_score(chunk_text, query))

    sorted_res = sorted(enumerate(chunk_match_score), key=lambda x: -x[1])
    print('-----------')
    print(sorted_res)
    remain_chunks = [all_chunks[elem[0]] for elem in sorted_res if elem[1] >= th]
    if not remain_chunks:
        remain_chunks = [all_chunks[sorted_res[0][0]]]

    # for index, chunk in enumerate(remain_chunks):
    #     print('query:', query)
    #     print('chunk_text:', chunk['text'])
    #     print('score:', sorted_res[index][1])
    #     print('***********')

    d['all_chunks'] = remain_chunks


def calc_precision_recall(d):
    """
    Compute the recall statistics for one question.
    """
    d_ves = d["all_chunks"][:10]
    d_es = d["all_chunks"][10:]
    all_chunks = []
    for i in range(len(d_es)):
        all_chunks.append(d_es[i])
        all_chunks.append(d_ves[i])
        all_chunks.extend(d_ves[i+1:])
    d["all_chunks"] = all_chunks

    sort_filter_all_chunks_method1(d)
    NCHUNK = len(d["chunks"])
    NCHUNK_ALL = len(d["all_chunks"])
    print('chunks:', NCHUNK, 'all_chunks:', NCHUNK_ALL)

    scores = np.zeros((NCHUNK, NCHUNK_ALL))
    for j in range(NCHUNK):
        for i in range(NCHUNK):
            if d["chunks"][j]["text"] == d["all_chunks"][i]['text']:
                scores[j][i] = 1
            elif abs(len(d["chunks"][j]["text"]) - len(d["all_chunks"][i]['text'])) > 10:
                scores[j][i] = 0
            elif is_matched(d["chunks"][j]["text"], d["all_chunks"][i]['text']):
                scores[j][i] = 1

    N_gt = NCHUNK
    N_all = NCHUNK_ALL
    if NCHUNK != 0:
        N_right = sum(scores.max(axis=1))
    else:
        N_right = 0
    recall = 0 if N_gt == 0 else N_right / N_gt
    return recall, N_gt, N_right, N_all


with open('zhaogushu_retriever_gt_convert_key.json', 'r') as f:
    retriever_gt_list = json.load(f)
mFieldRecall = 0
total_N_gt, total_N_right, total_N_all = 0, 0, 0
D_SCORES = {}
nquestion = 0
for d in retriever_gt_list:
    recall, N_gt, N_right, N_all = calc_precision_recall(d)
    mFieldRecall += recall
    D_SCORES[d["question"]] = {'recall': recall}
    total_N_gt += N_gt
    total_N_right += N_right
    total_N_all += N_all
    nquestion += 1

mFieldRecall = 0 if nquestion == 0 else mFieldRecall / nquestion
mMethodRecall = total_N_right / total_N_gt

print(f'mFieldRecall: {mFieldRecall * 100:.2f} %')
print(f'mMethodRecall: {mMethodRecall * 100:.2f} %')
print(f'total_N_right: {total_N_right}, total_N_gt: {total_N_gt}, total_N_all: {total_N_all}')
print(f'nquestion: {nquestion}, mean_N_right: {total_N_right / nquestion}, mean_N_gt: {total_N_gt / nquestion}, mean_N_all: {total_N_all / nquestion}')
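For context, the benchmark script above reads zhaogushu_retriever_gt_convert_key.json; judging from the keys accessed in calc_precision_recall, each entry looks roughly like the sketch below. The texts are placeholders, and treating the first 10 candidates and the remainder as results from two different retrievers is only an assumption based on the d_ves/d_es split.

# Illustrative only: shape inferred from the keys read by calc_precision_recall above.
example_entry = {
    'question': '<benchmark question>',
    'chunks': [                 # ground-truth chunks for this question
        {'text': '<ground-truth passage>'},
    ],
    'all_chunks': [             # retrieved candidates; the script interleaves the first 10 with the rest
        {'text': '<retrieved passage 1>'},
        {'text': '<retrieved passage 2>'},
    ],
}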
bisheng_langchain/rag/run_qa_gen_web.py
@@ -0,0 +1,47 @@
import os
import tempfile
import gradio as gr
from bisheng_langchain.rag.qa_corpus.qa_generator import RagQAGenerator


tmpdir = '/home/public/rag_benchmark_v1.0/tmp'
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)

qa_gen_folder = '/home/public/rag_benchmark_v1.0/rag_qa_gen_demo'
if not os.path.exists(qa_gen_folder):
    os.makedirs(qa_gen_folder)
model_name = "gpt-4-0125-preview"
unstructured_api_url = "https://bisheng.dataelem.com/api/v1/etl4llm/predict"
generator = RagQAGenerator(corpus_folder='',
                           qa_gen_folder=qa_gen_folder,
                           unstructured_api_url=unstructured_api_url,
                           model_name=model_name)


def qa_gen_run(input_file, gen_qa_num):
    gen_qa_num = int(gen_qa_num)
    file_path = input_file.name
    output_file = generator.generate_qa_each_file(file_path, train_size=gen_qa_num)
    return output_file


with tempfile.TemporaryDirectory(dir=tmpdir) as tmpdir:
    with gr.Blocks(css='#margin-top {margin-top: 15px} #center {text-align: center;} #description {text-align: center}') as demo:
        with gr.Row(elem_id='center'):
            gr.Markdown('# Bisheng qa auto generation Demo')

        with gr.Row(elem_id='description'):
            gr.Markdown("""Qa generation for anything.""")

        with gr.Row():
            input_file = gr.components.File(label='FlowFile')
            gen_qa_num = gr.Textbox(label='生成的问题数量', value=10, interactive=True, lines=2)

        with gr.Row():
            with gr.Column():
                btn0 = gr.Button('Run Qa Gen')
                out0 = gr.components.File(label='FlowFile')
        btn0.click(fn=qa_gen_run, inputs=[input_file, gen_qa_num], outputs=out0)

    demo.launch(server_name='192.168.106.20', server_port=9118, share=True)
bisheng_langchain/rag/run_rag_evaluate_web.py
@@ -0,0 +1,55 @@
import os
import gradio as gr
import pandas as pd
from pathlib import Path
from gradio import components
from bisheng_langchain.rag.scoring.ragas_score import RagScore


save_folder = '/home/public/rag_benchmark_v1.0/rag_score_demo'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)


def rag_evaluate(excel_file, metric='answer_recall_bisheng', batch_size=5):
    excel_path = excel_file.name
    df = pd.read_excel(excel_path)
    if '问题类型' not in df.columns:
        df['问题类型'] = len(df['问题'].tolist()) * ['QA']
        df.to_excel(excel_path, index=False)
    params = {
        'excel_path': excel_path,
        'save_path': save_folder,
        'question_column': '问题',
        'query_type_column': '问题类型',
        'gt_column': '人工标注',
        'answer_column': '模型回答',
        'metrics': [metric],
        'batch_size': int(batch_size),
    }
    rag_score = RagScore(**params)
    rag_score.score()

    output_path = Path(save_folder) / f"{Path(excel_path).stem}_score.xlsx"
    return str(output_path)


if __name__ == '__main__':
    title = """毕昇QA问答自动评估系统"""
    with gr.Blocks() as demo:
        gr.Markdown(title)

        with gr.Row():
            with gr.Column(scale=2):
                with gr.Row():
                    eval_input_file = gr.components.File(label='FlowFile')
                    with gr.Column():
                        metric_options = ["answer_recall_bisheng", "answer_correctness_bisheng"]
                        metric = gr.components.Dropdown(label="评估方法", choices=metric_options, default=metric_options[0], interactive=True)
                        # metric = gr.Textbox(label='评估方法', value='answer_recall_bisheng', interactive=True, lines=2)
                        batch_size = gr.Textbox(label='批评估大小', value=5, interactive=True, lines=2)
                btn0 = gr.Button('Run Evaluation')
                eval_out_file = gr.components.File(label='FlowFile')

        btn0.click(fn=rag_evaluate, inputs=[eval_input_file, metric, batch_size], outputs=[eval_out_file])
    demo.queue().launch(share=False, inbrowser=True, server_name="0.0.0.0", server_port=8218)
File without changes
bisheng_langchain/rag/scoring/llama_index_score.py
@@ -0,0 +1,91 @@
import os
import shutil

import httpx
import nest_asyncio
import pandas as pd

nest_asyncio.apply()
from collections import defaultdict

from llama_index import ServiceContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.evaluation import CorrectnessEvaluator
from llama_index.llms import OpenAI
from tqdm import tqdm

openai_api_key = os.environ.get('OPENAI_API_KEY', '')
openai_proxy = os.environ.get('OPENAI_PROXY', '')


def llama_index_answer_correctness(querys, responses, references):
    embed = OpenAIEmbedding(api_key=openai_api_key, http_client=httpx.AsyncClient(proxies=openai_proxy))

    model_name = "gpt-3.5-turbo-16k"
    service_context = ServiceContext.from_defaults(
        llm=OpenAI(model=model_name, api_key=openai_api_key, http_client=httpx.AsyncClient(proxies=openai_proxy)),
        embed_model=embed,
    )
    evaluator = CorrectnessEvaluator(service_context=service_context)

    correctness_scores = []
    correctness_feedbacks = []
    for i in tqdm(range(len(querys))):
        result = evaluator.evaluate(
            query=querys[i],
            response=responses[i],
            reference=references[i],
        )
        correctness = result.score
        feedback = result.feedback
        correctness_scores.append(correctness)
        correctness_feedbacks.append(feedback)
    return correctness_scores, correctness_feedbacks


def rag_benchmark_scoring(excel_file):
    if not os.path.exists(excel_file + '.bak'):
        shutil.copy(excel_file, excel_file + '.bak')

    df = pd.read_excel(excel_file)
    df.dropna(subset=['问题', 'GT', 'rag_answer'], inplace=True)
    all_questions_info = df.to_dict('records')

    questions = []
    ground_truths = []
    answers = []
    for question_info in all_questions_info:
        question = question_info['问题']
        gt = question_info['GT']
        pred = question_info['rag_answer']

        questions.append(question)
        answers.append(pred)
        ground_truths.append(gt)

    correctness_scores, correctness_feedbacks = llama_index_answer_correctness(questions, answers, ground_truths)

    score_map = {
        'llama_index_correctness': correctness_scores,
    }

    for metric, scores in score_map.items():
        df[metric] = df.index.map({i: score for i, score in enumerate(scores)})
    df.to_excel(excel_file, index=False)

    if '问题类型' in df.columns:
        grouped_df = (
            df.groupby('问题类型')
            .agg({'问题': 'count', **{metric: 'mean' for metric in score_map}})
            .rename(columns={'问题': '问题数量'})
        )
        total_question = grouped_df['问题数量'].sum()
        grouped_df.loc['all', '问题数量'] = total_question
        for metric in score_map:
            grouped_df.loc['all', metric] = df[metric].sum() / total_question
        return grouped_df


if __name__ == '__main__':
    excel_file = './data/benchmark_v1.0.xlsx'
    print(rag_benchmark_scoring(excel_file))