bisheng-langchain 0.3.0rc1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bisheng_langchain/document_loaders/elem_unstrcutured_loader.py +5 -3
- bisheng_langchain/gpts/agent_types/llm_functions_agent.py +7 -1
- bisheng_langchain/gpts/assistant.py +8 -5
- bisheng_langchain/gpts/load_tools.py +2 -0
- bisheng_langchain/gpts/prompts/__init__.py +4 -2
- bisheng_langchain/gpts/prompts/assistant_prompt_base.py +1 -0
- bisheng_langchain/gpts/prompts/assistant_prompt_cohere.py +19 -0
- bisheng_langchain/gpts/tools/api_tools/flow.py +3 -3
- bisheng_langchain/gpts/tools/api_tools/openapi.py +101 -0
- bisheng_langchain/rag/__init__.py +5 -0
- bisheng_langchain/rag/bisheng_rag_pipeline.py +320 -0
- bisheng_langchain/rag/bisheng_rag_pipeline_v2.py +359 -0
- bisheng_langchain/rag/bisheng_rag_pipeline_v2_cohere_raw_prompting.py +376 -0
- bisheng_langchain/rag/bisheng_rag_tool.py +288 -0
- bisheng_langchain/rag/config/baseline.yaml +86 -0
- bisheng_langchain/rag/config/baseline_caibao.yaml +82 -0
- bisheng_langchain/rag/config/baseline_caibao_knowledge_v2.yaml +110 -0
- bisheng_langchain/rag/config/baseline_caibao_v2.yaml +112 -0
- bisheng_langchain/rag/config/baseline_demo_v2.yaml +92 -0
- bisheng_langchain/rag/config/baseline_s2b_mix.yaml +88 -0
- bisheng_langchain/rag/config/baseline_v2.yaml +90 -0
- bisheng_langchain/rag/extract_info.py +38 -0
- bisheng_langchain/rag/init_retrievers/__init__.py +4 -0
- bisheng_langchain/rag/init_retrievers/baseline_vector_retriever.py +61 -0
- bisheng_langchain/rag/init_retrievers/keyword_retriever.py +65 -0
- bisheng_langchain/rag/init_retrievers/mix_retriever.py +103 -0
- bisheng_langchain/rag/init_retrievers/smaller_chunks_retriever.py +92 -0
- bisheng_langchain/rag/prompts/__init__.py +9 -0
- bisheng_langchain/rag/prompts/extract_key_prompt.py +34 -0
- bisheng_langchain/rag/prompts/prompt.py +47 -0
- bisheng_langchain/rag/prompts/prompt_cohere.py +111 -0
- bisheng_langchain/rag/qa_corpus/__init__.py +0 -0
- bisheng_langchain/rag/qa_corpus/qa_generator.py +143 -0
- bisheng_langchain/rag/rerank/__init__.py +5 -0
- bisheng_langchain/rag/rerank/rerank.py +48 -0
- bisheng_langchain/rag/rerank/rerank_benchmark.py +139 -0
- bisheng_langchain/rag/run_qa_gen_web.py +47 -0
- bisheng_langchain/rag/run_rag_evaluate_web.py +55 -0
- bisheng_langchain/rag/scoring/__init__.py +0 -0
- bisheng_langchain/rag/scoring/llama_index_score.py +91 -0
- bisheng_langchain/rag/scoring/ragas_score.py +183 -0
- bisheng_langchain/rag/utils.py +181 -0
- bisheng_langchain/retrievers/ensemble.py +2 -1
- bisheng_langchain/vectorstores/elastic_keywords_search.py +2 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.dist-info}/METADATA +1 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.dist-info}/RECORD +48 -13
- bisheng_langchain/gpts/prompts/base_prompt.py +0 -1
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.dist-info}/WHEEL +0 -0
- {bisheng_langchain-0.3.0rc1.dist-info → bisheng_langchain-0.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
|
3
|
+
question: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample.xlsx'
|
4
|
+
save_answer: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample_gpt4_12chunk_test.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '192.168.106.116'
|
8
|
+
port: '19530'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://192.168.106.116:9200'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: 'http://118.195.232.223:39995'
|
22
|
+
|
23
|
+
chat_llm:
|
24
|
+
type: 'ChatOpenAI'
|
25
|
+
model: 'gpt-4-1106-preview'
|
26
|
+
openai_api_key: ''
|
27
|
+
openai_proxy: 'http://118.195.232.223:39995'
|
28
|
+
temperature: 0.0
|
29
|
+
|
30
|
+
# chat_llm:
|
31
|
+
# type: 'ChatQWen'
|
32
|
+
# model_name: 'qwen1.5-72b-chat'
|
33
|
+
# api_key: ''
|
34
|
+
# temperature: 0.01
|
35
|
+
|
36
|
+
loader:
|
37
|
+
type: 'ElemUnstructuredLoader'
|
38
|
+
unstructured_api_url: 'http://192.168.106.12:10001/v1/etl4llm/predict'
|
39
|
+
|
40
|
+
retriever:
|
41
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
42
|
+
suffix: 'test_mix'
|
43
|
+
retrievers:
|
44
|
+
- type: 'MixRetriever'
|
45
|
+
splitter:
|
46
|
+
vector_text_splitter:
|
47
|
+
type: 'ElemCharacterTextSplitter'
|
48
|
+
chunk_size: 500
|
49
|
+
chunk_overlap: 50
|
50
|
+
separators: ["\n\n", "\n", " ", ""]
|
51
|
+
keyword_text_splitter:
|
52
|
+
type: 'ElemCharacterTextSplitter'
|
53
|
+
chunk_size: 500
|
54
|
+
chunk_overlap: 50
|
55
|
+
separators: ["\n\n", "\n", " ", ""]
|
56
|
+
retrieval:
|
57
|
+
combine_strategy: 'mix'
|
58
|
+
search_type: 'similarity'
|
59
|
+
vector_search_kwargs:
|
60
|
+
k: 6
|
61
|
+
keyword_search_kwargs:
|
62
|
+
k: 6
|
63
|
+
|
64
|
+
post_retrieval:
|
65
|
+
delete_duplicate: False
|
66
|
+
with_rank: False
|
67
|
+
rerank:
|
68
|
+
type: 'CustomReranker'
|
69
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
70
|
+
device_id: 'cuda:0'
|
71
|
+
threshold: 0.0
|
72
|
+
|
73
|
+
generate:
|
74
|
+
with_retrieval: True
|
75
|
+
chain_type: 'stuff'
|
76
|
+
# prompt_type: 'BASE_PROMPT'
|
77
|
+
prompt_type: 'CHAT_PROMPT'
|
78
|
+
|
79
|
+
metric:
|
80
|
+
type: 'bisheng-ragas'
|
81
|
+
question_column: '问题'
|
82
|
+
gt_column: 'GT'
|
83
|
+
answer_column: 'rag_answer'
|
84
|
+
query_type_column: '问题类型'
|
85
|
+
metrics: ['answer_correctness_bisheng']
|
86
|
+
batch_size: 5
|
@@ -0,0 +1,82 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
|
3
|
+
question: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample.xlsx'
|
4
|
+
save_answer: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/finance_report_data_100_single_gpt3.5_20chunk.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '110.16.193.170'
|
8
|
+
port: '50062'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://110.16.193.170:50062/es'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: ''
|
22
|
+
|
23
|
+
chat_llm:
|
24
|
+
type: 'ChatOpenAI'
|
25
|
+
model: 'gpt-4-1106-preview'
|
26
|
+
openai_api_key: ''
|
27
|
+
openai_proxy: ''
|
28
|
+
temperature: 0.0
|
29
|
+
|
30
|
+
loader:
|
31
|
+
type: 'ElemUnstructuredLoader'
|
32
|
+
unstructured_api_url: 'http://bisheng.dataelem.com/v1/etl4llm/predict'
|
33
|
+
|
34
|
+
retriever:
|
35
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
36
|
+
suffix: 'test_mix'
|
37
|
+
retrievers:
|
38
|
+
- type: 'MixRetriever'
|
39
|
+
splitter:
|
40
|
+
vector_text_splitter:
|
41
|
+
type: 'ElemCharacterTextSplitter'
|
42
|
+
chunk_size: 500
|
43
|
+
chunk_overlap: 50
|
44
|
+
separators: ["\n\n", "\n", " ", ""]
|
45
|
+
keyword_text_splitter:
|
46
|
+
type: 'ElemCharacterTextSplitter'
|
47
|
+
chunk_size: 500
|
48
|
+
chunk_overlap: 50
|
49
|
+
separators: ["\n\n", "\n", " ", ""]
|
50
|
+
retrieval:
|
51
|
+
combine_strategy: 'mix'
|
52
|
+
search_type: 'similarity'
|
53
|
+
vector_search_kwargs:
|
54
|
+
k: 6
|
55
|
+
keyword_search_kwargs:
|
56
|
+
k: 6
|
57
|
+
|
58
|
+
post_retrieval:
|
59
|
+
delete_duplicate: False
|
60
|
+
with_rank: False
|
61
|
+
rerank:
|
62
|
+
type: 'CustomReranker'
|
63
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
64
|
+
device_id: 'cuda:0'
|
65
|
+
threshold: 0.0
|
66
|
+
|
67
|
+
generate:
|
68
|
+
with_retrieval: True
|
69
|
+
chain_type: 'stuff'
|
70
|
+
# prompt_type: 'BASE_PROMPT'
|
71
|
+
prompt_type: 'CHAT_PROMPT'
|
72
|
+
|
73
|
+
metric:
|
74
|
+
type: 'bisheng-ragas'
|
75
|
+
question_column: '问题'
|
76
|
+
gt_column: 'GT'
|
77
|
+
answer_column: 'rag_answer'
|
78
|
+
query_type_column: '问题类型'
|
79
|
+
# metrics: ['answer_correctness_bisheng']
|
80
|
+
metrics: ['answer_recall_bisheng']
|
81
|
+
gt_split_column: 'gt_split_point'
|
82
|
+
batch_size: 5
|
@@ -0,0 +1,110 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_finance_report'
|
3
|
+
question: '/home/public/rag_benchmark_finance_report/finance_report_data_100.xlsx'
|
4
|
+
save_answer: '/home/public/rag_benchmark_finance_report/finance_report_data_100_knowledge_command-r-plus_20chunk_chunk_size_1000.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '110.16.193.170'
|
8
|
+
port: '50062'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://110.16.193.170:50062/es'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: ''
|
22
|
+
|
23
|
+
# chat_llm:
|
24
|
+
# type: 'ChatOpenAI'
|
25
|
+
# model: 'gpt-4-1106-preview'
|
26
|
+
# openai_api_key: ''
|
27
|
+
# openai_proxy: ''
|
28
|
+
# temperature: 0.0
|
29
|
+
|
30
|
+
chat_llm:
|
31
|
+
type: 'ChatCohere'
|
32
|
+
model: 'command-r-plus'
|
33
|
+
cohere_api_key: ''
|
34
|
+
max_tokens: 1000
|
35
|
+
temperature: 0.01
|
36
|
+
|
37
|
+
# chat_llm:
|
38
|
+
# type: 'ChatOpenAI'
|
39
|
+
# model: 'moonshot-v1-128k'
|
40
|
+
# openai_api_base: 'https://api.moonshot.cn/v1'
|
41
|
+
# openai_api_key : "Y21pamZpdWNwN2Zic3ZtdGJpdGc6bXNrLWZLNHp4VDMxMklsVU56MUxmOVNwY0RMeFMyaUg="
|
42
|
+
# openai_proxy: ''
|
43
|
+
# temperature: 0.01
|
44
|
+
|
45
|
+
# chat_llm:
|
46
|
+
# type: 'ChatQWen'
|
47
|
+
# model_name: 'qwen1.5-72b-chat'
|
48
|
+
# api_key: ''
|
49
|
+
# temperature: 0.01
|
50
|
+
|
51
|
+
loader:
|
52
|
+
type: 'ElemUnstructuredLoader'
|
53
|
+
unstructured_api_url: 'https://bisheng.dataelem.com/api/v1/etl4llm/predict'
|
54
|
+
|
55
|
+
retriever:
|
56
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
57
|
+
suffix: 'benchmark_caibao_1000_knowledge_source_title'
|
58
|
+
retrievers:
|
59
|
+
- type: 'KeywordRetriever'
|
60
|
+
splitter:
|
61
|
+
text_splitter:
|
62
|
+
# type: 'ElemCharacterTextSplitter'
|
63
|
+
type: 'RecursiveCharacterTextSplitter'
|
64
|
+
chunk_size: 1000
|
65
|
+
chunk_overlap: 0
|
66
|
+
separators: ["\n\n"]
|
67
|
+
retrieval:
|
68
|
+
search_type: 'similarity'
|
69
|
+
search_kwargs:
|
70
|
+
k: 10
|
71
|
+
- type: 'BaselineVectorRetriever'
|
72
|
+
splitter:
|
73
|
+
text_splitter:
|
74
|
+
# type: 'ElemCharacterTextSplitter'
|
75
|
+
type: 'RecursiveCharacterTextSplitter'
|
76
|
+
chunk_size: 1000
|
77
|
+
chunk_overlap: 0
|
78
|
+
separators: ["\n\n"]
|
79
|
+
retrieval:
|
80
|
+
search_type: 'similarity'
|
81
|
+
search_kwargs:
|
82
|
+
k: 10
|
83
|
+
|
84
|
+
post_retrieval:
|
85
|
+
delete_duplicate: False
|
86
|
+
with_rank: False
|
87
|
+
rerank:
|
88
|
+
type: 'CustomReranker'
|
89
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
90
|
+
device_id: 'cuda:0'
|
91
|
+
threshold: 0.0
|
92
|
+
sort_by_source_and_index: False
|
93
|
+
|
94
|
+
generate:
|
95
|
+
with_retrieval: True
|
96
|
+
max_content: 100000
|
97
|
+
chain_type: 'stuff'
|
98
|
+
# prompt_type: 'BASE_PROMPT'
|
99
|
+
prompt_type: 'CHAT_PROMPT'
|
100
|
+
|
101
|
+
metric:
|
102
|
+
type: 'bisheng-ragas'
|
103
|
+
question_column: '问题'
|
104
|
+
gt_column: 'GT'
|
105
|
+
answer_column: 'rag_answer'
|
106
|
+
query_type_column: '问题类型'
|
107
|
+
# metrics: ['answer_correctness_bisheng']
|
108
|
+
metrics: ['answer_recall_bisheng']
|
109
|
+
gt_split_column: 'gt_split_point'
|
110
|
+
batch_size: 5
|
@@ -0,0 +1,112 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_finance_report'
|
3
|
+
question: '/home/public/rag_benchmark_finance_report/finance_report_data_100_single.xlsx'
|
4
|
+
save_answer: '/home/public/rag_benchmark_finance_report/finance_report_data_100_single_command-r-plus_20chunk_chunk_size_1000_with_source_title_overlap100.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '110.16.193.170'
|
8
|
+
port: '50062'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://110.16.193.170:50062/es'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
extract_key_by_llm: False
|
17
|
+
|
18
|
+
embedding:
|
19
|
+
type: 'OpenAIEmbeddings'
|
20
|
+
model: 'text-embedding-ada-002'
|
21
|
+
openai_api_key: ''
|
22
|
+
openai_proxy: ''
|
23
|
+
|
24
|
+
# chat_llm:
|
25
|
+
# type: 'ChatOpenAI'
|
26
|
+
# model: 'gpt-4-1106-preview'
|
27
|
+
# openai_api_key: ''
|
28
|
+
# openai_proxy: ''
|
29
|
+
# temperature: 0.0
|
30
|
+
|
31
|
+
chat_llm:
|
32
|
+
type: 'ChatCohere'
|
33
|
+
model: 'command-r-plus'
|
34
|
+
cohere_api_key: ''
|
35
|
+
max_tokens: 1000
|
36
|
+
temperature: 0.01
|
37
|
+
|
38
|
+
# chat_llm:
|
39
|
+
# type: 'ChatQWen'
|
40
|
+
# model_name: 'qwen1.5-110b-chat'
|
41
|
+
# api_key: ''
|
42
|
+
# temperature: 0.01
|
43
|
+
|
44
|
+
# chat_llm:
|
45
|
+
# type: 'ChatOpenAI'
|
46
|
+
# model: 'qwen1.5-110b-chat'
|
47
|
+
# openai_api_base: 'http://60.31.21.42:12511/v1'
|
48
|
+
# openai_api_key : "Z9b8x3V7C2n0Q5T"
|
49
|
+
# openai_proxy: ''
|
50
|
+
# temperature: 0.01
|
51
|
+
|
52
|
+
loader:
|
53
|
+
type: 'ElemUnstructuredLoader'
|
54
|
+
unstructured_api_url: 'https://bisheng.dataelem.com/api/v1/etl4llm/predict'
|
55
|
+
|
56
|
+
retriever:
|
57
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
58
|
+
suffix: 'benchmark_caibao_1000_source_title_overlap100'
|
59
|
+
add_aux_info: True
|
60
|
+
retrievers:
|
61
|
+
- type: 'KeywordRetriever'
|
62
|
+
splitter:
|
63
|
+
text_splitter:
|
64
|
+
# type: 'ElemCharacterTextSplitter'
|
65
|
+
type: 'RecursiveCharacterTextSplitter'
|
66
|
+
chunk_size: 1000
|
67
|
+
chunk_overlap: 100
|
68
|
+
separators: ["\n\n"]
|
69
|
+
retrieval:
|
70
|
+
search_type: 'similarity'
|
71
|
+
search_kwargs:
|
72
|
+
k: 10
|
73
|
+
- type: 'BaselineVectorRetriever'
|
74
|
+
splitter:
|
75
|
+
text_splitter:
|
76
|
+
# type: 'ElemCharacterTextSplitter'
|
77
|
+
type: 'RecursiveCharacterTextSplitter'
|
78
|
+
chunk_size: 1000
|
79
|
+
chunk_overlap: 100
|
80
|
+
separators: ["\n\n"]
|
81
|
+
retrieval:
|
82
|
+
search_type: 'similarity'
|
83
|
+
search_kwargs:
|
84
|
+
k: 10
|
85
|
+
|
86
|
+
post_retrieval:
|
87
|
+
delete_duplicate: False
|
88
|
+
with_rank: False
|
89
|
+
rerank:
|
90
|
+
type: 'CustomReranker'
|
91
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
92
|
+
device_id: 'cuda:0'
|
93
|
+
threshold: 0.0
|
94
|
+
sort_by_source_and_index: False
|
95
|
+
|
96
|
+
generate:
|
97
|
+
with_retrieval: True
|
98
|
+
max_content: 100000
|
99
|
+
chain_type: 'stuff'
|
100
|
+
# prompt_type: 'BASE_PROMPT'
|
101
|
+
prompt_type: 'CHAT_PROMPT'
|
102
|
+
|
103
|
+
metric:
|
104
|
+
type: 'bisheng-ragas'
|
105
|
+
question_column: '问题'
|
106
|
+
gt_column: 'GT'
|
107
|
+
answer_column: 'rag_answer'
|
108
|
+
query_type_column: '问题类型'
|
109
|
+
# metrics: ['answer_correctness_bisheng']
|
110
|
+
metrics: ['answer_recall_bisheng']
|
111
|
+
gt_split_column: 'gt_split_point'
|
112
|
+
batch_size: 5
|
@@ -0,0 +1,92 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
|
3
|
+
question: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample.xlsx'
|
4
|
+
save_answer: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample_gpt4_10w.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '192.168.106.116'
|
8
|
+
port: '19530'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://192.168.106.116:9200'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: 'http://192.168.106.20:1081'
|
22
|
+
|
23
|
+
chat_llm:
|
24
|
+
type: 'ChatOpenAI'
|
25
|
+
model: 'gpt-4-1106-preview'
|
26
|
+
openai_api_key: ''
|
27
|
+
openai_proxy: 'http://192.168.106.20:1081'
|
28
|
+
temperature: 0.0
|
29
|
+
|
30
|
+
# chat_llm:
|
31
|
+
# type: 'ChatQWen'
|
32
|
+
# model_name: 'qwen1.5-72b-chat'
|
33
|
+
# api_key: ''
|
34
|
+
# temperature: 0.01
|
35
|
+
|
36
|
+
loader:
|
37
|
+
type: 'ElemUnstructuredLoader'
|
38
|
+
unstructured_api_url: 'http://192.168.106.12:10001/v1/etl4llm/predict'
|
39
|
+
|
40
|
+
retriever:
|
41
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
42
|
+
suffix: 'benchmark_demo_test'
|
43
|
+
retrievers:
|
44
|
+
- type: 'KeywordRetriever'
|
45
|
+
splitter:
|
46
|
+
text_splitter:
|
47
|
+
type: 'ElemCharacterTextSplitter'
|
48
|
+
chunk_size: 500
|
49
|
+
chunk_overlap: 0
|
50
|
+
separators: ["\n\n"]
|
51
|
+
retrieval:
|
52
|
+
search_type: 'similarity'
|
53
|
+
search_kwargs:
|
54
|
+
k: 10000
|
55
|
+
- type: 'BaselineVectorRetriever'
|
56
|
+
splitter:
|
57
|
+
text_splitter:
|
58
|
+
type: 'ElemCharacterTextSplitter'
|
59
|
+
chunk_size: 500
|
60
|
+
chunk_overlap: 0
|
61
|
+
separators: ["\n\n"]
|
62
|
+
retrieval:
|
63
|
+
search_type: 'similarity'
|
64
|
+
search_kwargs:
|
65
|
+
k: 10000
|
66
|
+
|
67
|
+
post_retrieval:
|
68
|
+
delete_duplicate: False
|
69
|
+
with_rank: False
|
70
|
+
rerank:
|
71
|
+
type: 'CustomReranker'
|
72
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
73
|
+
device_id: 'cuda:0'
|
74
|
+
threshold: 0.0
|
75
|
+
sort_by_source_and_index: True
|
76
|
+
|
77
|
+
generate:
|
78
|
+
with_retrieval: True
|
79
|
+
max_content: 100000
|
80
|
+
chain_type: 'stuff'
|
81
|
+
# prompt_type: 'BASE_PROMPT'
|
82
|
+
prompt_type: 'CHAT_PROMPT'
|
83
|
+
|
84
|
+
metric:
|
85
|
+
type: 'bisheng-ragas'
|
86
|
+
question_column: '问题'
|
87
|
+
gt_column: 'GT'
|
88
|
+
answer_column: 'rag_answer'
|
89
|
+
query_type_column: '问题类型'
|
90
|
+
metrics: ['answer_correctness_bisheng']
|
91
|
+
# metrics: ['answer_recall_bisheng']
|
92
|
+
batch_size: 5
|
@@ -0,0 +1,88 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_v1.0/rag_benchmark_processed'
|
3
|
+
question: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample.xlsx'
|
4
|
+
save_answer: '/home/gulixin/workspace/llm/bisheng/src/bisheng-langchain/experimental/rag/data/questions_info_with_answer_sample_gpt4_12chunk_s2b.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '192.168.106.116'
|
8
|
+
port: '19530'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://192.168.106.116:9200'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: 'http://118.195.232.223:39995'
|
22
|
+
|
23
|
+
chat_llm:
|
24
|
+
type: 'ChatOpenAI'
|
25
|
+
model: 'gpt-4-1106-preview'
|
26
|
+
openai_api_key: ''
|
27
|
+
openai_proxy: 'http://118.195.232.223:39995'
|
28
|
+
temperature: 0.0
|
29
|
+
|
30
|
+
loader:
|
31
|
+
type: 'ElemUnstructuredLoader'
|
32
|
+
unstructured_api_url: 'http://192.168.106.12:10001/v1/etl4llm/predict'
|
33
|
+
|
34
|
+
retriever:
|
35
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
36
|
+
suffix: 'test_s2b_mix'
|
37
|
+
retrievers:
|
38
|
+
- type: 'SmallerChunksVectorRetriever'
|
39
|
+
splitter:
|
40
|
+
parent_splitter:
|
41
|
+
type: 'ElemCharacterTextSplitter'
|
42
|
+
chunk_size: 1024
|
43
|
+
chunk_overlap: 200
|
44
|
+
separators: ["\n\n", "\n", " ", ""]
|
45
|
+
child_splitter:
|
46
|
+
type: 'RecursiveCharacterTextSplitter'
|
47
|
+
chunk_size: 216
|
48
|
+
chunk_overlap: 0
|
49
|
+
separators: ["\n\n", "\n", " ", ""]
|
50
|
+
retrieval:
|
51
|
+
search_type: 'similarity'
|
52
|
+
child_search_kwargs:
|
53
|
+
k: 12
|
54
|
+
- type: 'KeywordRetriever'
|
55
|
+
splitter:
|
56
|
+
text_splitter:
|
57
|
+
type: 'ElemCharacterTextSplitter'
|
58
|
+
chunk_size: 1024
|
59
|
+
chunk_overlap: 200
|
60
|
+
separators: ["\n\n", "\n", " ", ""]
|
61
|
+
retrieval:
|
62
|
+
search_type: 'similarity'
|
63
|
+
search_kwargs:
|
64
|
+
k: 6
|
65
|
+
|
66
|
+
post_retrieval:
|
67
|
+
delete_duplicate: True
|
68
|
+
with_rank: False
|
69
|
+
rerank:
|
70
|
+
type: 'CustomReranker'
|
71
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
72
|
+
device_id: 'cuda:0'
|
73
|
+
threshold: 0.0
|
74
|
+
|
75
|
+
generate:
|
76
|
+
with_retrieval: True
|
77
|
+
chain_type: 'stuff'
|
78
|
+
# prompt_type: 'BASE_PROMPT'
|
79
|
+
prompt_type: 'CHAT_PROMPT'
|
80
|
+
|
81
|
+
metric:
|
82
|
+
type: 'bisheng-ragas'
|
83
|
+
question_column: '问题'
|
84
|
+
gt_column: 'GT'
|
85
|
+
answer_column: 'rag_answer'
|
86
|
+
query_type_column: '问题类型'
|
87
|
+
metrics: ['answer_correctness_bisheng']
|
88
|
+
batch_size: 5
|
@@ -0,0 +1,90 @@
|
|
1
|
+
data:
|
2
|
+
origin_file_path: '/home/public/rag_benchmark_finance_report'
|
3
|
+
question: '/home/public/rag_benchmark_finance_report/finance_report_data_100_single.xlsx'
|
4
|
+
save_answer: '/home/public/rag_benchmark_finance_report/finance_report_data_100_single_qwen1.5_72b_20chunk_chunk_size_1000_with_source_title.xlsx'
|
5
|
+
|
6
|
+
milvus:
|
7
|
+
host: '110.16.193.170'
|
8
|
+
port: '50062'
|
9
|
+
drop_old: True
|
10
|
+
|
11
|
+
elasticsearch:
|
12
|
+
url: 'http://110.16.193.170:50062/es'
|
13
|
+
ssl_verify:
|
14
|
+
basic_auth: ["elastic", "oSGL-zVvZ5P3Tm7qkDLC"]
|
15
|
+
drop_old: True
|
16
|
+
|
17
|
+
embedding:
|
18
|
+
type: 'OpenAIEmbeddings'
|
19
|
+
model: 'text-embedding-ada-002'
|
20
|
+
openai_api_key: ''
|
21
|
+
openai_proxy: ''
|
22
|
+
|
23
|
+
chat_llm:
|
24
|
+
type: 'ChatOpenAI'
|
25
|
+
model: 'gpt-4-1106-preview'
|
26
|
+
openai_api_key: ''
|
27
|
+
openai_proxy: ''
|
28
|
+
temperature: 0.0
|
29
|
+
|
30
|
+
loader:
|
31
|
+
type: 'ElemUnstructuredLoader'
|
32
|
+
unstructured_api_url: 'https://bisheng.dataelem.com/api/v1/etl4llm/predict'
|
33
|
+
|
34
|
+
retriever:
|
35
|
+
type: 'EnsembleRetriever' # 不动 (do not change)
|
36
|
+
suffix: 'benchmark_caibao_1000_source_title'
|
37
|
+
add_aux_info: True
|
38
|
+
retrievers:
|
39
|
+
- type: 'KeywordRetriever'
|
40
|
+
splitter:
|
41
|
+
text_splitter:
|
42
|
+
# type: 'ElemCharacterTextSplitter'
|
43
|
+
type: 'RecursiveCharacterTextSplitter'
|
44
|
+
chunk_size: 1000
|
45
|
+
chunk_overlap: 0
|
46
|
+
separators: ["\n\n"]
|
47
|
+
retrieval:
|
48
|
+
search_type: 'similarity'
|
49
|
+
search_kwargs:
|
50
|
+
k: 100
|
51
|
+
- type: 'BaselineVectorRetriever'
|
52
|
+
splitter:
|
53
|
+
text_splitter:
|
54
|
+
# type: 'ElemCharacterTextSplitter'
|
55
|
+
type: 'RecursiveCharacterTextSplitter'
|
56
|
+
chunk_size: 1000
|
57
|
+
chunk_overlap: 0
|
58
|
+
separators: ["\n\n"]
|
59
|
+
retrieval:
|
60
|
+
search_type: 'similarity'
|
61
|
+
search_kwargs:
|
62
|
+
k: 100
|
63
|
+
|
64
|
+
post_retrieval:
|
65
|
+
delete_duplicate: False
|
66
|
+
with_rank: False
|
67
|
+
rerank:
|
68
|
+
type: 'CustomReranker'
|
69
|
+
model_path: '/home/public/llm/bge-reranker-large'
|
70
|
+
device_id: 'cuda:0'
|
71
|
+
threshold: 0.0
|
72
|
+
sort_by_source_and_index: True
|
73
|
+
|
74
|
+
generate:
|
75
|
+
with_retrieval: True
|
76
|
+
max_content: 15000
|
77
|
+
chain_type: 'stuff'
|
78
|
+
# prompt_type: 'BASE_PROMPT'
|
79
|
+
prompt_type: 'CHAT_PROMPT'
|
80
|
+
|
81
|
+
metric:
|
82
|
+
type: 'bisheng-ragas'
|
83
|
+
question_column: '问题'
|
84
|
+
gt_column: 'GT'
|
85
|
+
answer_column: 'rag_answer'
|
86
|
+
query_type_column: '问题类型'
|
87
|
+
# metrics: ['answer_correctness_bisheng']
|
88
|
+
metrics: ['answer_recall_bisheng']
|
89
|
+
gt_split_column: 'gt_split_point'
|
90
|
+
batch_size: 5
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import httpx
|
2
|
+
from langchain.chat_models import ChatOpenAI
|
3
|
+
from bisheng_langchain.chat_models import ChatQWen
|
4
|
+
from langchain.chains import LLMChain
|
5
|
+
from langchain.prompts.chat import (
|
6
|
+
ChatPromptTemplate,
|
7
|
+
SystemMessagePromptTemplate,
|
8
|
+
HumanMessagePromptTemplate,
|
9
|
+
)
|
10
|
+
|
11
|
+
system_template = """你是一个可靠标题生成或者提取助手。你会收到一篇文档的主要内容,请根据这些内容生成或者提取这篇文档的标题。"""  # System prompt (Chinese): the model is a title generation/extraction assistant that receives a document's main content and must generate or extract its title.
|
12
|
+
human_template = """
|
13
|
+
文档内容如下:
|
14
|
+
{context}
|
15
|
+
|
16
|
+
生成或提取的标题:
|
17
|
+
"""  # Human prompt (Chinese): presents the document content via the {context} placeholder and asks for the generated or extracted title.
|
18
|
+
|
19
|
+
messages = [  # Message templates in order: system instruction first, then the human turn.
|
20
|
+
SystemMessagePromptTemplate.from_template(system_template),
|
21
|
+
HumanMessagePromptTemplate.from_template(human_template),
|
22
|
+
]
|
23
|
+
title_extract_prompt = ChatPromptTemplate.from_messages(messages)  # Chat prompt used by extract_title(); its only input variable is `context`.
|
24
|
+
|
25
|
+
|
26
|
+
def extract_title(llm, text, max_length=7000) -> str:
    """Generate or extract a title for a document using a chat model.

    The document text is truncated to its first ``max_length`` characters
    and passed as the ``context`` variable of ``title_extract_prompt``.

    Args:
        llm: Language model handed to ``LLMChain``.
        text: Document content to derive the title from; it is sliced, so
            it must support ``text[:max_length]``.
        max_length: Maximum number of leading characters of ``text`` that
            are sent to the model.

    Returns:
        Whatever ``LLMChain.run`` returns for the prompt (the model's
        title answer).
    """
    # Build the chain before slicing, matching the original statement order.
    title_chain = LLMChain(llm=llm, prompt=title_extract_prompt)
    truncated_text = text[:max_length]
    return title_chain.run(context=truncated_text)
|
30
|
+
|
31
|
+
|
32
|
+
if __name__ == '__main__':
    # Manual smoke test: run extract_title() on the opening of a sample
    # annual report and print the model's answer. The api_key is left
    # empty in this file.
    text = (
        "江苏蔚蓝锂芯股份有限公司\n"
        "2021 年年度报告 \n"
        "2022 年 03 月\n\n"
        " 第一节 重要提示、目录和释义\n"
        "公司董事会、监事会及董事、监事、高级管理人员保证年度报告内容的真实、准确、完整,不存在虚假记载、误导性陈述或重大遗漏,并承担个别和连带的法律责任。\n"
        "公司负责人 CHEN KAI、主管会计工作负责人林文华及会计机构负责人(会计主管人员)张宗红声明:保证本年度报告中财务报告的真实、准确、完整。\n"
        "所有董事均已出席了审议本报告的董事会会议。"
    )
    llm = ChatQWen(
        model_name='qwen1.5-72b-chat',
        api_key='',
        temperature=0.01,
    )
    ans = extract_title(llm, text)
    print(ans)
|