flexeval 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flexeval-0.3.1 → flexeval-0.3.2}/PKG-INFO +2 -1
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/llm_score.py +6 -2
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/judge/llm_judge.py +9 -2
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/Metric/assistant_eval_gpt4_ja_single_turn.jsonnet +1 -0
- flexeval-0.3.2/flexeval/preset_configs/PairwiseJudge/assistant_judge_gpt4_ja_single_turn.jsonnet +49 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/flexeval_file.py +5 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/flexeval_lm.py +9 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/flexeval_pairwise.py +4 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/pyproject.toml +1 -7
- flexeval-0.3.1/flexeval/preset_configs/PairwiseJudge/assistant_judge_gpt4_ja_single_turn.jsonnet +0 -43
- {flexeval-0.3.1 → flexeval-0.3.2}/LICENSE +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/README.md +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/README.md +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en-ref-gpt4.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja-ref-gpt4.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/rakuda-v2-ja.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en-ref-gpt4.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja-ref-gpt4.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja.jsonl +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/hf_dataset.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_chat_response.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_from_file.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_generation.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_multiple_choice.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_pairwise.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/evaluate_perplexity.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/few_shot_generator/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/few_shot_generator/balanced.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/few_shot_generator/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/few_shot_generator/rand.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/generation_dataset/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/generation_dataset/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/generation_dataset/hf_dataset.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/generation_dataset/jsonl.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/generation_dataset/sacrebleu_dataset.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/language_model/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/language_model/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/language_model/hf_lm.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/language_model/openai_chatgpt.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/language_model/vllm_model.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/bleu.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/char_f1.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/code_eval.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/common_prefix_length.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/common_string_length.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/exact_match.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/normalizer/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/normalizer/aio.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/normalizer/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/normalizer/regex.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/output_length_stats.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/perspective_api.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/rouge.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/substring_match.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/tokenizer/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/tokenizer/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/tokenizer/mecab.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/tokenizer/sacrebleu_tokenizer.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/tokenizer/whitespace.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/metric/xer.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/multiple_choice_dataset/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/multiple_choice_dataset/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/multiple_choice_dataset/hf_dataset.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/judge/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/judge/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match_maker/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match_maker/all_combinations.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match_maker/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match_maker/random_combinations.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/scorer/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/scorer/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/scorer/bradley_terry.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/scorer/win_rate.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/prompt_template/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/prompt_template/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/prompt_template/jinja2.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/text_dataset/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/text_dataset/base.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/text_dataset/hf.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/text_dataset/jsonl.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/utils/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/utils/data_util.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/utils/jinja2_env.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval_tab_indent.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/mbpp.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/mbpp_tab_indent.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval_tab_indent.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_chat/mt-en.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_chat/vicuna-en.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/babi.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/commonsense_qa.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/gsm8k.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/squad_v1.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/trivia_qa.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/twitter_sentiment.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_multiple_choice/commonsense_qa_mc.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_multiple_choice/hellaswag.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_multiple_choice/openbookqa.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_multiple_choice/xwinograd_en.jsonnet +0 -0
- {flexeval-0.3.1/flexeval/preset_configs/EvalSetup/en_preplexity → flexeval-0.3.2/flexeval/preset_configs/EvalSetup/en_perplexity}/tiny_shakespeare.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/elyze_tasks_100.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/mt-ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/rakuda-v2-ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/vicuna-ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/aio.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/jcommonsenseqa.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/jnli.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/jsquad.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/mgsm_ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/wrime_pos_neg.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/xlsum_ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/jcommonsenseqa_mc.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/xwinograd_ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/translation/wmt20_en_ja.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/translation/wmt20_ja_en.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/Metric/assistant_eval_gpt4_en_single_turn.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/PairwiseJudge/assistant_judge_gpt4_en_single_turn.jsonnet +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/__init__.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/common.py +0 -0
- {flexeval-0.3.1 → flexeval-0.3.2}/flexeval/scripts/flexeval_presets.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexeval
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary:
|
|
5
5
|
Author: ryokan-ri
|
|
6
6
|
Author-email: ryokan.ri@sbintuitions.co.jp
|
|
@@ -9,6 +9,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
9
9
|
Classifier: Programming Language :: Python :: 3.9
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
13
|
Provides-Extra: vllm
|
|
13
14
|
Requires-Dist: datasets (>=2.14.6,<3.0.0)
|
|
14
15
|
Requires-Dist: evaluate (>=0.4.1,<0.5.0)
|
|
@@ -109,7 +109,7 @@ class ChatLLMScore(Metric):
|
|
|
109
109
|
self,
|
|
110
110
|
language_model: LanguageModel,
|
|
111
111
|
prompt_template: PromptTemplate,
|
|
112
|
-
system_message: str | None = None,
|
|
112
|
+
system_message: str | PromptTemplate | None = None,
|
|
113
113
|
batch_size: int = 4,
|
|
114
114
|
) -> None:
|
|
115
115
|
self._language_model = language_model
|
|
@@ -151,9 +151,13 @@ class ChatLLMScore(Metric):
|
|
|
151
151
|
evaluator_input = self._prompt_template.embed_input(prompt_inputs)
|
|
152
152
|
input_chat_messages = [{"role": "user", "content": evaluator_input}]
|
|
153
153
|
if self._system_message:
|
|
154
|
+
if isinstance(self._system_message, str):
|
|
155
|
+
system_message = self._system_message
|
|
156
|
+
else:
|
|
157
|
+
system_message = self._system_message.embed_input(prompt_inputs)
|
|
154
158
|
input_chat_messages.insert(
|
|
155
159
|
0,
|
|
156
|
-
{"role": "system", "content": self._system_message},
|
|
160
|
+
{"role": "system", "content": system_message},
|
|
157
161
|
)
|
|
158
162
|
evaluator_input_list.append(input_chat_messages)
|
|
159
163
|
|
|
@@ -26,7 +26,7 @@ class ChatLLMPairwiseJudge(PairwiseJudge):
|
|
|
26
26
|
self,
|
|
27
27
|
language_model: LanguageModel,
|
|
28
28
|
prompt_template: PromptTemplate,
|
|
29
|
-
system_message: str | None = None,
|
|
29
|
+
system_message: str | PromptTemplate | None = None,
|
|
30
30
|
) -> None:
|
|
31
31
|
self._language_model = language_model
|
|
32
32
|
self._prompt_template = prompt_template
|
|
@@ -76,7 +76,14 @@ class ChatLLMPairwiseJudge(PairwiseJudge):
|
|
|
76
76
|
judge_input = self._prompt_template.embed_input(prompt_inputs)
|
|
77
77
|
input_chat_messages = [{"role": "user", "content": judge_input}]
|
|
78
78
|
if self._system_message:
|
|
79
|
-
|
|
79
|
+
if isinstance(self._system_message, str):
|
|
80
|
+
system_message = self._system_message
|
|
81
|
+
else:
|
|
82
|
+
system_message = self._system_message.embed_input(prompt_inputs)
|
|
83
|
+
input_chat_messages.insert(
|
|
84
|
+
0,
|
|
85
|
+
{"role": "system", "content": system_message},
|
|
86
|
+
)
|
|
80
87
|
input_chat_messages_list.append(input_chat_messages)
|
|
81
88
|
judge_outputs = self._language_model.batch_generate_chat_response(input_chat_messages_list)
|
|
82
89
|
return [self._parse_judge_output(output) for output in judge_outputs]
|
flexeval-0.3.2/flexeval/preset_configs/PairwiseJudge/assistant_judge_gpt4_ja_single_turn.jsonnet
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/*
|
|
2
|
+
This is a configuration for evaluating the quality of responses generated by an AI assistant.
|
|
3
|
+
Originally used to generate scores for the Japanese versions of MT-bench or Vicuna-bench.
|
|
4
|
+
|
|
5
|
+
Translated and adapted from [lm-sys/FastChat](https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/judge_prompts.jsonl).
|
|
6
|
+
*/
|
|
7
|
+
{
|
|
8
|
+
class_path: 'ChatLLMPairwiseJudge',
|
|
9
|
+
init_args: {
|
|
10
|
+
language_model: { class_path: 'OpenAIChatGPT', init_args: { model_name: 'gpt-4-turbo-2024-04-09' } },
|
|
11
|
+
prompt_template: {
|
|
12
|
+
class_path: 'Jinja2PromptTemplate',
|
|
13
|
+
init_args: {
|
|
14
|
+
template: |||
|
|
15
|
+
{% set question = model1_item["task_inputs"]["messages"][0]["content"] -%}
|
|
16
|
+
{% set model1_chat = model1_item["task_inputs"]["messages"] -%}
|
|
17
|
+
{% set model2_chat = model2_item["task_inputs"]["messages"] -%}
|
|
18
|
+
|
|
19
|
+
[ユーザの質問]
|
|
20
|
+
{{ model1_item["task_inputs"]["chat"][0]["content"] }}
|
|
21
|
+
|
|
22
|
+
{% if references|length > 0 -%}
|
|
23
|
+
[参考回答の開始]
|
|
24
|
+
{{ references[0] }}
|
|
25
|
+
[参考回答の終了]
|
|
26
|
+
{% endif -%}
|
|
27
|
+
[アシスタント1の回答開始]
|
|
28
|
+
{% if model1_chat|length == 1 %}{{ model1_item["lm_output"] }}{% else %}{{ model1_chat[1]["content"] }}{% endif %}
|
|
29
|
+
[アシスタント1の回答終了]
|
|
30
|
+
[アシスタント2の回答開始]
|
|
31
|
+
{% if model2_chat|length == 1 %}{{ model2_item["lm_output"] }}{% else %}{{ model2_chat[1]["content"] }}{% endif %}
|
|
32
|
+
[アシスタント2の回答終了]
|
|
33
|
+
|||,
|
|
34
|
+
},
|
|
35
|
+
},
|
|
36
|
+
system_message: {
|
|
37
|
+
class_path: 'Jinja2PromptTemplate',
|
|
38
|
+
init_args: {
|
|
39
|
+
template: |||
|
|
40
|
+
{% if references|length > 0 -%}
|
|
41
|
+
あなたは、回答の質をチェックするための審判員です。以下に示されるユーザーの質問に対する2つのAIアシスタントの応答の品質を評価してください。回答の内容がユーザーの指示に従っており、ユーザーの質問によりよく答えているアシスタントを選んでください。参照回答、アシスタント1の回答、アシスタント2の回答が与えられるので、どちらのアシスタントの回答が優れているかを評価してください。評価の際には、まずそれぞれのアシスタントの回答を参照回答と比較し、回答の誤りを見つけて修正してください。立場が偏らないようにし、回答の提示順があなたの判断に影響しないようにしてください。回答の長さが評価に影響しないこと、特定のアシスタントの名前を好まないこと、できるだけ客観的であること、に気をつけてください。説明の後に、最終的な判断を以下の形式に従って出力してください:アシスタント1が優れていれば[[1]]、アシスタント2が優れていれば[[2]]、同点の場合は[[3]]
|
|
42
|
+
{%- else -%}
|
|
43
|
+
あなたは、回答の質をチェックするための審判員です。以下に示されるユーザーの質問に対する2つのAIアシスタントの応答の品質を評価してください。回答の内容がユーザーの指示に従っており、ユーザーの質問によりよく答えているアシスタントを選んでください。具体的には、回答の有用性、関連性、正確性、深さ、創造性、詳細レベルなどの要素を考慮する必要があります。評価の際には、まず2つの回答を比較し、簡単な説明をしてください。立場が偏らないようにし、回答の提示順があなたの判断に影響しないようにしてください。回答の長さが評価に影響しないこと、特定のアシスタントの名前を好まないこと、できるだけ客観的であること、に気をつけてください。説明の後に、最終的な判断を以下の形式に従って出力してください:アシスタント1が優れていれば[[1]]、アシスタント2が優れていれば[[2]]、同点の場合は[[3]]
|
|
44
|
+
{%- endif %}
|
|
45
|
+
|||,
|
|
46
|
+
},
|
|
47
|
+
},
|
|
48
|
+
},
|
|
49
|
+
}
|
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import sys
|
|
6
7
|
from importlib.metadata import version
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any, Dict, List, Union
|
|
@@ -76,6 +77,10 @@ def main() -> None:
|
|
|
76
77
|
help="Path to the config file",
|
|
77
78
|
)
|
|
78
79
|
|
|
80
|
+
# Add the current directory to sys.path
|
|
81
|
+
# to enable importing modules from the directory where this script is executed.
|
|
82
|
+
sys.path.append(os.environ.get("ADDITIONAL_MODULES_PATH", "./"))
|
|
83
|
+
|
|
79
84
|
args = parser.parse_args()
|
|
80
85
|
logger.info(args)
|
|
81
86
|
|
|
@@ -188,6 +188,7 @@ def main() -> None: # noqa: C901, PLR0912, PLR0915
|
|
|
188
188
|
"You can specify the parameters, the path to the config file, or the name of the preset config.",
|
|
189
189
|
enable_path=True,
|
|
190
190
|
)
|
|
191
|
+
# Saving arguments
|
|
191
192
|
parser.add_argument(
|
|
192
193
|
"--save_dir",
|
|
193
194
|
type=str,
|
|
@@ -200,11 +201,13 @@ def main() -> None: # noqa: C901, PLR0912, PLR0915
|
|
|
200
201
|
default=False,
|
|
201
202
|
help="Overwrite the save_dir if it exists",
|
|
202
203
|
)
|
|
204
|
+
# Argument parsing arguments
|
|
203
205
|
parser.add_argument(
|
|
204
206
|
"--config",
|
|
205
207
|
action=ActionConfigFile,
|
|
206
208
|
help="Path to the config file",
|
|
207
209
|
)
|
|
210
|
+
# Metadata
|
|
208
211
|
parser.add_argument(
|
|
209
212
|
"--metadata",
|
|
210
213
|
type=Dict[str, Any],
|
|
@@ -230,6 +233,10 @@ def main() -> None: # noqa: C901, PLR0912, PLR0915
|
|
|
230
233
|
if resolved_config_path is not None:
|
|
231
234
|
sys.argv[i + 1] = resolved_config_path
|
|
232
235
|
|
|
236
|
+
# Add the current directory to sys.path
|
|
237
|
+
# to enable importing modules from the directory where this script is executed.
|
|
238
|
+
sys.path.append(os.environ.get("ADDITIONAL_MODULES_PATH", "./"))
|
|
239
|
+
|
|
233
240
|
args = parser.parse_args()
|
|
234
241
|
logger.info(args)
|
|
235
242
|
logger.info(f"flexeval version: {version('flexeval')}")
|
|
@@ -320,6 +327,8 @@ def main() -> None: # noqa: C901, PLR0912, PLR0915
|
|
|
320
327
|
f"Overwriting the existing file: {save_dir / CONFIG_FILE_NAME}",
|
|
321
328
|
)
|
|
322
329
|
|
|
330
|
+
save_json(task_config, save_dir / CONFIG_FILE_NAME)
|
|
331
|
+
|
|
323
332
|
try:
|
|
324
333
|
with Timer() as timer:
|
|
325
334
|
metrics, outputs = eval_setup.evaluate_lm(
|
|
@@ -90,6 +90,10 @@ def main() -> None:
|
|
|
90
90
|
if resolved_config_path is not None:
|
|
91
91
|
sys.argv[i + 1] = resolved_config_path
|
|
92
92
|
|
|
93
|
+
# Add the current directory to sys.path
|
|
94
|
+
# to enable importing modules from the directory where this script is executed.
|
|
95
|
+
sys.path.append(os.environ.get("ADDITIONAL_MODULES_PATH", "./"))
|
|
96
|
+
|
|
93
97
|
args = parser.parse_args()
|
|
94
98
|
logger.info(args)
|
|
95
99
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "flexeval"
|
|
3
|
-
version = "0.3.1"
|
|
3
|
+
version = "0.3.2" # This will be automatically set from git tag by poetry-dynamic-versioning
|
|
4
4
|
description = ""
|
|
5
5
|
authors = ["ryokan-ri <ryokan.ri@sbintuitions.co.jp>"]
|
|
6
6
|
readme = "README.md"
|
|
@@ -47,12 +47,6 @@ mkdocs-gen-files = "^0.5.0"
|
|
|
47
47
|
mkdocs-literate-nav = "^0.6.1"
|
|
48
48
|
mkdocs-section-index = "^0.3.9"
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
[[tool.poetry.source]]
|
|
52
|
-
name = "pypi_test"
|
|
53
|
-
url = "https://test.pypi.org/"
|
|
54
|
-
priority = "supplemental"
|
|
55
|
-
|
|
56
50
|
[build-system]
|
|
57
51
|
requires = ["poetry-core", "poetry-dynamic-versioning"]
|
|
58
52
|
build-backend = "poetry_dynamic_versioning.backend"
|
flexeval-0.3.1/flexeval/preset_configs/PairwiseJudge/assistant_judge_gpt4_ja_single_turn.jsonnet
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
This is a configuration for evaluting the quality of responses generated by an AI assistant.
|
|
3
|
-
Originally used to generate scores for the Japanese versions of MT-bench or Vicuna-bench.
|
|
4
|
-
|
|
5
|
-
Translated and adapted from [lm-sys/FastChat](https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/judge_prompts.jsonl).
|
|
6
|
-
*/
|
|
7
|
-
{
|
|
8
|
-
class_path: 'ChatLLMPairwiseJudge',
|
|
9
|
-
init_args: {
|
|
10
|
-
language_model: { class_path: 'OpenAIChatGPT', init_args: { model_name: 'gpt-4-turbo-2024-04-09' } },
|
|
11
|
-
prompt_template: {
|
|
12
|
-
class_path: 'Jinja2PromptTemplate',
|
|
13
|
-
init_args: {
|
|
14
|
-
template: |||
|
|
15
|
-
{% set question = model1_item["task_inputs"]["messages"][0]["content"] -%}
|
|
16
|
-
{% set model1_chat = model1_item["task_inputs"]["messages"] -%}
|
|
17
|
-
{% set model2_chat = model2_item["task_inputs"]["messages"] -%}
|
|
18
|
-
[指示]
|
|
19
|
-
{% if references|length > 0 -%}
|
|
20
|
-
以下に示されるユーザーの質問に対する2つのAIアシスタントの応答の品質を評価してください。回答の内容がユーザーの指示に従っており、ユーザーの質問によりよく答えているアシスタントを選んでください。参照回答、アシスタント1の回答、アシスタント2の回答が与えられるので、どちらのアシスタントの回答が優れているかを評価してください。評価の際には、まずそれぞれのアシスタントの回答を参照回答と比較し、回答の誤りを見つけて修正してください。立場が偏らないようにし、回答の提示順があなたの判断に影響しないようにしてください。回答の長さが評価に影響しないこと、特定のアシスタントの名前を好まないこと、できるだけ客観的であること、に気をつけてください。説明の後に、最終的な判断を以下の形式に従って出力してください:アシスタント1が優れていれば[[1]]、アシスタント2が優れていれば[[2]]、同点の場合は[[3]]
|
|
21
|
-
{%- else -%}
|
|
22
|
-
以下に示されるユーザーの質問に対する2つのAIアシスタントの応答の品質を評価してください。回答の内容がユーザーの指示に従っており、ユーザーの質問によりよく答えているアシスタントを選んでください。具体的には、回答の有用性、関連性、正確性、深さ、創造性、詳細レベルなどの要素を考慮する必要があります。評価の際には、まず2つの回答を比較し、簡単な説明をしてください。立場が偏らないようにし、回答の提示順があなたの判断に影響しないようにしてください。回答の長さが評価に影響しないこと、特定のアシスタントの名前を好まないこと、できるだけ客観的であること、に気をつけてください。説明の後に、最終的な判断を以下の形式に従って出力してください:アシスタント1が優れていれば[[1]]、アシスタント2が優れていれば[[2]]、同点の場合は[[3]]
|
|
23
|
-
{%- endif %}
|
|
24
|
-
|
|
25
|
-
[ユーザの質問]
|
|
26
|
-
{{ model1_item["task_inputs"]["chat"][0]["content"] }}
|
|
27
|
-
|
|
28
|
-
{% if references|length > 0 -%}
|
|
29
|
-
[参考回答の開始]
|
|
30
|
-
{{ references[0] }}
|
|
31
|
-
[参考回答の終了]
|
|
32
|
-
{% endif -%}
|
|
33
|
-
[アシスタント1の回答開始]
|
|
34
|
-
{% if model1_chat|length == 1 %}{{ model1_item["lm_output"] }}{% else %}{{ model1_chat[1]["content"] }}{% endif %}
|
|
35
|
-
[アシスタント1の回答終了]
|
|
36
|
-
[アシスタント2の回答開始]
|
|
37
|
-
{% if model2_chat|length == 1 %}{{ model2_item["lm_output"] }}{% else %}{{ model2_chat[1]["content"] }}{% endif %}
|
|
38
|
-
[アシスタント2の回答終了]
|
|
39
|
-
|||,
|
|
40
|
-
},
|
|
41
|
-
},
|
|
42
|
-
},
|
|
43
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/README.md
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/core/pairwise_comparison/match_maker/all_combinations.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/code_generation/mbpp.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_chat/vicuna-en.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/babi.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/gsm8k.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/squad_v1.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/en_generation/trivia_qa.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/elyze_tasks_100.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/rakuda-v2-ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_chat/vicuna-ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/aio.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/jnli.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/jsquad.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/mgsm_ja.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/ja_generation/xlsum_ja.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/translation/wmt20_en_ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.3.1 → flexeval-0.3.2}/flexeval/preset_configs/EvalSetup/translation/wmt20_ja_en.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|