flexeval 0.7.2__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flexeval-0.7.2 → flexeval-0.7.4}/PKG-INFO +2 -1
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/openai_api.py +39 -5
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/openai_batch_api.py +32 -14
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/llm_score.py +1 -1
- flexeval-0.7.4/flexeval/core/prompt_template/jinja2.py +37 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/pairwise_judge_reward_model.py +1 -1
- {flexeval-0.7.2 → flexeval-0.7.4}/pyproject.toml +1 -1
- flexeval-0.7.2/flexeval/core/prompt_template/jinja2.py +0 -25
- {flexeval-0.7.2 → flexeval-0.7.4}/LICENSE +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/README.md +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/README.md +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en-ref-gpt4.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja-ref-gpt4.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/rakuda-v2-ja.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en-ref-gpt4.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja-ref-gpt4.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja.jsonl +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/sacrebleu_dataset.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/template_based.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/eval_setups.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_chat_response.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_from_data.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_generation.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_multiple_choice.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_pairwise.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_perplexity.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_reward_model.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/balanced.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/rand.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/sacrebleu_dataset.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/template_based.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/hf_lm.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/vllm_model.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/bleu.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/char_f1.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/code_eval.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/common_prefix_length.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/common_string_length.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/exact_match.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/output_length_stats.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/perspective_api.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/rouge.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/aio.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/last_line.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/noop.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/regex.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/string_strip.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/substring_match.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/mecab.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/sacrebleu_tokenizer.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/whitespace.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/xer.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/template_based.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/llm_judge.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/all_combinations.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/random_combinations.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/bradley_terry.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/win_rate.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/prompt_template/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/prompt_template/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/local_recorder.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/wandb_recorder.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/hf.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/base.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/hf.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/jsonl.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/data_util.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/jinja2_utils.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_chat/mbpp_chat.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval_tab_indent.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/mbpp.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/mbpp_tab_indent.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval_tab_indent.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_chat/mt-en.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_chat/vicuna-en.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/babi.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/commonsense_qa.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/gsm8k.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/squad_v1.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/trivia_qa.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/twitter_sentiment.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/commonsense_qa_mc.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/hellaswag.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/openbookqa.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/xwinograd_en.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_perplexity/tiny_shakespeare.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/aio_chat.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/elyza_tasks_100.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/mgsm_ja_chat.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/mt-ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/rakuda-v2-ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/vicuna-ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/aio.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jcommonsenseqa.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jnli.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jsquad.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/mgsm_ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/wrime_pos_neg.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/xlsum_ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/jcommonsenseqa_mc.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/xwinograd_ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_en_ja.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_ja_en.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation_chat/wmt20_en_ja_chat.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation_chat/wmt20_ja_en_chat.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/assistant_eval_en_single_turn.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/assistant_eval_ja_single_turn.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/elyza_tasks_100_eval.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/PairwiseJudge/assistant_judge_en_single_turn.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/PairwiseJudge/assistant_judge_ja_single_turn.jsonnet +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/common.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_file.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_lm.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_pairwise.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_presets.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_reward.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/utils/__init__.py +0 -0
- {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/utils/module_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexeval
|
|
3
|
-
Version: 0.7.2
|
|
3
|
+
Version: 0.7.4
|
|
4
4
|
Summary:
|
|
5
5
|
Author: ryokan-ri
|
|
6
6
|
Author-email: ryokan.ri@sbintuitions.co.jp
|
|
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
14
|
Provides-Extra: vllm
|
|
14
15
|
Provides-Extra: wandb
|
|
15
16
|
Requires-Dist: datasets (>=2.14.6,<3.0.0)
|
|
@@ -6,28 +6,56 @@ from typing import Any, Awaitable, Callable, TypeVar
|
|
|
6
6
|
import openai
|
|
7
7
|
from loguru import logger
|
|
8
8
|
from openai import AsyncOpenAI
|
|
9
|
+
from openai.types.chat import ChatCompletion, ChatCompletionMessage
|
|
10
|
+
from openai.types.chat.chat_completion import Choice
|
|
9
11
|
|
|
10
12
|
from .base import LanguageModel, normalize_stop_sequences
|
|
11
13
|
|
|
12
14
|
T = TypeVar("T")
|
|
13
15
|
|
|
14
16
|
|
|
17
|
+
# NOTE: current implementation uses only choices[0].message.content field.
|
|
18
|
+
EMPTY_RESPONSE = ChatCompletion(
|
|
19
|
+
id="dummy",
|
|
20
|
+
choices=[
|
|
21
|
+
Choice(
|
|
22
|
+
finish_reason="stop",
|
|
23
|
+
index=0,
|
|
24
|
+
message=ChatCompletionMessage(
|
|
25
|
+
content="", refusal=None, role="assistant", function_call=None, tool_calls=None
|
|
26
|
+
),
|
|
27
|
+
)
|
|
28
|
+
],
|
|
29
|
+
created=946652400, # dummy integer
|
|
30
|
+
model="dummy_model",
|
|
31
|
+
object="chat.completion",
|
|
32
|
+
service_tier=None,
|
|
33
|
+
system_fingerprint=None,
|
|
34
|
+
usage=None,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
15
38
|
async def _retry_on_error(
|
|
16
39
|
openai_call: Callable[[], Awaitable[T]],
|
|
17
40
|
max_num_trials: int = 5,
|
|
18
41
|
first_wait_time: int = 10,
|
|
19
|
-
) -> Awaitable[T]
|
|
42
|
+
) -> Awaitable[T]:
|
|
20
43
|
for i in range(max_num_trials):
|
|
21
44
|
try:
|
|
22
45
|
return await openai_call()
|
|
23
46
|
except openai.APIError as e: # noqa: PERF203
|
|
24
47
|
if i == max_num_trials - 1:
|
|
25
|
-
|
|
48
|
+
# Since reaching maximum number of trials, exit for-loop and return
|
|
49
|
+
# empty response.
|
|
50
|
+
break
|
|
26
51
|
logger.warning(f"We got an error: {e}")
|
|
27
52
|
wait_time_seconds = first_wait_time * (2**i)
|
|
28
53
|
logger.warning(f"Wait for {wait_time_seconds} seconds...")
|
|
29
54
|
await asyncio.sleep(wait_time_seconds)
|
|
30
|
-
|
|
55
|
+
|
|
56
|
+
logger.warning(f"We reached maximum number of trials ({max_num_trials} trials.).")
|
|
57
|
+
logger.warning("Response including empty string is returned.")
|
|
58
|
+
return EMPTY_RESPONSE
|
|
31
59
|
|
|
32
60
|
|
|
33
61
|
class OpenAIChatAPI(LanguageModel):
|
|
@@ -108,7 +136,10 @@ class OpenAIChatAPI(LanguageModel):
|
|
|
108
136
|
**kwargs,
|
|
109
137
|
),
|
|
110
138
|
)
|
|
111
|
-
|
|
139
|
+
completions = [res.choices[0].message.content for res in api_responses]
|
|
140
|
+
if all(completion == "" for completion in completions):
|
|
141
|
+
logger.warning("All generated texts are empty strings. Something may be wrong.")
|
|
142
|
+
return completions
|
|
112
143
|
|
|
113
144
|
def batch_generate_chat_response(
|
|
114
145
|
self,
|
|
@@ -118,7 +149,10 @@ class OpenAIChatAPI(LanguageModel):
|
|
|
118
149
|
api_responses = asyncio.run(
|
|
119
150
|
self._async_batch_run_chatgpt(chat_messages_list, **kwargs),
|
|
120
151
|
)
|
|
121
|
-
|
|
152
|
+
completions = [res.choices[0].message.content for res in api_responses]
|
|
153
|
+
if all(completion == "" for completion in completions):
|
|
154
|
+
logger.warning("All generated texts are empty string. Something may go wrong.")
|
|
155
|
+
return completions
|
|
122
156
|
|
|
123
157
|
def __repr__(self) -> str:
|
|
124
158
|
return f"{self.__class__.__name__}(model={self.model})"
|
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
import tempfile
|
|
7
7
|
import uuid
|
|
8
8
|
from enum import Enum
|
|
9
|
+
from pprint import pformat
|
|
9
10
|
from typing import Any
|
|
10
11
|
|
|
11
12
|
from loguru import logger
|
|
@@ -128,6 +129,10 @@ class OpenAIChatBatchAPI(LanguageModel):
|
|
|
128
129
|
logger.info(f"Current status: {status.value}")
|
|
129
130
|
return status, batch_response
|
|
130
131
|
|
|
132
|
+
def _retrieve_file_content(self, file_id: str) -> list[dict[any, any]]:
|
|
133
|
+
file_response = asyncio.run(self._client.files.content(file_id))
|
|
134
|
+
return [json.loads(line) for line in file_response.text.strip().split("\n")]
|
|
135
|
+
|
|
131
136
|
def _execute_batch_requests(
|
|
132
137
|
self,
|
|
133
138
|
messages_list: list[list[dict[str, str]]],
|
|
@@ -136,13 +141,15 @@ class OpenAIChatBatchAPI(LanguageModel):
|
|
|
136
141
|
custom_id_2_message: dict[str, list[dict[str, str]]] = {
|
|
137
142
|
str(uuid.uuid4()): messages for messages in messages_list
|
|
138
143
|
}
|
|
139
|
-
|
|
140
|
-
|
|
144
|
+
# The response will be an empty string if the API produces an error.
|
|
145
|
+
custom_id_2_response: dict[str, str] = {custom_id: "" for custom_id in custom_id_2_message}
|
|
146
|
+
exec_cnt = 1
|
|
141
147
|
|
|
142
148
|
while len(custom_id_2_message) > 0:
|
|
143
149
|
if exec_cnt > MAX_NUM_TRIALS:
|
|
144
150
|
break
|
|
145
151
|
logger.info(f"Trial {exec_cnt}")
|
|
152
|
+
exec_cnt += 1
|
|
146
153
|
batch_id = asyncio.run(self._post_batch_requests(custom_id_2_message, **kwargs))
|
|
147
154
|
|
|
148
155
|
status, batch_response = asyncio.run(
|
|
@@ -152,13 +159,25 @@ class OpenAIChatBatchAPI(LanguageModel):
|
|
|
152
159
|
error_message = f"Failed: {batch_response}"
|
|
153
160
|
raise ValueError(error_message)
|
|
154
161
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
data.
|
|
161
|
-
|
|
162
|
+
# Check error_file_id exists and if exists, log error details.
|
|
163
|
+
error_file_id = batch_response.error_file_id
|
|
164
|
+
# If any request fails, error_file_id is set.
|
|
165
|
+
if error_file_id is not None:
|
|
166
|
+
logger.warning("Request on some messages failed following reason.")
|
|
167
|
+
data: list[dict[str, Any]] = self._retrieve_file_content(error_file_id)
|
|
168
|
+
# [Error](https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8857])
|
|
169
|
+
# instance is embedded in response.
|
|
170
|
+
for data_i in data:
|
|
171
|
+
error = data_i["response"]
|
|
172
|
+
logger.warning(f"Failed: {error}")
|
|
173
|
+
|
|
174
|
+
output_file_id = batch_response.output_file_id
|
|
175
|
+
# If completion on all input fails, output_file_id is None.
|
|
176
|
+
if output_file_id is None:
|
|
177
|
+
logger.warning("All request failed. Continue...")
|
|
178
|
+
continue
|
|
179
|
+
|
|
180
|
+
data: list[dict[str, Any]] = self._retrieve_file_content(output_file_id)
|
|
162
181
|
for data_i in data:
|
|
163
182
|
if data_i["error"] is not None:
|
|
164
183
|
continue
|
|
@@ -167,11 +186,10 @@ class OpenAIChatBatchAPI(LanguageModel):
|
|
|
167
186
|
custom_id_2_message.pop(custom_id)
|
|
168
187
|
custom_id_2_response[custom_id] = data_i["response"]["body"]["choices"][0]["message"]["content"]
|
|
169
188
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
raise ValueError(error_message)
|
|
189
|
+
# The remaining elements are all those that failed to complete request.
|
|
190
|
+
if custom_id_2_message:
|
|
191
|
+
logger.warning("The following messages failed to complete request.")
|
|
192
|
+
logger.warning(pformat(list(custom_id_2_message.values())))
|
|
175
193
|
|
|
176
194
|
return list(custom_id_2_response.values())
|
|
177
195
|
|
|
@@ -49,7 +49,7 @@ def summarize_evaluator_scores(
|
|
|
49
49
|
if score is None or category_key is None:
|
|
50
50
|
continue
|
|
51
51
|
if category_key in task_inputs:
|
|
52
|
-
category2valid_scores[task_inputs[
|
|
52
|
+
category2valid_scores[task_inputs[category_key]].append(score)
|
|
53
53
|
|
|
54
54
|
category2mean_score: dict[str, float] = {}
|
|
55
55
|
for category, valid_scores in category2valid_scores.items():
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from flexeval.core.utils.jinja2_utils import JINJA2_ENV
|
|
6
|
+
|
|
7
|
+
from .base import PromptTemplate
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Jinja2PromptTemplate(PromptTemplate):
|
|
11
|
+
"""
|
|
12
|
+
Embed task inputs using Jinja2 template engine.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
template: The Jinja2 template to use.
|
|
16
|
+
template_path: The path to a file with the Jinja2 template to use.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def __init__(self, template: str | None = None, template_path: str | None = None) -> None:
|
|
20
|
+
if template is None and template_path is None:
|
|
21
|
+
msg = "Either template or template_path must be provided"
|
|
22
|
+
raise ValueError(msg)
|
|
23
|
+
if template is not None and template_path is not None:
|
|
24
|
+
msg = "Only one of template or template_path can be provided"
|
|
25
|
+
raise ValueError(msg)
|
|
26
|
+
|
|
27
|
+
if template_path is not None:
|
|
28
|
+
with open(template_path) as f:
|
|
29
|
+
self.template = f.read()
|
|
30
|
+
else:
|
|
31
|
+
self.template = template
|
|
32
|
+
|
|
33
|
+
def embed_inputs(self, input_dict: dict[str, Any]) -> str:
|
|
34
|
+
return JINJA2_ENV.from_string(self.template).render(input_dict)
|
|
35
|
+
|
|
36
|
+
def __repr__(self) -> str:
|
|
37
|
+
return f"Jinja2PromptTemplate(template={self.template!r})"
|
|
@@ -120,6 +120,6 @@ class PairwiseJudgeRewardModel(RewardModel):
|
|
|
120
120
|
|
|
121
121
|
for i in range(len(outputs)):
|
|
122
122
|
outputs[i]["llm_outputs"] = [judge_outputs[i * 2], judge_outputs[i * 2 + 1]]
|
|
123
|
-
outputs[i]["
|
|
123
|
+
outputs[i]["evaluation_results"] = [chosen_is_betters[i * 2], chosen_is_betters[i * 2 + 1]]
|
|
124
124
|
|
|
125
125
|
return chosen_is_betters, outputs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "flexeval"
|
|
3
|
-
version = "0.7.2"
|
|
3
|
+
version = "0.7.4" # This will be automatically set from git tag by poetry-dynamic-versioning
|
|
4
4
|
description = ""
|
|
5
5
|
authors = ["ryokan-ri <ryokan.ri@sbintuitions.co.jp>"]
|
|
6
6
|
readme = "README.md"
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from flexeval.core.utils.jinja2_utils import JINJA2_ENV
|
|
6
|
-
|
|
7
|
-
from .base import PromptTemplate
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class Jinja2PromptTemplate(PromptTemplate):
|
|
11
|
-
"""
|
|
12
|
-
Embed task inputs using Jinja2 template engine.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
template: The Jinja2 template to use.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
def __init__(self, template: str) -> None:
|
|
19
|
-
self.template = template
|
|
20
|
-
|
|
21
|
-
def embed_inputs(self, input_dict: dict[str, Any]) -> str:
|
|
22
|
-
return JINJA2_ENV.from_string(self.template).render(input_dict)
|
|
23
|
-
|
|
24
|
-
def __repr__(self) -> str:
|
|
25
|
-
return f"Jinja2PromptTemplate(template={self.template!r})"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/README.md
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/all_combinations.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_chat/mbpp_chat.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/mbpp.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_chat/vicuna-en.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/babi.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/gsm8k.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/squad_v1.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/trivia_qa.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/aio_chat.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/elyza_tasks_100.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/mgsm_ja_chat.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/rakuda-v2-ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/vicuna-ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/aio.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jnli.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jsquad.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/mgsm_ja.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/xlsum_ja.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_en_ja.jsonnet
RENAMED
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_ja_en.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/elyza_tasks_100_eval.jsonnet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|