flexeval 0.7.2__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. {flexeval-0.7.2 → flexeval-0.7.4}/PKG-INFO +2 -1
  2. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/openai_api.py +39 -5
  3. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/openai_batch_api.py +32 -14
  4. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/llm_score.py +1 -1
  5. flexeval-0.7.4/flexeval/core/prompt_template/jinja2.py +37 -0
  6. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/pairwise_judge_reward_model.py +1 -1
  7. {flexeval-0.7.2 → flexeval-0.7.4}/pyproject.toml +1 -1
  8. flexeval-0.7.2/flexeval/core/prompt_template/jinja2.py +0 -25
  9. {flexeval-0.7.2 → flexeval-0.7.4}/LICENSE +0 -0
  10. {flexeval-0.7.2 → flexeval-0.7.4}/README.md +0 -0
  11. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/__init__.py +0 -0
  12. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/__init__.py +0 -0
  13. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/__init__.py +0 -0
  14. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/base.py +0 -0
  15. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench.py +0 -0
  16. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/README.md +0 -0
  17. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en-ref-gpt4.jsonl +0 -0
  18. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-en.jsonl +0 -0
  19. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja-ref-gpt4.jsonl +0 -0
  20. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/mt-ja.jsonl +0 -0
  21. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/rakuda-v2-ja.jsonl +0 -0
  22. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en-ref-gpt4.jsonl +0 -0
  23. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-en.jsonl +0 -0
  24. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja-ref-gpt4.jsonl +0 -0
  25. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/chatbot_bench_datasets/vicuna-ja.jsonl +0 -0
  26. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/sacrebleu_dataset.py +0 -0
  27. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/chat_dataset/template_based.py +0 -0
  28. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/eval_setups.py +0 -0
  29. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_chat_response.py +0 -0
  30. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_from_data.py +0 -0
  31. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_generation.py +0 -0
  32. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_multiple_choice.py +0 -0
  33. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_pairwise.py +0 -0
  34. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_perplexity.py +0 -0
  35. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/evaluate_reward_model.py +0 -0
  36. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/__init__.py +0 -0
  37. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/balanced.py +0 -0
  38. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/base.py +0 -0
  39. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/few_shot_generator/rand.py +0 -0
  40. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/__init__.py +0 -0
  41. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/base.py +0 -0
  42. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/sacrebleu_dataset.py +0 -0
  43. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/generation_dataset/template_based.py +0 -0
  44. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/__init__.py +0 -0
  45. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/base.py +0 -0
  46. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/hf_lm.py +0 -0
  47. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/language_model/vllm_model.py +0 -0
  48. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/__init__.py +0 -0
  49. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/base.py +0 -0
  50. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/bleu.py +0 -0
  51. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/char_f1.py +0 -0
  52. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/code_eval.py +0 -0
  53. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/common_prefix_length.py +0 -0
  54. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/common_string_length.py +0 -0
  55. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/exact_match.py +0 -0
  56. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/output_length_stats.py +0 -0
  57. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/perspective_api.py +0 -0
  58. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/rouge.py +0 -0
  59. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/__init__.py +0 -0
  60. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/aio.py +0 -0
  61. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/base.py +0 -0
  62. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/last_line.py +0 -0
  63. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/noop.py +0 -0
  64. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/regex.py +0 -0
  65. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/string_processor/string_strip.py +0 -0
  66. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/substring_match.py +0 -0
  67. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/__init__.py +0 -0
  68. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/base.py +0 -0
  69. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/mecab.py +0 -0
  70. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/sacrebleu_tokenizer.py +0 -0
  71. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/tokenizer/whitespace.py +0 -0
  72. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/metric/xer.py +0 -0
  73. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/__init__.py +0 -0
  74. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/base.py +0 -0
  75. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/multiple_choice_dataset/template_based.py +0 -0
  76. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/__init__.py +0 -0
  77. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/__init__.py +0 -0
  78. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/base.py +0 -0
  79. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/judge/llm_judge.py +0 -0
  80. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match.py +0 -0
  81. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/__init__.py +0 -0
  82. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/all_combinations.py +0 -0
  83. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/base.py +0 -0
  84. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/match_maker/random_combinations.py +0 -0
  85. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/__init__.py +0 -0
  86. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/base.py +0 -0
  87. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/bradley_terry.py +0 -0
  88. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/pairwise_comparison/scorer/win_rate.py +0 -0
  89. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/prompt_template/__init__.py +0 -0
  90. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/prompt_template/base.py +0 -0
  91. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/__init__.py +0 -0
  92. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/base.py +0 -0
  93. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/local_recorder.py +0 -0
  94. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/result_recorder/wandb_recorder.py +0 -0
  95. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/__init__.py +0 -0
  96. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/base.py +0 -0
  97. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_bench_dataset/hf.py +0 -0
  98. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/__init__.py +0 -0
  99. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/reward_model/base.py +0 -0
  100. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/__init__.py +0 -0
  101. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/base.py +0 -0
  102. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/hf.py +0 -0
  103. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/text_dataset/jsonl.py +0 -0
  104. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/__init__.py +0 -0
  105. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/data_util.py +0 -0
  106. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/core/utils/jinja2_utils.py +0 -0
  107. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_chat/mbpp_chat.jsonnet +0 -0
  108. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval.jsonnet +0 -0
  109. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/jhumaneval_tab_indent.jsonnet +0 -0
  110. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/mbpp.jsonnet +0 -0
  111. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/mbpp_tab_indent.jsonnet +0 -0
  112. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval.jsonnet +0 -0
  113. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/code_generation/openai_humaneval_tab_indent.jsonnet +0 -0
  114. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_chat/mt-en.jsonnet +0 -0
  115. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_chat/vicuna-en.jsonnet +0 -0
  116. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/babi.jsonnet +0 -0
  117. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/commonsense_qa.jsonnet +0 -0
  118. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/gsm8k.jsonnet +0 -0
  119. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/squad_v1.jsonnet +0 -0
  120. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/trivia_qa.jsonnet +0 -0
  121. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_generation/twitter_sentiment.jsonnet +0 -0
  122. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/commonsense_qa_mc.jsonnet +0 -0
  123. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/hellaswag.jsonnet +0 -0
  124. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/openbookqa.jsonnet +0 -0
  125. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_multiple_choice/xwinograd_en.jsonnet +0 -0
  126. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/en_perplexity/tiny_shakespeare.jsonnet +0 -0
  127. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/aio_chat.jsonnet +0 -0
  128. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/elyza_tasks_100.jsonnet +0 -0
  129. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/mgsm_ja_chat.jsonnet +0 -0
  130. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/mt-ja.jsonnet +0 -0
  131. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/rakuda-v2-ja.jsonnet +0 -0
  132. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_chat/vicuna-ja.jsonnet +0 -0
  133. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/aio.jsonnet +0 -0
  134. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jcommonsenseqa.jsonnet +0 -0
  135. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jnli.jsonnet +0 -0
  136. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/jsquad.jsonnet +0 -0
  137. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/mgsm_ja.jsonnet +0 -0
  138. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/wrime_pos_neg.jsonnet +0 -0
  139. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_generation/xlsum_ja.jsonnet +0 -0
  140. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/jcommonsenseqa_mc.jsonnet +0 -0
  141. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/ja_multiple_choice/xwinograd_ja.jsonnet +0 -0
  142. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_en_ja.jsonnet +0 -0
  143. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation/wmt20_ja_en.jsonnet +0 -0
  144. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation_chat/wmt20_en_ja_chat.jsonnet +0 -0
  145. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/EvalSetup/translation_chat/wmt20_ja_en_chat.jsonnet +0 -0
  146. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/assistant_eval_en_single_turn.jsonnet +0 -0
  147. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/assistant_eval_ja_single_turn.jsonnet +0 -0
  148. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/Metric/elyza_tasks_100_eval.jsonnet +0 -0
  149. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/PairwiseJudge/assistant_judge_en_single_turn.jsonnet +0 -0
  150. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/preset_configs/PairwiseJudge/assistant_judge_ja_single_turn.jsonnet +0 -0
  151. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/__init__.py +0 -0
  152. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/common.py +0 -0
  153. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_file.py +0 -0
  154. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_lm.py +0 -0
  155. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_pairwise.py +0 -0
  156. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_presets.py +0 -0
  157. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/scripts/flexeval_reward.py +0 -0
  158. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/utils/__init__.py +0 -0
  159. {flexeval-0.7.2 → flexeval-0.7.4}/flexeval/utils/module_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexeval
3
- Version: 0.7.2
3
+ Version: 0.7.4
4
4
  Summary:
5
5
  Author: ryokan-ri
6
6
  Author-email: ryokan.ri@sbintuitions.co.jp
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
10
10
  Classifier: Programming Language :: Python :: 3.10
11
11
  Classifier: Programming Language :: Python :: 3.11
12
12
  Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
13
14
  Provides-Extra: vllm
14
15
  Provides-Extra: wandb
15
16
  Requires-Dist: datasets (>=2.14.6,<3.0.0)
@@ -6,28 +6,56 @@ from typing import Any, Awaitable, Callable, TypeVar
6
6
  import openai
7
7
  from loguru import logger
8
8
  from openai import AsyncOpenAI
9
+ from openai.types.chat import ChatCompletion, ChatCompletionMessage
10
+ from openai.types.chat.chat_completion import Choice
9
11
 
10
12
  from .base import LanguageModel, normalize_stop_sequences
11
13
 
12
14
  T = TypeVar("T")
13
15
 
14
16
 
17
+ # NOTE: current implementation uses only choices[0].message.content field.
18
+ EMPTY_RESPONSE = ChatCompletion(
19
+ id="dummy",
20
+ choices=[
21
+ Choice(
22
+ finish_reason="stop",
23
+ index=0,
24
+ message=ChatCompletionMessage(
25
+ content="", refusal=None, role="assistant", function_call=None, tool_calls=None
26
+ ),
27
+ )
28
+ ],
29
+ created=946652400, # dummy integer
30
+ model="dummy_model",
31
+ object="chat.completion",
32
+ service_tier=None,
33
+ system_fingerprint=None,
34
+ usage=None,
35
+ )
36
+
37
+
15
38
  async def _retry_on_error(
16
39
  openai_call: Callable[[], Awaitable[T]],
17
40
  max_num_trials: int = 5,
18
41
  first_wait_time: int = 10,
19
- ) -> Awaitable[T] | None:
42
+ ) -> Awaitable[T]:
20
43
  for i in range(max_num_trials):
21
44
  try:
22
45
  return await openai_call()
23
46
  except openai.APIError as e: # noqa: PERF203
24
47
  if i == max_num_trials - 1:
25
- raise
48
+ # The maximum number of trials has been reached, so exit the for-loop and
49
+ # return an empty response.
50
+ break
26
51
  logger.warning(f"We got an error: {e}")
27
52
  wait_time_seconds = first_wait_time * (2**i)
28
53
  logger.warning(f"Wait for {wait_time_seconds} seconds...")
29
54
  await asyncio.sleep(wait_time_seconds)
30
- return None
55
+
56
+ logger.warning(f"We reached maximum number of trials ({max_num_trials} trials.).")
57
+ logger.warning("Response including empty string is returned.")
58
+ return EMPTY_RESPONSE
31
59
 
32
60
 
33
61
  class OpenAIChatAPI(LanguageModel):
@@ -108,7 +136,10 @@ class OpenAIChatAPI(LanguageModel):
108
136
  **kwargs,
109
137
  ),
110
138
  )
111
- return [res.choices[0].message.content for res in api_responses]
139
+ completions = [res.choices[0].message.content for res in api_responses]
140
+ if all(completion == "" for completion in completions):
141
+ logger.warning("All generated texts are empty strings. Something may be wrong.")
142
+ return completions
112
143
 
113
144
  def batch_generate_chat_response(
114
145
  self,
@@ -118,7 +149,10 @@ class OpenAIChatAPI(LanguageModel):
118
149
  api_responses = asyncio.run(
119
150
  self._async_batch_run_chatgpt(chat_messages_list, **kwargs),
120
151
  )
121
- return [res.choices[0].message.content for res in api_responses]
152
+ completions = [res.choices[0].message.content for res in api_responses]
153
+ if all(completion == "" for completion in completions):
154
+ logger.warning("All generated texts are empty string. Something may go wrong.")
155
+ return completions
122
156
 
123
157
  def __repr__(self) -> str:
124
158
  return f"{self.__class__.__name__}(model={self.model})"
@@ -6,6 +6,7 @@ import os
6
6
  import tempfile
7
7
  import uuid
8
8
  from enum import Enum
9
+ from pprint import pformat
9
10
  from typing import Any
10
11
 
11
12
  from loguru import logger
@@ -128,6 +129,10 @@ class OpenAIChatBatchAPI(LanguageModel):
128
129
  logger.info(f"Current status: {status.value}")
129
130
  return status, batch_response
130
131
 
132
+ def _retrieve_file_content(self, file_id: str) -> list[dict[any, any]]:
133
+ file_response = asyncio.run(self._client.files.content(file_id))
134
+ return [json.loads(line) for line in file_response.text.strip().split("\n")]
135
+
131
136
  def _execute_batch_requests(
132
137
  self,
133
138
  messages_list: list[list[dict[str, str]]],
@@ -136,13 +141,15 @@ class OpenAIChatBatchAPI(LanguageModel):
136
141
  custom_id_2_message: dict[str, list[dict[str, str]]] = {
137
142
  str(uuid.uuid4()): messages for messages in messages_list
138
143
  }
139
- custom_id_2_response: dict[str, str | None] = {custom_id: None for custom_id in custom_id_2_message}
140
- exec_cnt = 0
144
+ # The response will be an empty string if the API produces an error.
145
+ custom_id_2_response: dict[str, str] = {custom_id: "" for custom_id in custom_id_2_message}
146
+ exec_cnt = 1
141
147
 
142
148
  while len(custom_id_2_message) > 0:
143
149
  if exec_cnt > MAX_NUM_TRIALS:
144
150
  break
145
151
  logger.info(f"Trial {exec_cnt}")
152
+ exec_cnt += 1
146
153
  batch_id = asyncio.run(self._post_batch_requests(custom_id_2_message, **kwargs))
147
154
 
148
155
  status, batch_response = asyncio.run(
@@ -152,13 +159,25 @@ class OpenAIChatBatchAPI(LanguageModel):
152
159
  error_message = f"Failed: {batch_response}"
153
160
  raise ValueError(error_message)
154
161
 
155
- file_response = asyncio.run(self._client.files.content(batch_response.output_file_id))
156
-
157
- data = []
158
- for line in file_response.text.strip().split("\n"):
159
- json_data = json.loads(line)
160
- data.append(json_data)
161
-
162
+ # Check whether error_file_id exists and, if it does, log the error details.
163
+ error_file_id = batch_response.error_file_id
164
+ # If any request fails, error_file_id is set.
165
+ if error_file_id is not None:
166
+ logger.warning("Request on some messages failed following reason.")
167
+ data: list[dict[str, Any]] = self._retrieve_file_content(error_file_id)
168
+ # [Error](https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8857)
169
+ # instance is embedded in response.
170
+ for data_i in data:
171
+ error = data_i["response"]
172
+ logger.warning(f"Failed: {error}")
173
+
174
+ output_file_id = batch_response.output_file_id
175
+ # If completion on all input fails, output_file_id is None.
176
+ if output_file_id is None:
177
+ logger.warning("All request failed. Continue...")
178
+ continue
179
+
180
+ data: list[dict[str, Any]] = self._retrieve_file_content(output_file_id)
162
181
  for data_i in data:
163
182
  if data_i["error"] is not None:
164
183
  continue
@@ -167,11 +186,10 @@ class OpenAIChatBatchAPI(LanguageModel):
167
186
  custom_id_2_message.pop(custom_id)
168
187
  custom_id_2_response[custom_id] = data_i["response"]["body"]["choices"][0]["message"]["content"]
169
188
 
170
- exec_cnt += 1
171
-
172
- if sum([response is not None for response in custom_id_2_response.values()]) < len(messages_list):
173
- error_message = "Exec failed"
174
- raise ValueError(error_message)
189
+ # The remaining elements are all those that failed to complete request.
190
+ if custom_id_2_message:
191
+ logger.warning("The following messages failed to complete request.")
192
+ logger.warning(pformat(list(custom_id_2_message.values())))
175
193
 
176
194
  return list(custom_id_2_response.values())
177
195
 
@@ -49,7 +49,7 @@ def summarize_evaluator_scores(
49
49
  if score is None or category_key is None:
50
50
  continue
51
51
  if category_key in task_inputs:
52
- category2valid_scores[task_inputs["category"]].append(score)
52
+ category2valid_scores[task_inputs[category_key]].append(score)
53
53
 
54
54
  category2mean_score: dict[str, float] = {}
55
55
  for category, valid_scores in category2valid_scores.items():
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from flexeval.core.utils.jinja2_utils import JINJA2_ENV
6
+
7
+ from .base import PromptTemplate
8
+
9
+
10
+ class Jinja2PromptTemplate(PromptTemplate):
11
+ """
12
+ Embed task inputs using Jinja2 template engine.
13
+
14
+ Args:
15
+ template: The Jinja2 template to use.
16
+ template_path: The path to a file with the Jinja2 template to use.
17
+ """
18
+
19
+ def __init__(self, template: str | None = None, template_path: str | None = None) -> None:
20
+ if template is None and template_path is None:
21
+ msg = "Either template or template_path must be provided"
22
+ raise ValueError(msg)
23
+ if template is not None and template_path is not None:
24
+ msg = "Only one of template or template_path can be provided"
25
+ raise ValueError(msg)
26
+
27
+ if template_path is not None:
28
+ with open(template_path) as f:
29
+ self.template = f.read()
30
+ else:
31
+ self.template = template
32
+
33
+ def embed_inputs(self, input_dict: dict[str, Any]) -> str:
34
+ return JINJA2_ENV.from_string(self.template).render(input_dict)
35
+
36
+ def __repr__(self) -> str:
37
+ return f"Jinja2PromptTemplate(template={self.template!r})"
@@ -120,6 +120,6 @@ class PairwiseJudgeRewardModel(RewardModel):
120
120
 
121
121
  for i in range(len(outputs)):
122
122
  outputs[i]["llm_outputs"] = [judge_outputs[i * 2], judge_outputs[i * 2 + 1]]
123
- outputs[i]["is_corrects"] = [chosen_is_betters[i * 2], chosen_is_betters[i * 2 + 1]]
123
+ outputs[i]["evaluation_results"] = [chosen_is_betters[i * 2], chosen_is_betters[i * 2 + 1]]
124
124
 
125
125
  return chosen_is_betters, outputs
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "flexeval"
3
- version = "0.7.2" # This will be automatically set from git tag by poetry-dynamic-versioning
3
+ version = "0.7.4" # This will be automatically set from git tag by poetry-dynamic-versioning
4
4
  description = ""
5
5
  authors = ["ryokan-ri <ryokan.ri@sbintuitions.co.jp>"]
6
6
  readme = "README.md"
@@ -1,25 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any
4
-
5
- from flexeval.core.utils.jinja2_utils import JINJA2_ENV
6
-
7
- from .base import PromptTemplate
8
-
9
-
10
- class Jinja2PromptTemplate(PromptTemplate):
11
- """
12
- Embed task inputs using Jinja2 template engine.
13
-
14
- Args:
15
- template: The Jinja2 template to use.
16
- """
17
-
18
- def __init__(self, template: str) -> None:
19
- self.template = template
20
-
21
- def embed_inputs(self, input_dict: dict[str, Any]) -> str:
22
- return JINJA2_ENV.from_string(self.template).render(input_dict)
23
-
24
- def __repr__(self) -> str:
25
- return f"Jinja2PromptTemplate(template={self.template!r})"
File without changes
File without changes
File without changes