llm-evaluation-toolkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_eval/__init__.py +30 -0
- llm_eval/datasets.py +120 -0
- llm_eval/evaluators/__init__.py +30 -0
- llm_eval/metrics/__init__.py +13 -0
- llm_eval/metrics/base.py +23 -0
- llm_eval/metrics/bleu.py +61 -0
- llm_eval/metrics/judge.py +123 -0
- llm_eval/metrics/rouge.py +55 -0
- llm_eval/metrics/semantic.py +78 -0
- llm_eval/providers/__init__.py +36 -0
- llm_eval/providers/anthropic_provider.py +37 -0
- llm_eval/providers/openai_provider.py +38 -0
- llm_eval/types.py +15 -0
- llm_evaluation_toolkit-0.1.0.dist-info/METADATA +279 -0
- llm_evaluation_toolkit-0.1.0.dist-info/RECORD +17 -0
- llm_evaluation_toolkit-0.1.0.dist-info/WHEEL +4 -0
- llm_evaluation_toolkit-0.1.0.dist-info/licenses/LICENSE +21 -0
llm_eval/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from llm_eval.evaluators import BaseEvaluator
|
|
2
|
+
from llm_eval.metrics import (
|
|
3
|
+
BaseMetric,
|
|
4
|
+
BLEUMetric,
|
|
5
|
+
ROUGEMetric,
|
|
6
|
+
SemanticSimilarityMetric,
|
|
7
|
+
LLMJudgeMetric,
|
|
8
|
+
)
|
|
9
|
+
from llm_eval.providers import BaseProvider, GenerationConfig
|
|
10
|
+
from llm_eval.providers.openai_provider import OpenAIProvider
|
|
11
|
+
from llm_eval.providers.anthropic_provider import AnthropicProvider
|
|
12
|
+
from llm_eval.types import EvalResult
|
|
13
|
+
from llm_eval.datasets import DatasetLoader, EvalDataset
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
__all__ = [
|
|
17
|
+
"BaseMetric",
|
|
18
|
+
"BaseEvaluator",
|
|
19
|
+
"EvalResult",
|
|
20
|
+
"BLEUMetric",
|
|
21
|
+
"ROUGEMetric",
|
|
22
|
+
"SemanticSimilarityMetric",
|
|
23
|
+
"LLMJudgeMetric",
|
|
24
|
+
"BaseProvider",
|
|
25
|
+
"GenerationConfig",
|
|
26
|
+
"OpenAIProvider",
|
|
27
|
+
"AnthropicProvider",
|
|
28
|
+
"DatasetLoader",
|
|
29
|
+
"EvalDataset",
|
|
30
|
+
]
|
llm_eval/datasets.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class EvalDataset:
    """Container for an evaluation dataset of aligned question/reference pairs.

    Attributes:
        name: Name of the dataset (used in ``repr`` and in subset names).
        questions: Input texts, one per sample.
        references: Reference texts; ``references[i]`` pairs with
            ``questions[i]``.
        metadata: Free-form extra information. ``None`` (the default) is
            replaced by a fresh empty dict in ``__post_init__``.

    Raises:
        ValueError: If ``questions`` and ``references`` differ in length.
    """

    name: str
    questions: list[str]
    references: list[str]
    # None rather than a literal {} so that instances never share one dict.
    metadata: dict | None = None

    def __post_init__(self):
        """Fill in the default metadata and validate that the lists align."""
        if self.metadata is None:
            self.metadata = {}
        if len(self.questions) != len(self.references):
            raise ValueError(
                f"questionsとreferencesの数が一致しません: "
                f"{len(self.questions)} != {len(self.references)}"
            )

    def __len__(self) -> int:
        """Return the number of samples (the length of ``questions``)."""
        return len(self.questions)

    def __repr__(self) -> str:
        """Return a short summary showing only the name and the size."""
        return f"EvalDataset(name={self.name}, size={len(self)})"

    def subset(self, n: int) -> "EvalDataset":
        """Return a new dataset holding only the first ``n`` samples.

        Args:
            n: Number of leading samples to keep. Values larger than the
                dataset size return all samples.

        Returns:
            A new ``EvalDataset`` named ``"<name>(subset=<n>)"`` with its own
            shallow copy of the metadata dict.

        Raises:
            ValueError: If ``n`` is negative. A negative slice bound would
                drop items from the end rather than take the first ``n``.
        """
        if n < 0:
            raise ValueError(f"n must be non-negative, got {n}")
        return EvalDataset(
            name=f"{self.name}(subset={n})",
            questions=self.questions[:n],
            references=self.references[:n],
            # Copy so that editing the subset's metadata cannot leak back
            # into the dataset it was derived from.
            metadata=dict(self.metadata),
        )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class DatasetLoader:
    """Build ``EvalDataset`` objects from HuggingFace benchmarks or raw lists."""

    @staticmethod
    def _import_datasets():
        """Import the ``datasets`` package lazily and return the module.

        Raises:
            ImportError: If ``datasets`` is not installed. The original
                import error is chained as the cause.
        """
        try:
            import datasets
        except ImportError as err:
            raise ImportError(
                "datasetsパッケージが必要です: pip install datasets"
            ) from err
        return datasets

    @staticmethod
    def _head(dataset, max_samples: int):
        """Return the first ``max_samples`` rows, or all rows if fewer exist."""
        return dataset.select(range(min(max_samples, len(dataset))))

    @classmethod
    def load_squad(cls, split: str = "validation", max_samples: int = 100) -> EvalDataset:
        """Load SQuAD (Stanford Question Answering Dataset).

        Args:
            split: Dataset split passed to ``load_dataset``.
            max_samples: Maximum number of leading rows to keep.

        Returns:
            An ``EvalDataset`` named ``"squad"`` whose ``questions`` are the
            question texts and whose ``references`` are the first answer text
            of each row.
        """
        ds = cls._import_datasets()
        dataset = ds.load_dataset("squad", split=split)

        questions = []
        references = []
        for item in cls._head(dataset, max_samples):
            answer_texts = item["answers"]["text"]
            questions.append(item["question"])
            # Keep questions and references aligned even if a row carries no
            # answer text, instead of failing with IndexError.
            references.append(answer_texts[0] if answer_texts else "")

        return EvalDataset(
            name="squad",
            questions=questions,
            references=references,
            metadata={"split": split, "max_samples": max_samples},
        )

    @classmethod
    def load_cnn_dailymail(
        cls,
        split: str = "validation",
        max_samples: int = 50,
        max_article_chars: int | None = 512,
    ) -> EvalDataset:
        """Load CNN/DailyMail (config ``"3.0.0"``) for summarization.

        Args:
            split: Dataset split passed to ``load_dataset``.
            max_samples: Maximum number of leading rows to keep.
            max_article_chars: Each article is cut to this many characters.
                ``None`` disables truncation. Defaults to 512.

        Returns:
            An ``EvalDataset`` named ``"cnn_dailymail"`` whose ``questions``
            are the (truncated) articles and whose ``references`` are the
            ``highlights`` summaries.
        """
        ds = cls._import_datasets()
        dataset = ds.load_dataset("cnn_dailymail", "3.0.0", split=split)

        questions = []
        references = []
        for item in cls._head(dataset, max_samples):
            questions.append(item["article"][:max_article_chars])
            references.append(item["highlights"])

        return EvalDataset(
            name="cnn_dailymail",
            questions=questions,
            references=references,
            metadata={"split": split, "max_samples": max_samples},
        )

    @classmethod
    def from_dict(
        cls,
        name: str,
        questions: list[str],
        references: list[str],
        metadata: dict | None = None,
    ) -> EvalDataset:
        """Create an ``EvalDataset`` from in-memory lists.

        No download or API key is involved, which makes this convenient for
        tests.

        Args:
            name: Name of the dataset.
            questions: Input texts.
            references: Reference texts, aligned with ``questions``.
            metadata: Optional extra information stored on the dataset.

        Returns:
            The constructed ``EvalDataset``.
        """
        return EvalDataset(
            name=name,
            questions=questions,
            references=references,
            metadata=metadata,
        )
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC
|
|
4
|
+
|
|
5
|
+
from llm_eval.metrics import BaseMetric
|
|
6
|
+
from llm_eval.types import EvalResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseEvaluator(ABC):
    """Run several metrics over the same predictions and references.

    The class declares no abstract methods, so it can be instantiated
    directly as well as subclassed.
    """

    def __init__(self, metrics: list[BaseMetric]):
        """Store the metrics to run.

        Args:
            metrics: Metric objects exposing ``name`` and
                ``compute(predictions, references)``. Any iterable is
                accepted.
        """
        # Copy so that add_metric() never mutates the object the caller
        # passed in (and so that tuples or other iterables work too).
        self.metrics = list(metrics)

    def evaluate(
        self,
        predictions: list[str],
        references: list[str],
    ) -> dict[str, EvalResult]:
        """Compute every registered metric on the given data.

        Args:
            predictions: Model outputs.
            references: Reference texts aligned with ``predictions``.

        Returns:
            A dict mapping each metric's ``name`` to the value returned by its
            ``compute``. If two metrics share a name, the later one wins.
        """
        return {
            metric.name: metric.compute(predictions, references)
            for metric in self.metrics
        }

    def add_metric(self, metric: BaseMetric) -> None:
        """Append ``metric`` to the metrics run by ``evaluate``."""
        self.metrics.append(metric)

    def __repr__(self) -> str:
        """Return the class name together with the registered metric names."""
        metric_names = [m.name for m in self.metrics]
        return f"{self.__class__.__name__}(metrics={metric_names})"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from llm_eval.metrics.base import BaseMetric
|
|
2
|
+
from llm_eval.metrics.bleu import BLEUMetric
|
|
3
|
+
from llm_eval.metrics.rouge import ROUGEMetric
|
|
4
|
+
from llm_eval.metrics.semantic import SemanticSimilarityMetric
|
|
5
|
+
from llm_eval.metrics.judge import LLMJudgeMetric
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"BaseMetric",
|
|
9
|
+
"BLEUMetric",
|
|
10
|
+
"ROUGEMetric",
|
|
11
|
+
"SemanticSimilarityMetric",
|
|
12
|
+
"LLMJudgeMetric",
|
|
13
|
+
]
|
llm_eval/metrics/base.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
from llm_eval.types import EvalResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseMetric(ABC):
    """Abstract base class shared by every evaluation metric."""

    def __init__(self, name: str):
        """Remember the metric's name.

        Args:
            name: Identifier stored on ``self.name``.
        """
        self.name = name

    @abstractmethod
    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Score ``predictions`` against ``references``.

        Subclasses must override this method.
        """
        ...

    def __repr__(self) -> str:
        """Return ``<ClassName>(name=<name>)``."""
        class_name = type(self).__name__
        return f"{class_name}(name={self.name})"
|
llm_eval/metrics/bleu.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import nltk
|
|
4
|
+
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
|
|
5
|
+
|
|
6
|
+
from llm_eval.metrics.base import BaseMetric
|
|
7
|
+
from llm_eval.types import EvalResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BLEUMetric(BaseMetric):
    """Corpus-level BLEU (Bilingual Evaluation Understudy) score.

    Measures n-gram overlap between model outputs and reference texts.
    Texts are lower-cased and split on whitespace before scoring, and
    ``SmoothingFunction().method1`` is applied.
    """

    def __init__(self, max_n: int = 4):
        """Create the metric.

        Args:
            max_n: Highest n-gram order to score. Orders ``1..max_n`` are
                weighted uniformly. The default of 4 matches NLTK's default
                weights.

        Raises:
            ValueError: If ``max_n`` is smaller than 1.
        """
        if max_n < 1:
            raise ValueError(f"max_n must be >= 1, got {max_n}")
        super().__init__(name="bleu")
        self.max_n = max_n
        self._ensure_nltk_data()

    def _ensure_nltk_data(self) -> None:
        """Download the NLTK ``punkt`` / ``punkt_tab`` resources if missing.

        NOTE(review): ``compute`` tokenizes with ``str.split``, so these
        resources do not appear to be used by this class - confirm whether
        the download is still required.
        """
        for resource in ("punkt", "punkt_tab"):
            try:
                nltk.data.find(f"tokenizers/{resource}")
            except LookupError:
                nltk.download(resource, quiet=True)

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Compute corpus BLEU for ``predictions`` against ``references``.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``score`` is the corpus BLEU value and
            whose ``details`` hold ``num_samples`` and ``max_n``. Empty input
            yields a score of 0.0.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        details = {
            "num_samples": len(predictions),
            "max_n": self.max_n,
        }

        # Nothing to score: skip NLTK entirely rather than relying on how
        # corpus_bleu treats an empty corpus.
        if len(predictions) == 0:
            return EvalResult(metric_name=self.name, score=0.0, details=details)

        tokenized_predictions = [p.lower().split() for p in predictions]
        # corpus_bleu expects a list of reference lists per hypothesis.
        tokenized_references = [[r.lower().split()] for r in references]

        # Uniform weights over 1..max_n so that max_n actually takes effect.
        weights = tuple(1.0 / self.max_n for _ in range(self.max_n))
        score = corpus_bleu(
            tokenized_references,
            tokenized_predictions,
            weights=weights,
            smoothing_function=SmoothingFunction().method1,
        )

        return EvalResult(
            metric_name=self.name,
            score=float(score),
            details=details,
        )
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
from llm_eval.metrics.base import BaseMetric
|
|
7
|
+
from llm_eval.types import EvalResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
JUDGE_PROMPT_TEMPLATE = """You are an expert evaluator for AI-generated text.

Evaluate the following response based on these criteria:
- Accuracy: Is the information correct?
- Clarity: Is the response clear and easy to understand?
- Completeness: Does it fully address the question?
- Relevance: Is it relevant to the question asked?

Question: {question}
Response: {response}

Provide your evaluation in the following format:
Score: [0-10]
Reasoning: [brief explanation]

Be strict but fair. A score of 10 means perfect."""


class LLMJudgeMetric(BaseMetric):
    """Metric in which an LLM grades each response (LLM-as-a-judge).

    No reference answer is needed: each response is judged against its
    question. The judge's 0-10 score is normalized to the range 0.0-1.0.
    """

    # Accepts "Score: 8", "Score: 8.5", "Score: [8]", "**Score:** 8" and
    # "**Score**: 8". The prompt shows the format as "Score: [0-10]", so
    # judges sometimes echo the brackets or add markdown emphasis.
    _SCORE_PATTERN = re.compile(
        r"Score\**\s*:\s*\**\s*\[?\s*(\d+(?:\.\d+)?)",
        re.IGNORECASE,
    )

    def __init__(
        self,
        judge_model: str = "gpt-4o-mini",
        api_key: str | None = None,
        prompt_template: str | None = None,
    ):
        """Create the metric.

        Args:
            judge_model: Model name sent to the chat completions endpoint.
            api_key: OpenAI API key. Falls back to the ``OPENAI_API_KEY``
                environment variable when omitted.
            prompt_template: Template formatted with ``question`` and
                ``response``. Defaults to ``JUDGE_PROMPT_TEMPLATE``.
        """
        super().__init__(name="llm_judge")
        self.judge_model = judge_model
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.prompt_template = prompt_template or JUDGE_PROMPT_TEMPLATE
        self._client = None

    def _get_client(self):
        """Create the OpenAI client on first use and cache it.

        Raises:
            ImportError: If the ``openai`` package is not installed. The
                original import error is chained as the cause.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as err:
                raise ImportError(
                    "openaiパッケージが必要です: "
                    "pip install llm-evaluation-toolkit[openai]"
                ) from err
            self._client = OpenAI(api_key=self.api_key)
        return self._client

    def _parse_score(self, judge_response: str) -> float:
        """Extract the judge's score and normalize it to 0.0-1.0.

        Args:
            judge_response: Raw text returned by the judge model.

        Returns:
            The first ``Score: <number>`` value divided by 10 and clamped to
            ``[0.0, 1.0]``, or 0.5 when no score can be found.
        """
        match = self._SCORE_PATTERN.search(judge_response)
        if match is None:
            # Neutral fallback when the judge did not follow the format.
            return 0.5
        raw_score = float(match.group(1))
        return min(max(raw_score / 10.0, 0.0), 1.0)

    def _judge_single(self, question: str, response: str) -> tuple[float, str]:
        """Grade one response.

        Args:
            question: The question the response answers.
            response: The text to grade.

        Returns:
            A ``(normalized_score, raw_judge_text)`` tuple.
        """
        client = self._get_client()
        prompt = self.prompt_template.format(
            question=question,
            response=response,
        )
        result = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.0,
        )
        # The message content may be None; treat that as an empty reply.
        judge_response = result.choices[0].message.content or ""
        return self._parse_score(judge_response), judge_response

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Grade every prediction and average the scores.

        Args:
            predictions: LLM outputs to grade.
            references: The matching *questions*. This metric receives
                questions here, not reference answers.

        Returns:
            An ``EvalResult`` whose ``score`` is the mean normalized score
            (0.0 for empty input) and whose ``details`` hold ``num_samples``,
            ``judge_model``, ``individual_scores`` and ``judge_responses``.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        scores = []
        judge_responses = []

        for question, response in zip(references, predictions):
            score, judge_response = self._judge_single(question, response)
            scores.append(score)
            judge_responses.append(judge_response)
            print(f"採点完了: {score:.2f} / 1.0")

        # Avoid ZeroDivisionError when there is nothing to grade.
        avg_score = sum(scores) / len(scores) if scores else 0.0

        return EvalResult(
            metric_name=self.name,
            score=avg_score,
            details={
                "num_samples": len(predictions),
                "judge_model": self.judge_model,
                "individual_scores": scores,
                "judge_responses": judge_responses,
            },
        )
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from rouge_score import rouge_scorer
|
|
4
|
+
|
|
5
|
+
from llm_eval.metrics.base import BaseMetric
|
|
6
|
+
from llm_eval.types import EvalResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ROUGEMetric(BaseMetric):
    """ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score.

    Commonly used for summarization:

    - ROUGE-1: unigram overlap
    - ROUGE-2: bigram overlap
    - ROUGE-L: overlap based on the longest common subsequence

    The F-measure of each configured type is averaged over all samples.
    """

    def __init__(self, rouge_types: list[str] | None = None):
        """Create the metric and its scorer.

        Args:
            rouge_types: ROUGE variants to compute. Defaults to
                ``["rouge1", "rouge2", "rougeL"]`` when omitted or empty.
        """
        super().__init__(name="rouge")
        # Copy so that later edits to the caller's list cannot change the
        # metric's configuration.
        self.rouge_types = (
            list(rouge_types) if rouge_types else ["rouge1", "rouge2", "rougeL"]
        )
        self._scorer = rouge_scorer.RougeScorer(
            self.rouge_types,
            use_stemmer=True,
        )

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Average the per-sample ROUGE F-measures.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``details`` map each ROUGE type to its
            mean F-measure and whose ``score`` is the ``rougeL`` mean when
            available, otherwise the mean of the first configured type.
            Empty input yields 0.0 for every type.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        aggregated: dict[str, list[float]] = {t: [] for t in self.rouge_types}

        for pred, ref in zip(predictions, references):
            # RougeScorer.score takes the reference (target) first.
            scores = self._scorer.score(ref, pred)
            for rouge_type in self.rouge_types:
                aggregated[rouge_type].append(scores[rouge_type].fmeasure)

        # Guard the mean so that empty input does not raise ZeroDivisionError.
        details = {
            rouge_type: (sum(values) / len(values) if values else 0.0)
            for rouge_type, values in aggregated.items()
        }

        if "rougeL" in details:
            primary_score = details["rougeL"]
        else:
            primary_score = details[self.rouge_types[0]]

        return EvalResult(
            metric_name=self.name,
            score=primary_score,
            details=details,
        )
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from llm_eval.metrics.base import BaseMetric
|
|
6
|
+
from llm_eval.types import EvalResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SemanticSimilarityMetric(BaseMetric):
    """Semantic closeness measured as cosine similarity of sentence embeddings.

    Unlike BLEU or ROUGE, this can reward paraphrases and synonyms. Cosine
    similarity is not clamped here, so a score can in principle be negative;
    1.0 means the embeddings point in the same direction.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Create the metric without loading the model yet.

        Args:
            model_name: Name passed to ``SentenceTransformer`` on first use.
        """
        super().__init__(name="semantic_similarity")
        self.model_name = model_name
        self._model = None

    def _load_model(self):
        """Load the embedding model on first use and cache it.

        Raises:
            ImportError: If ``sentence-transformers`` is not installed. The
                original import error is chained as the cause.
        """
        if self._model is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as err:
                raise ImportError(
                    "sentence-transformersが必要です: "
                    "pip install llm-evaluation-toolkit[semantic]"
                ) from err
            print(f"モデルを読み込み中: {self.model_name}")
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of two vectors.

        Returns 0.0 when either vector has zero norm, which avoids a
        division by zero.
        """
        dot_product = np.dot(a, b)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(dot_product / (norm_a * norm_b))

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Average the pairwise cosine similarity of the embeddings.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``score`` is the mean similarity and whose
            ``details`` hold ``num_samples``, ``model``, ``individual_scores``,
            ``min_score`` and ``max_score``. Empty input yields 0.0 for the
            score, minimum and maximum, and the model is not loaded.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        # np.mean of an empty list is NaN and np.min/np.max raise, so handle
        # the empty case before doing any work.
        if len(predictions) == 0:
            return EvalResult(
                metric_name=self.name,
                score=0.0,
                details={
                    "num_samples": 0,
                    "model": self.model_name,
                    "individual_scores": [],
                    "min_score": 0.0,
                    "max_score": 0.0,
                },
            )

        model = self._load_model()

        pred_embeddings = model.encode(predictions, convert_to_numpy=True)
        ref_embeddings = model.encode(references, convert_to_numpy=True)

        similarities = [
            self._cosine_similarity(pred_vec, ref_vec)
            for pred_vec, ref_vec in zip(pred_embeddings, ref_embeddings)
        ]

        avg_score = float(np.mean(similarities))

        return EvalResult(
            metric_name=self.name,
            score=avg_score,
            details={
                "num_samples": len(predictions),
                "model": self.model_name,
                "individual_scores": similarities,
                "min_score": float(np.min(similarities)),
                "max_score": float(np.max(similarities)),
            },
        )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class GenerationConfig:
    """Settings that control text generation."""

    # Upper bound on the number of generated tokens.
    max_tokens: int = 512
    # Sampling temperature.
    temperature: float = 0.0
    # System prompt supplied to the provider with every request.
    system_prompt: str = "You are a helpful assistant."
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseProvider(ABC):
    """Abstract base class shared by every LLM provider."""

    def __init__(self, model: str, config: GenerationConfig | None = None):
        """Store the model name and the generation settings.

        Args:
            model: Model identifier used by the concrete provider.
            config: Generation settings. A default ``GenerationConfig()`` is
                created when this is omitted.
        """
        self.config = config or GenerationConfig()
        self.model = model

    @abstractmethod
    def generate(self, prompt: str) -> str:
        """Generate a completion for a single prompt.

        Subclasses must override this method.
        """
        ...

    def generate_batch(self, prompts: list[str]) -> list[str]:
        """Call ``generate`` on each prompt in order and collect the results."""
        outputs = []
        for prompt in prompts:
            outputs.append(self.generate(prompt))
        return outputs

    def __repr__(self) -> str:
        """Return ``<ClassName>(model=<model>)``."""
        class_name = type(self).__name__
        return f"{class_name}(model={self.model})"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
from llm_eval.providers.openai_provider import OpenAIProvider # noqa: E402
|
|
34
|
+
from llm_eval.providers.anthropic_provider import AnthropicProvider # noqa: E402
|
|
35
|
+
|
|
36
|
+
__all__ = ["BaseProvider", "GenerationConfig", "OpenAIProvider", "AnthropicProvider"]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from llm_eval.providers import BaseProvider, GenerationConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AnthropicProvider(BaseProvider):
    """Provider backed by the Anthropic Messages API."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        config: GenerationConfig | None = None,
        api_key: str | None = None,
    ):
        """Create the provider and its API client.

        Args:
            model: Anthropic model identifier.
            config: Generation settings; defaults are handled by the base
                class.
            api_key: Anthropic API key. Falls back to the
                ``ANTHROPIC_API_KEY`` environment variable when omitted.

        Raises:
            ImportError: If the ``anthropic`` package is not installed. The
                original import error is chained as the cause.
        """
        super().__init__(model=model, config=config)
        try:
            import anthropic
        except ImportError as err:
            raise ImportError(
                "anthropicパッケージが必要です: pip install llm-evaluation-toolkit[anthropic]"
            ) from err
        self._client = anthropic.Anthropic(
            api_key=api_key or os.environ.get("ANTHROPIC_API_KEY")
        )

    def generate(self, prompt: str) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: User message sent to the model.

        Returns:
            The concatenated text of all text blocks in the response, or an
            empty string when the response holds no text block.
        """
        response = self._client.messages.create(
            model=self.model,
            max_tokens=self.config.max_tokens,
            # Honor the configured temperature, as OpenAIProvider does.
            # NOTE(review): the Anthropic API documents a narrower
            # temperature range than OpenAI - confirm callers stay within it.
            temperature=self.config.temperature,
            system=self.config.system_prompt,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        # The content list may be empty or start with a non-text block, so
        # collect the text blocks instead of indexing content[0].
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from llm_eval.providers import BaseProvider, GenerationConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OpenAIProvider(BaseProvider):
    """Provider backed by the OpenAI chat completions API."""

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        config: GenerationConfig | None = None,
        api_key: str | None = None,
    ):
        """Create the provider and its API client.

        Args:
            model: OpenAI model identifier.
            config: Generation settings; defaults are handled by the base
                class.
            api_key: OpenAI API key. Falls back to the ``OPENAI_API_KEY``
                environment variable when omitted.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        super().__init__(model=model, config=config)
        try:
            from openai import OpenAI
        except ImportError:
            raise ImportError(
                "openaiパッケージが必要です: pip install llm-evaluation-toolkit[openai]"
            )
        resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
        self._client = OpenAI(api_key=resolved_key)

    def generate(self, prompt: str) -> str:
        """Generate a completion for ``prompt``.

        The configured system prompt is sent as the system message.

        Returns:
            The content of the first choice, or an empty string when that
            content is missing.
        """
        settings = self.config
        chat_messages = [
            {"role": "system", "content": settings.system_prompt},
            {"role": "user", "content": prompt},
        ]
        completion = self._client.chat.completions.create(
            model=self.model,
            messages=chat_messages,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )
        text = completion.choices[0].message.content
        return text or ""
|
llm_eval/types.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class EvalResult:
    """Data class holding the outcome of one metric computation."""

    # Name of the metric that produced this result.
    metric_name: str
    # Headline score of the metric.
    score: float
    # Metric-specific extra values; every instance gets its own dict.
    details: dict[str, Any] = field(default_factory=dict)
    # Free-form extra information; every instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

    def __repr__(self) -> str:
        """Return a short summary with the score shown to four decimals."""
        return f"EvalResult(metric={self.metric_name}, score={self.score:.4f})"
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-evaluation-toolkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight toolkit for evaluating LLM outputs
|
|
5
|
+
Project-URL: Homepage, https://github.com/swoswoyuu1156/llm-evaluation-toolkit
|
|
6
|
+
Project-URL: Repository, https://github.com/swoswoyuu1156/llm-evaluation-toolkit
|
|
7
|
+
Project-URL: Issues, https://github.com/swoswoyuu1156/llm-evaluation-toolkit/issues
|
|
8
|
+
Author-email: swoswoyuu1156 <174082729+swoswoyuu1156@users.noreply.github.com>
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 swoswoyuu1156
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: ai,evaluation,llm,machine-learning,nlp
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
41
|
+
Requires-Python: >=3.9
|
|
42
|
+
Requires-Dist: datasets>=2.14
|
|
43
|
+
Requires-Dist: nltk>=3.8
|
|
44
|
+
Requires-Dist: numpy>=1.24
|
|
45
|
+
Requires-Dist: python-dotenv>=1.0
|
|
46
|
+
Requires-Dist: requests>=2.28
|
|
47
|
+
Requires-Dist: rouge-score>=0.1.2
|
|
48
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
49
|
+
Provides-Extra: anthropic
|
|
50
|
+
Requires-Dist: anthropic>=0.20; extra == 'anthropic'
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
55
|
+
Provides-Extra: openai
|
|
56
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
57
|
+
Provides-Extra: semantic
|
|
58
|
+
Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
|
|
59
|
+
Description-Content-Type: text/markdown
|
|
60
|
+
|
|
61
|
+
# llm-evaluation-toolkit
|
|
62
|
+
|
|
63
|
+
[CI](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/actions/workflows/ci.yml)
|
|
64
|
+
[Python](https://www.python.org)
|
|
65
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
66
|
+
[PyPI version](https://badge.fury.io/py/llm-evaluation-toolkit)
|
|
67
|
+
|
|
68
|
+
LLMの出力を評価するための軽量Pythonライブラリです。BLEU・ROUGE・意味的類似度・LLM-as-a-Judgeの4種類の評価指標を、統一されたAPIで利用できます。
|
|
69
|
+
|
|
70
|
+
*A lightweight Python toolkit for evaluating LLM outputs. Supports BLEU, ROUGE, semantic similarity, and LLM-as-a-Judge — all with a unified API.*
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## なぜ llm-evaluation-toolkit なのか? / Why llm-evaluation-toolkit?
|
|
75
|
+
|
|
76
|
+
既存の評価ライブラリは研究用途に特化していたり、セットアップが複雑すぎる問題がありました。このライブラリは、実際の開発ワークフローでLLMの出力を素早く評価したい開発者のために設計されています。
|
|
77
|
+
|
|
78
|
+
*Existing evaluation libraries are either too research-focused or require complex setup. This toolkit is designed for developers who need to evaluate LLM outputs quickly in real workflows.*
|
|
79
|
+
|
|
80
|
+
- **統一API** — すべての評価指標が同じインターフェースを持つ / *Unified API — all metrics share the same interface*
|
|
81
|
+
- **複数プロバイダ対応** — OpenAI・Anthropicをすぐに利用可能 / *Multiple providers — OpenAI and Anthropic out of the box*
|
|
82
|
+
- **正解テキスト不要** — LLM-as-a-Judgeは参照テキストなしで評価可能 / *No reference needed — LLM-as-a-Judge works without ground truth*
|
|
83
|
+
- **軽量設計** — 必要な機能だけインストールできる / *Lightweight — install only what you need*
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## インストール / Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# 基本インストール / Base installation
|
|
91
|
+
pip install llm-evaluation-toolkit
|
|
92
|
+
|
|
93
|
+
# OpenAI対応 / With OpenAI support
|
|
94
|
+
pip install llm-evaluation-toolkit[openai]
|
|
95
|
+
|
|
96
|
+
# Anthropic対応 / With Anthropic support
|
|
97
|
+
pip install llm-evaluation-toolkit[anthropic]
|
|
98
|
+
|
|
99
|
+
# 意味的類似度対応 / With semantic similarity support
|
|
100
|
+
pip install llm-evaluation-toolkit[semantic]
|
|
101
|
+
|
|
102
|
+
# 全機能 / All features
|
|
103
|
+
pip install llm-evaluation-toolkit[openai,anthropic,semantic]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## クイックスタート / Quick Start
|
|
109
|
+
|
|
110
|
+
### 基本的な評価 / Basic evaluation
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from llm_eval import BLEUMetric, ROUGEMetric, SemanticSimilarityMetric
|
|
114
|
+
|
|
115
|
+
predictions = [
|
|
116
|
+
"The cat is on the mat",
|
|
117
|
+
"A dog was running in the park",
|
|
118
|
+
]
|
|
119
|
+
references = [
|
|
120
|
+
"A cat is sitting on a mat",
|
|
121
|
+
"The dog ran through the park",
|
|
122
|
+
]
|
|
123
|
+
|
|
124
|
+
bleu = BLEUMetric()
|
|
125
|
+
rouge = ROUGEMetric()
|
|
126
|
+
semantic = SemanticSimilarityMetric()
|
|
127
|
+
|
|
128
|
+
print(bleu.compute(predictions, references))
|
|
129
|
+
# EvalResult(metric=bleu, score=0.4231)
|
|
130
|
+
|
|
131
|
+
print(rouge.compute(predictions, references))
|
|
132
|
+
# EvalResult(metric=rouge, score=0.6842)
|
|
133
|
+
|
|
134
|
+
print(semantic.compute(predictions, references))
|
|
135
|
+
# EvalResult(metric=semantic_similarity, score=0.8923)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### 正解テキストなしで評価(LLM-as-a-Judge) / Evaluate without reference texts
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from llm_eval import LLMJudgeMetric
|
|
142
|
+
|
|
143
|
+
# 正解テキストなしで出力品質を評価できる
|
|
144
|
+
# Evaluate output quality without any reference text
|
|
145
|
+
judge = LLMJudgeMetric(judge_model="gpt-4o-mini")
|
|
146
|
+
|
|
147
|
+
questions = ["日本の首都はどこですか?"]
|
|
148
|
+
answers = ["日本の首都は東京です。政治・経済・文化の中心地として機能しています。"]
|
|
149
|
+
|
|
150
|
+
result = judge.compute(answers, questions)
|
|
151
|
+
print(result)
|
|
152
|
+
# EvalResult(metric=llm_judge, score=0.9000)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 複数の評価指標を一括実行 / Run multiple metrics at once
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from llm_eval import BaseEvaluator, BLEUMetric, ROUGEMetric, SemanticSimilarityMetric
|
|
159
|
+
|
|
160
|
+
evaluator = BaseEvaluator(metrics=[
|
|
161
|
+
BLEUMetric(),
|
|
162
|
+
ROUGEMetric(),
|
|
163
|
+
SemanticSimilarityMetric(),
|
|
164
|
+
])
|
|
165
|
+
|
|
166
|
+
results = evaluator.evaluate(predictions, references)
|
|
167
|
+
for metric_name, result in results.items():
|
|
168
|
+
print(f"{metric_name}: {result.score:.4f}")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### LLMプロバイダとの連携 / Use with LLM providers
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from llm_eval import OpenAIProvider, GenerationConfig, BLEUMetric
|
|
175
|
+
|
|
176
|
+
# LLMで回答を生成してそのまま評価する
|
|
177
|
+
# Generate predictions with an LLM and evaluate them directly
|
|
178
|
+
provider = OpenAIProvider(
|
|
179
|
+
model="gpt-4o-mini",
|
|
180
|
+
config=GenerationConfig(temperature=0.0, max_tokens=256),
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
questions = ["機械学習とは何ですか?", "ニューラルネットワークを説明してください。"]
|
|
184
|
+
predictions = provider.generate_batch(questions)
|
|
185
|
+
|
|
186
|
+
references = [
|
|
187
|
+
"機械学習はデータから学習するAIの一分野です。",
|
|
188
|
+
"ニューラルネットワークは人間の脳を模倣した計算システムです。",
|
|
189
|
+
]
|
|
190
|
+
|
|
191
|
+
result = BLEUMetric().compute(predictions, references)
|
|
192
|
+
print(result)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### ベンチマークデータセットの利用 / Load benchmark datasets
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from llm_eval import DatasetLoader
|
|
199
|
+
|
|
200
|
+
# 組み込みベンチマークを使う / Use built-in benchmarks
|
|
201
|
+
squad = DatasetLoader.load_squad(max_samples=50)
|
|
202
|
+
print(squad)
|
|
203
|
+
# EvalDataset(name=squad, size=50)
|
|
204
|
+
|
|
205
|
+
# 自前データを使う / Use your own data
|
|
206
|
+
dataset = DatasetLoader.from_dict(
|
|
207
|
+
name="my_dataset",
|
|
208
|
+
questions=["質問1", "質問2"],
|
|
209
|
+
references=["回答1", "回答2"],
|
|
210
|
+
)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## 対応評価指標 / Supported Metrics
|
|
216
|
+
|
|
217
|
+
| 指標 / Metric | 正解テキスト / Reference | 適したタスク / Best for |
|
|
218
|
+
|--------------|------------------------|------------------------|
|
|
219
|
+
| BLEU | 必要 / Yes | 翻訳・固定フォーマット生成 / Translation, fixed-format generation |
|
|
220
|
+
| ROUGE | 必要 / Yes | 要約 / Summarization |
|
|
221
|
+
| Semantic Similarity | 必要 / Yes | 言い換えを含むタスク / Paraphrase-heavy tasks |
|
|
222
|
+
| LLM-as-a-Judge | 不要 / No | 自由記述生成 / Open-ended generation |
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## 対応プロバイダ / Supported Providers
|
|
227
|
+
|
|
228
|
+
| プロバイダ / Provider | インストール / Install | モデル例 / Models |
|
|
229
|
+
|----------------------|----------------------|------------------|
|
|
230
|
+
| OpenAI | `pip install llm-evaluation-toolkit[openai]` | gpt-4o, gpt-4o-mini |
|
|
231
|
+
| Anthropic | `pip install llm-evaluation-toolkit[anthropic]` | claude-opus-4-6, claude-haiku-4-5 |
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## 環境変数 / Environment Variables
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
export OPENAI_API_KEY=your_openai_api_key
|
|
239
|
+
export ANTHROPIC_API_KEY=your_anthropic_api_key
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## 開発環境のセットアップ / Development Setup
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
git clone https://github.com/swoswoyuu1156/llm-evaluation-toolkit.git
|
|
248
|
+
cd llm-evaluation-toolkit
|
|
249
|
+
python -m venv .venv
|
|
250
|
+
|
|
251
|
+
# Mac/Linux
|
|
252
|
+
source .venv/bin/activate
|
|
253
|
+
# Windows
|
|
254
|
+
.venv\Scripts\activate
|
|
255
|
+
|
|
256
|
+
pip install -e ".[dev]"
|
|
257
|
+
|
|
258
|
+
# テスト実行 / Run tests
|
|
259
|
+
pytest tests/ -v --cov=src/llm_eval
|
|
260
|
+
|
|
261
|
+
# リント実行 / Run linter
|
|
262
|
+
ruff check src/ tests/
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## コントリビューション / Contributing
|
|
268
|
+
|
|
269
|
+
コントリビューションを歓迎します!まず [CONTRIBUTING.md](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/blob/HEAD/CONTRIBUTING.md) をご確認ください。
|
|
270
|
+
|
|
271
|
+
*Contributions are welcome! Please read [CONTRIBUTING.md](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/blob/HEAD/CONTRIBUTING.md) first.*
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## ライセンス / License
|
|
276
|
+
|
|
277
|
+
MIT License — 詳細は [LICENSE](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/blob/HEAD/LICENSE) をご確認ください。
|
|
278
|
+
|
|
279
|
+
*MIT License — see [LICENSE](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/blob/HEAD/LICENSE) for details.*
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
llm_eval/__init__.py,sha256=WZGF2gsReDmuW8tTozYzRP2CsW32wxVSoh9FXGpEHrE,780
|
|
2
|
+
llm_eval/datasets.py,sha256=stNm2vvSZf9y5sz6qNYxGXowAe458levjSy35Qg4gOc,3528
|
|
3
|
+
llm_eval/types.py,sha256=J1nFoWtPsi1fwRmxF_UpIKHzXOkYcNlW1-mybLa30FI,418
|
|
4
|
+
llm_eval/evaluators/__init__.py,sha256=P6Ayk7a3DkB8YPSBm1v2IvWJQszqNwIVbxJ4xxSH0Eg,841
|
|
5
|
+
llm_eval/metrics/__init__.py,sha256=k57U6Y1JBDf6-uS5tgTL6W0ggEAotMh4VbYumJ6-PyM,373
|
|
6
|
+
llm_eval/metrics/base.py,sha256=Z4bFvy9n6LpG78-UXUaL4SpO8x73q95TotLecEIVuKs,491
|
|
7
|
+
llm_eval/metrics/bleu.py,sha256=X7YqDYZcS2lHESuIhb8FX-r9K3hHZ3zDQNn-uJQvIDs,1826
|
|
8
|
+
llm_eval/metrics/judge.py,sha256=edwJxALUNtwu0LABS3F-QeM0MQNpUF-Y8phLnEFKPm0,4115
|
|
9
|
+
llm_eval/metrics/rouge.py,sha256=CEt-bAvKQTAXdxlU1z76bxt9wePlKDSluyaUZ05EQ-o,1762
|
|
10
|
+
llm_eval/metrics/semantic.py,sha256=FtqrWYQkr_juOezlOs-p4aYvaKuf6Z4CHZkZt5PDxjI,2729
|
|
11
|
+
llm_eval/providers/__init__.py,sha256=O_ommltS7L6GyI__ff8DDDVCXhWY5kxDp2eI4aAu42o,1067
|
|
12
|
+
llm_eval/providers/anthropic_provider.py,sha256=Qhh0bgmzr3-kTG0PVr4POuBWTfGbF17ch_oloDh4esQ,1115
|
|
13
|
+
llm_eval/providers/openai_provider.py,sha256=XYhUed104Vfvrlo1vHqXJbi_pZG6buzXqdN3bqgpQuc,1184
|
|
14
|
+
llm_evaluation_toolkit-0.1.0.dist-info/METADATA,sha256=06QyqOU_lJsq-9EJAQESmcRCjjO2GaMiiDD6AIFUHbg,10275
|
|
15
|
+
llm_evaluation_toolkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
16
|
+
llm_evaluation_toolkit-0.1.0.dist-info/licenses/LICENSE,sha256=o38RZRbQzW-aKIgfjXQ3tNCZoSDUNvY3-OC15qE6baE,1070
|
|
17
|
+
llm_evaluation_toolkit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 swoswoyuu1156
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|