llm-evaluation-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_eval/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ from llm_eval.evaluators import BaseEvaluator
2
+ from llm_eval.metrics import (
3
+ BaseMetric,
4
+ BLEUMetric,
5
+ ROUGEMetric,
6
+ SemanticSimilarityMetric,
7
+ LLMJudgeMetric,
8
+ )
9
+ from llm_eval.providers import BaseProvider, GenerationConfig
10
+ from llm_eval.providers.openai_provider import OpenAIProvider
11
+ from llm_eval.providers.anthropic_provider import AnthropicProvider
12
+ from llm_eval.types import EvalResult
13
+ from llm_eval.datasets import DatasetLoader, EvalDataset
14
+
15
+ __version__ = "0.1.0"
16
+ __all__ = [
17
+ "BaseMetric",
18
+ "BaseEvaluator",
19
+ "EvalResult",
20
+ "BLEUMetric",
21
+ "ROUGEMetric",
22
+ "SemanticSimilarityMetric",
23
+ "LLMJudgeMetric",
24
+ "BaseProvider",
25
+ "GenerationConfig",
26
+ "OpenAIProvider",
27
+ "AnthropicProvider",
28
+ "DatasetLoader",
29
+ "EvalDataset",
30
+ ]
llm_eval/datasets.py ADDED
@@ -0,0 +1,120 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass
class EvalDataset:
    """Container for an evaluation dataset of paired questions and references.

    ``questions[i]`` and ``references[i]`` belong together, so both lists must
    have the same length.

    Raises:
        ValueError: If ``questions`` and ``references`` differ in length.
    """

    # Dataset name; used in ``repr`` and as the prefix of subset names.
    name: str
    # Input texts.
    questions: list[str]
    # Expected outputs, index-aligned with ``questions``.
    references: list[str]
    # Free-form extra information; ``None`` is replaced by a fresh dict.
    metadata: dict | None = None

    def __post_init__(self):
        # A ``None`` default plus this hook gives every instance its own dict
        # instead of one mutable default shared between instances.
        if self.metadata is None:
            self.metadata = {}
        if len(self.questions) != len(self.references):
            raise ValueError(
                f"questionsとreferencesの数が一致しません: "
                f"{len(self.questions)} != {len(self.references)}"
            )

    def __len__(self) -> int:
        """Return the number of question/reference pairs."""
        return len(self.questions)

    def __repr__(self) -> str:
        return f"EvalDataset(name={self.name}, size={len(self)})"

    def subset(self, n: int) -> "EvalDataset":
        """Return a new dataset holding only the first ``n`` pairs.

        Plain slice semantics apply: an ``n`` larger than the dataset keeps
        everything, and a negative ``n`` drops items from the end.

        Args:
            n: Number of leading pairs to keep.

        Returns:
            A new ``EvalDataset`` named ``"<name>(subset=<n>)"``.
        """
        return EvalDataset(
            name=f"{self.name}(subset={n})",
            questions=self.questions[:n],
            references=self.references[:n],
            # Shallow copy so that editing the subset's metadata does not
            # silently change the parent dataset (and vice versa).
            metadata=dict(self.metadata),
        )
37
+
38
+
39
class DatasetLoader:
    """Build ``EvalDataset`` objects from Hugging Face ``datasets`` or from lists."""

    @staticmethod
    def _import_datasets():
        """Import and return the optional ``datasets`` package.

        Raises:
            ImportError: If ``datasets`` cannot be imported. The original
                failure is attached as ``__cause__`` so the root cause stays
                visible in the traceback.
        """
        try:
            import datasets
        except ImportError as err:
            raise ImportError(
                "datasetsパッケージが必要です: pip install datasets"
            ) from err
        return datasets

    @classmethod
    def load_squad(cls, split: str = "validation", max_samples: int = 100) -> EvalDataset:
        """Load SQuAD (Stanford Question Answering Dataset).

        A standard question-answering benchmark.

        Args:
            split: Split name passed to ``load_dataset``.
            max_samples: Upper bound on the number of leading items to load.

        Returns:
            An ``EvalDataset`` whose ``questions`` are the question texts and
            whose ``references`` are the answer texts.
        """
        ds = cls._import_datasets()
        dataset = ds.load_dataset("squad", split=split)
        limit = min(max_samples, len(dataset))

        questions = []
        references = []
        for item in dataset.select(range(limit)):
            questions.append(item["question"])
            # ``answers["text"]`` is indexed as a sequence; only its first
            # entry is kept as the reference.
            references.append(item["answers"]["text"][0])

        return EvalDataset(
            name="squad",
            questions=questions,
            references=references,
            metadata={"split": split, "max_samples": max_samples},
        )

    @classmethod
    def load_cnn_dailymail(cls, split: str = "validation", max_samples: int = 50) -> EvalDataset:
        """Load CNN/DailyMail (configuration ``"3.0.0"``).

        A standard summarisation benchmark.

        Args:
            split: Split name passed to ``load_dataset``.
            max_samples: Upper bound on the number of leading items to load.

        Returns:
            An ``EvalDataset`` whose ``questions`` are the source articles
            (cut to their first 512 characters) and whose ``references`` are
            the ``highlights`` summaries.
        """
        ds = cls._import_datasets()
        dataset = ds.load_dataset("cnn_dailymail", "3.0.0", split=split)
        limit = min(max_samples, len(dataset))

        questions = []
        references = []
        for item in dataset.select(range(limit)):
            # Character-based truncation of the article, not token-based.
            questions.append(item["article"][:512])
            references.append(item["highlights"])

        return EvalDataset(
            name="cnn_dailymail",
            questions=questions,
            references=references,
            metadata={"split": split, "max_samples": max_samples},
        )

    @classmethod
    def from_dict(
        cls,
        name: str,
        questions: list[str],
        references: list[str],
    ) -> EvalDataset:
        """Create an ``EvalDataset`` from your own in-memory data.

        Handy for testing without API keys. This path does not import the
        ``datasets`` package.

        Args:
            name: Dataset name.
            questions: Input texts.
            references: Expected outputs, index-aligned with ``questions``.

        Returns:
            The constructed ``EvalDataset``.
        """
        return EvalDataset(
            name=name,
            questions=questions,
            references=references,
        )
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC
4
+
5
+ from llm_eval.metrics import BaseMetric
6
+ from llm_eval.types import EvalResult
7
+
8
+
9
class BaseEvaluator(ABC):
    """Run several metrics over the same predictions and references.

    The class derives from ``ABC`` but declares no abstract methods, so it can
    be instantiated directly.
    """

    def __init__(self, metrics: list[BaseMetric]):
        """Store the metrics to run.

        Args:
            metrics: Metrics to execute, in order.
        """
        # Copy so that add_metric() never mutates the list the caller passed.
        self.metrics = list(metrics)

    def evaluate(
        self,
        predictions: list[str],
        references: list[str],
    ) -> dict[str, EvalResult]:
        """Compute every registered metric.

        Args:
            predictions: Model outputs.
            references: Texts the predictions are compared against.

        Returns:
            A mapping from ``metric.name`` to that metric's result. Metrics
            sharing a name overwrite each other; the last one wins.
        """
        return {
            metric.name: metric.compute(predictions, references)
            for metric in self.metrics
        }

    def add_metric(self, metric: BaseMetric) -> None:
        """Append ``metric`` to the metrics run by :meth:`evaluate`."""
        self.metrics.append(metric)

    def __repr__(self) -> str:
        metric_names = [m.name for m in self.metrics]
        return f"{self.__class__.__name__}(metrics={metric_names})"
@@ -0,0 +1,13 @@
1
+ from llm_eval.metrics.base import BaseMetric
2
+ from llm_eval.metrics.bleu import BLEUMetric
3
+ from llm_eval.metrics.rouge import ROUGEMetric
4
+ from llm_eval.metrics.semantic import SemanticSimilarityMetric
5
+ from llm_eval.metrics.judge import LLMJudgeMetric
6
+
7
+ __all__ = [
8
+ "BaseMetric",
9
+ "BLEUMetric",
10
+ "ROUGEMetric",
11
+ "SemanticSimilarityMetric",
12
+ "LLMJudgeMetric",
13
+ ]
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from llm_eval.types import EvalResult
6
+
7
+
8
class BaseMetric(ABC):
    """Abstract base class shared by every evaluation metric."""

    def __init__(self, name: str):
        # Identifier under which the metric reports its results.
        self.name = name

    @abstractmethod
    def compute(self, predictions: list[str], references: list[str]) -> EvalResult:
        """Score ``predictions`` against ``references``.

        Concrete metrics must override this method.
        """

    def __repr__(self) -> str:
        return "{}(name={})".format(type(self).__name__, self.name)
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ import nltk
4
+ from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
5
+
6
+ from llm_eval.metrics.base import BaseMetric
7
+ from llm_eval.types import EvalResult
8
+
9
+
10
class BLEUMetric(BaseMetric):
    """Compute a corpus-level BLEU (Bilingual Evaluation Understudy) score.

    Measures n-gram overlap between LLM output and reference text, e.g. for
    translation or summarisation. Scores range from 0.0 to 1.0, where 1.0 is
    an exact match. Texts are lower-cased and split on whitespace before
    scoring.
    """

    def __init__(self, max_n: int = 4):
        """Create the metric.

        Args:
            max_n: Highest n-gram order included in the score.

        Raises:
            ValueError: If ``max_n`` is smaller than 1.
        """
        if max_n < 1:
            raise ValueError(f"max_n must be >= 1, got {max_n}")
        super().__init__(name="bleu")
        self.max_n = max_n
        self._ensure_nltk_data()

    def _ensure_nltk_data(self) -> None:
        """Download the NLTK ``punkt`` and ``punkt_tab`` resources if missing.

        NOTE(review): compute() tokenises with ``str.split()`` and does not
        call an NLTK tokenizer itself, so these downloads may be unnecessary;
        confirm before removing.
        """
        for resource in ("punkt", "punkt_tab"):
            try:
                nltk.data.find(f"tokenizers/{resource}")
            except LookupError:
                nltk.download(resource, quiet=True)

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Score ``predictions`` against ``references`` with corpus BLEU.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``score`` is the corpus BLEU value and
            whose ``details`` hold ``num_samples`` and ``max_n``.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        tokenized_predictions = [p.lower().split() for p in predictions]
        # corpus_bleu expects a list of references per hypothesis, hence the
        # extra level of nesting around the single reference.
        tokenized_references = [[r.lower().split()] for r in references]

        # Uniform weights over 1..max_n-grams so that max_n actually controls
        # the score; max_n=4 reproduces corpus_bleu's default weights.
        weights = tuple(1.0 / self.max_n for _ in range(self.max_n))

        smoothing = SmoothingFunction().method1
        score = corpus_bleu(
            tokenized_references,
            tokenized_predictions,
            weights=weights,
            smoothing_function=smoothing,
        )

        return EvalResult(
            metric_name=self.name,
            score=float(score),
            details={
                "num_samples": len(predictions),
                "max_n": self.max_n,
            },
        )
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+
6
+ from llm_eval.metrics.base import BaseMetric
7
+ from llm_eval.types import EvalResult
8
+
9
+
10
+ JUDGE_PROMPT_TEMPLATE = """You are an expert evaluator for AI-generated text.
11
+
12
+ Evaluate the following response based on these criteria:
13
+ - Accuracy: Is the information correct?
14
+ - Clarity: Is the response clear and easy to understand?
15
+ - Completeness: Does it fully address the question?
16
+ - Relevance: Is it relevant to the question asked?
17
+
18
+ Question: {question}
19
+ Response: {response}
20
+
21
+ Provide your evaluation in the following format:
22
+ Score: [0-10]
23
+ Reasoning: [brief explanation]
24
+
25
+ Be strict but fair. A score of 10 means perfect."""
26
+
27
+
28
class LLMJudgeMetric(BaseMetric):
    """Metric in which an LLM acts as the judge and grades each output.

    Output quality can be assessed without reference texts. The judge's raw
    0-10 score is normalised to the range 0.0-1.0.
    """

    def __init__(
        self,
        judge_model: str = "gpt-4o-mini",
        api_key: str | None = None,
        prompt_template: str | None = None,
    ):
        """Configure the judge.

        Args:
            judge_model: Model name sent to the chat completions API.
            api_key: API key; falls back to the ``OPENAI_API_KEY`` environment
                variable when falsy.
            prompt_template: Template formatted with ``question`` and
                ``response``; defaults to ``JUDGE_PROMPT_TEMPLATE``.
        """
        super().__init__(name="llm_judge")
        self.judge_model = judge_model
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.prompt_template = prompt_template or JUDGE_PROMPT_TEMPLATE
        self._client = None

    def _get_client(self):
        """Return the OpenAI client, creating it on first use.

        Raises:
            ImportError: If the ``openai`` package cannot be imported; the
                original failure is attached as ``__cause__``.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as err:
                raise ImportError(
                    "openaiパッケージが必要です: "
                    "pip install llm-evaluation-toolkit[openai]"
                ) from err
            self._client = OpenAI(api_key=self.api_key)
        return self._client

    def _parse_score(self, judge_response: str) -> float:
        """Extract the normalised score from the judge's reply.

        Looks for the first ``Score: <number>`` (case-insensitive), divides it
        by 10 and clamps the result to [0.0, 1.0].

        Returns:
            The normalised score, or the neutral value 0.5 when no score can
            be found in the reply.
        """
        pattern = r"Score:\s*(\d+(?:\.\d+)?)"
        match = re.search(pattern, judge_response, re.IGNORECASE)
        if match:
            raw_score = float(match.group(1))
            return min(max(raw_score / 10.0, 0.0), 1.0)
        return 0.5

    def _judge_single(self, question: str, response: str) -> tuple[float, str]:
        """Grade one response.

        Returns:
            A ``(score, judge_response)`` pair: the normalised score and the
            judge's raw reply text (empty string if the API returned none).
        """
        client = self._get_client()
        prompt = self.prompt_template.format(
            question=question,
            response=response,
        )
        result = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.0,
        )
        judge_response = result.choices[0].message.content or ""
        score = self._parse_score(judge_response)
        return score, judge_response

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Grade every prediction and average the scores.

        Args:
            predictions: LLM output texts.
            references: The matching *questions* (for this metric the second
                argument carries questions, not gold answers).

        Returns:
            An ``EvalResult`` whose ``score`` is the mean normalised score, or
            0.0 when the input is empty. ``details`` hold the per-item scores
            and the judge's raw replies.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        scores = []
        judge_responses = []

        for question, response in zip(references, predictions):
            score, judge_response = self._judge_single(question, response)
            scores.append(score)
            judge_responses.append(judge_response)
            print(f"採点完了: {score:.2f} / 1.0")

        # Empty input yields no scores; report 0.0 instead of dividing by zero.
        avg_score = sum(scores) / len(scores) if scores else 0.0

        return EvalResult(
            metric_name=self.name,
            score=avg_score,
            details={
                "num_samples": len(predictions),
                "judge_model": self.judge_model,
                "individual_scores": scores,
                "judge_responses": judge_responses,
            },
        )
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ from rouge_score import rouge_scorer
4
+
5
+ from llm_eval.metrics.base import BaseMetric
6
+ from llm_eval.types import EvalResult
7
+
8
+
9
class ROUGEMetric(BaseMetric):
    """Compute ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scores.

    Commonly used for summarisation tasks.

    - ROUGE-1: unigram overlap
    - ROUGE-2: bigram overlap
    - ROUGE-L: overlap based on the longest common subsequence
    """

    def __init__(self, rouge_types: list[str] | None = None):
        """Create the metric.

        Args:
            rouge_types: ROUGE variants to compute; ``None`` or an empty list
                selects ``["rouge1", "rouge2", "rougeL"]``.
        """
        super().__init__(name="rouge")
        # Copy so later changes to the caller's list cannot desynchronise
        # this attribute from the scorer built below.
        self.rouge_types = (
            list(rouge_types) if rouge_types else ["rouge1", "rouge2", "rougeL"]
        )
        self._scorer = rouge_scorer.RougeScorer(
            self.rouge_types,
            use_stemmer=True,
        )

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Average the per-pair ROUGE F-measures.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``details`` map each ROUGE type to its
            mean F-measure (0.0 for empty input). ``score`` is the ``rougeL``
            value when computed, otherwise the first configured type.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        aggregated: dict[str, list[float]] = {t: [] for t in self.rouge_types}

        for pred, ref in zip(predictions, references):
            # The scorer is called with the reference first, then the prediction.
            scores = self._scorer.score(ref, pred)
            for rouge_type in self.rouge_types:
                aggregated[rouge_type].append(scores[rouge_type].fmeasure)

        # Empty input leaves every list empty; report 0.0 rather than
        # dividing by zero.
        details = {
            rouge_type: (sum(values) / len(values) if values else 0.0)
            for rouge_type, values in aggregated.items()
        }

        if "rougeL" in details:
            primary_score = details["rougeL"]
        else:
            primary_score = details[self.rouge_types[0]]

        return EvalResult(
            metric_name=self.name,
            score=primary_score,
            details=details,
        )
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+ from llm_eval.metrics.base import BaseMetric
6
+ from llm_eval.types import EvalResult
7
+
8
+
9
class SemanticSimilarityMetric(BaseMetric):
    """Score semantic closeness as the cosine similarity of sentence embeddings.

    Captures paraphrases and synonyms that BLEU and ROUGE miss. Cosine
    similarity lies in [-1.0, 1.0] and is not clamped here; 1.0 means the two
    embeddings point in the same direction.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Create the metric.

        Args:
            model_name: Name passed to ``SentenceTransformer`` on first use.
        """
        super().__init__(name="semantic_similarity")
        self.model_name = model_name
        self._model = None

    def _load_model(self):
        """Return the embedding model, loading it on the first call.

        Raises:
            ImportError: If ``sentence_transformers`` cannot be imported; the
                original failure is attached as ``__cause__``.
        """
        if self._model is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as err:
                raise ImportError(
                    "sentence-transformersが必要です: "
                    "pip install llm-evaluation-toolkit[semantic]"
                ) from err
            print(f"モデルを読み込み中: {self.model_name}")
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of two vectors.

        Returns 0.0 when either vector has zero norm, avoiding a division by
        zero.
        """
        dot_product = np.dot(a, b)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(dot_product / (norm_a * norm_b))

    def compute(
        self,
        predictions: list[str],
        references: list[str],
    ) -> EvalResult:
        """Average the pairwise cosine similarity of predictions and references.

        Args:
            predictions: Model outputs.
            references: One reference text per prediction.

        Returns:
            An ``EvalResult`` whose ``score`` is the mean similarity.
            ``details`` hold the per-pair scores plus their minimum and
            maximum. Empty input yields a score of 0.0 without loading the
            model.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(predictions) != len(references):
            raise ValueError(
                f"predictionsとreferencesの数が一致しません: "
                f"{len(predictions)} != {len(references)}"
            )

        if not predictions:
            # np.min/np.max raise on empty input and np.mean yields NaN, so
            # answer directly and skip the model load.
            return EvalResult(
                metric_name=self.name,
                score=0.0,
                details={
                    "num_samples": 0,
                    "model": self.model_name,
                    "individual_scores": [],
                    "min_score": 0.0,
                    "max_score": 0.0,
                },
            )

        model = self._load_model()

        pred_embeddings = model.encode(predictions, convert_to_numpy=True)
        ref_embeddings = model.encode(references, convert_to_numpy=True)

        similarities = [
            self._cosine_similarity(pred_vec, ref_vec)
            for pred_vec, ref_vec in zip(pred_embeddings, ref_embeddings)
        ]

        avg_score = float(np.mean(similarities))

        return EvalResult(
            metric_name=self.name,
            score=avg_score,
            details={
                "num_samples": len(predictions),
                "model": self.model_name,
                "individual_scores": similarities,
                "min_score": float(np.min(similarities)),
                "max_score": float(np.max(similarities)),
            },
        )
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+
6
+
7
@dataclass
class GenerationConfig:
    """Settings applied to a text-generation request."""

    # Token limit forwarded to the provider API as ``max_tokens``.
    max_tokens: int = 512
    # Sampling temperature forwarded to the provider API.
    temperature: float = 0.0
    # System prompt sent along with every user prompt.
    system_prompt: str = "You are a helpful assistant."
13
+
14
+
15
class BaseProvider(ABC):
    """Abstract base class shared by every LLM provider."""

    def __init__(self, model: str, config: GenerationConfig | None = None):
        self.model = model
        # Any falsy config (including None) is replaced by the defaults.
        if not config:
            config = GenerationConfig()
        self.config = config

    @abstractmethod
    def generate(self, prompt: str) -> str:
        """Return the model's completion for ``prompt``; subclasses implement."""

    def generate_batch(self, prompts: list[str]) -> list[str]:
        """Call :meth:`generate` on each prompt in order and collect the outputs."""
        outputs = []
        for prompt in prompts:
            outputs.append(self.generate(prompt))
        return outputs

    def __repr__(self) -> str:
        return "{}(model={})".format(type(self).__name__, self.model)
31
+
32
+
33
+ from llm_eval.providers.openai_provider import OpenAIProvider # noqa: E402
34
+ from llm_eval.providers.anthropic_provider import AnthropicProvider # noqa: E402
35
+
36
+ __all__ = ["BaseProvider", "GenerationConfig", "OpenAIProvider", "AnthropicProvider"]
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from llm_eval.providers import BaseProvider, GenerationConfig
6
+
7
+
8
class AnthropicProvider(BaseProvider):
    """Provider backed by the Anthropic API."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        config: GenerationConfig | None = None,
        api_key: str | None = None,
    ):
        """Create the provider and its API client.

        Args:
            model: Model name sent with every request.
            config: Generation settings; defaults are used when omitted.
            api_key: API key; falls back to the ``ANTHROPIC_API_KEY``
                environment variable when falsy.

        Raises:
            ImportError: If the ``anthropic`` package cannot be imported; the
                original failure is attached as ``__cause__``.
        """
        super().__init__(model=model, config=config)
        try:
            import anthropic
        except ImportError as err:
            raise ImportError(
                "anthropicパッケージが必要です: pip install llm-evaluation-toolkit[anthropic]"
            ) from err
        self._client = anthropic.Anthropic(
            api_key=api_key or os.environ.get("ANTHROPIC_API_KEY")
        )

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Returns:
            The concatenated text of all text blocks in the response, or an
            empty string when the response contains none.
        """
        response = self._client.messages.create(
            model=self.model,
            max_tokens=self.config.max_tokens,
            # Honour the configured temperature, as the OpenAI provider does;
            # previously the setting was silently ignored here.
            temperature=self.config.temperature,
            system=self.config.system_prompt,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        # The content list may be empty or start with a non-text block, so
        # collect text blocks instead of indexing content[0] blindly.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from llm_eval.providers import BaseProvider, GenerationConfig
6
+
7
+
8
class OpenAIProvider(BaseProvider):
    """Provider backed by the OpenAI API."""

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        config: GenerationConfig | None = None,
        api_key: str | None = None,
    ):
        """Create the provider and its API client.

        Args:
            model: Model name sent with every request.
            config: Generation settings; defaults are used when omitted.
            api_key: API key; falls back to the ``OPENAI_API_KEY`` environment
                variable when falsy.

        Raises:
            ImportError: If the ``openai`` package cannot be imported; the
                original failure is attached as ``__cause__``.
        """
        super().__init__(model=model, config=config)
        try:
            from openai import OpenAI
        except ImportError as err:
            raise ImportError(
                "openaiパッケージが必要です: pip install llm-evaluation-toolkit[openai]"
            ) from err
        self._client = OpenAI(
            api_key=api_key or os.environ.get("OPENAI_API_KEY")
        )

    def generate(self, prompt: str) -> str:
        """Send the system prompt plus ``prompt`` and return the reply text.

        Returns:
            The content of the first choice, or an empty string when that
            content is empty or ``None``.
        """
        response = self._client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.config.system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=self.config.max_tokens,
            temperature=self.config.temperature,
        )
        return response.choices[0].message.content or ""
llm_eval/types.py ADDED
@@ -0,0 +1,15 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any
3
+
4
+
5
@dataclass
class EvalResult:
    """Data class holding the outcome of one metric run."""

    # Name of the metric that produced this result.
    metric_name: str
    # Headline score of the metric.
    score: float
    # Metric-specific breakdown; a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)
    # Additional free-form information; a fresh dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)

    def __repr__(self) -> str:
        # The score is always shown with four decimal places.
        return "EvalResult(metric={}, score={:.4f})".format(
            self.metric_name, self.score
        )
@@ -0,0 +1,279 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-evaluation-toolkit
3
+ Version: 0.1.0
4
+ Summary: A lightweight toolkit for evaluating LLM outputs
5
+ Project-URL: Homepage, https://github.com/swoswoyuu1156/llm-evaluation-toolkit
6
+ Project-URL: Repository, https://github.com/swoswoyuu1156/llm-evaluation-toolkit
7
+ Project-URL: Issues, https://github.com/swoswoyuu1156/llm-evaluation-toolkit/issues
8
+ Author-email: swoswoyuu1156 <174082729+swoswoyuu1156@users.noreply.github.com>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 swoswoyuu1156
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: ai,evaluation,llm,machine-learning,nlp
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.9
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
41
+ Requires-Python: >=3.9
42
+ Requires-Dist: datasets>=2.14
43
+ Requires-Dist: nltk>=3.8
44
+ Requires-Dist: numpy>=1.24
45
+ Requires-Dist: python-dotenv>=1.0
46
+ Requires-Dist: requests>=2.28
47
+ Requires-Dist: rouge-score>=0.1.2
48
+ Requires-Dist: sentence-transformers>=2.2
49
+ Provides-Extra: anthropic
50
+ Requires-Dist: anthropic>=0.20; extra == 'anthropic'
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
53
+ Requires-Dist: pytest>=7.0; extra == 'dev'
54
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
55
+ Provides-Extra: openai
56
+ Requires-Dist: openai>=1.0; extra == 'openai'
57
+ Provides-Extra: semantic
58
+ Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
59
+ Description-Content-Type: text/markdown
60
+
61
+ # llm-evaluation-toolkit
62
+
63
+ [![CI](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/actions/workflows/ci.yml)
64
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org)
65
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
66
+ [![PyPI version](https://badge.fury.io/py/llm-evaluation-toolkit.svg)](https://badge.fury.io/py/llm-evaluation-toolkit)
67
+
68
+ LLMの出力を評価するための軽量Pythonライブラリです。BLEU・ROUGE・意味的類似度・LLM-as-a-Judgeの4種類の評価指標を、統一されたAPIで利用できます。
69
+
70
+ *A lightweight Python toolkit for evaluating LLM outputs. Supports BLEU, ROUGE, semantic similarity, and LLM-as-a-Judge — all with a unified API.*
71
+
72
+ ---
73
+
74
+ ## なぜ llm-evaluation-toolkit なのか? / Why llm-evaluation-toolkit?
75
+
76
+ 既存の評価ライブラリは研究用途に特化していたり、セットアップが複雑すぎる問題がありました。このライブラリは、実際の開発ワークフローでLLMの出力を素早く評価したい開発者のために設計されています。
77
+
78
+ *Existing evaluation libraries are either too research-focused or require complex setup. This toolkit is designed for developers who need to evaluate LLM outputs quickly in real workflows.*
79
+
80
+ - **統一API** — すべての評価指標が同じインターフェースを持つ / *Unified API — all metrics share the same interface*
81
+ - **複数プロバイダ対応** — OpenAI・Anthropicをすぐに利用可能 / *Multiple providers — OpenAI and Anthropic out of the box*
82
+ - **正解テキスト不要** — LLM-as-a-Judgeは参照テキストなしで評価可能 / *No reference needed — LLM-as-a-Judge works without ground truth*
83
+ - **軽量設計** — 必要な機能だけインストールできる / *Lightweight — install only what you need*
84
+
85
+ ---
86
+
87
+ ## インストール / Installation
88
+
89
+ ```bash
90
+ # 基本インストール / Base installation
91
+ pip install llm-evaluation-toolkit
92
+
93
+ # OpenAI対応 / With OpenAI support
94
+ pip install llm-evaluation-toolkit[openai]
95
+
96
+ # Anthropic対応 / With Anthropic support
97
+ pip install llm-evaluation-toolkit[anthropic]
98
+
99
+ # 意味的類似度対応 / With semantic similarity support
100
+ pip install llm-evaluation-toolkit[semantic]
101
+
102
+ # 全機能 / All features
103
+ pip install llm-evaluation-toolkit[openai,anthropic,semantic]
104
+ ```
105
+
106
+ ---
107
+
108
+ ## クイックスタート / Quick Start
109
+
110
+ ### 基本的な評価 / Basic evaluation
111
+
112
+ ```python
113
+ from llm_eval import BLEUMetric, ROUGEMetric, SemanticSimilarityMetric
114
+
115
+ predictions = [
116
+ "The cat is on the mat",
117
+ "A dog was running in the park",
118
+ ]
119
+ references = [
120
+ "A cat is sitting on a mat",
121
+ "The dog ran through the park",
122
+ ]
123
+
124
+ bleu = BLEUMetric()
125
+ rouge = ROUGEMetric()
126
+ semantic = SemanticSimilarityMetric()
127
+
128
+ print(bleu.compute(predictions, references))
129
+ # EvalResult(metric=bleu, score=0.4231)
130
+
131
+ print(rouge.compute(predictions, references))
132
+ # EvalResult(metric=rouge, score=0.6842)
133
+
134
+ print(semantic.compute(predictions, references))
135
+ # EvalResult(metric=semantic_similarity, score=0.8923)
136
+ ```
137
+
138
+ ### 正解テキストなしで評価(LLM-as-a-Judge) / Evaluate without reference texts
139
+
140
+ ```python
141
+ from llm_eval import LLMJudgeMetric
142
+
143
+ # 正解テキストなしで出力品質を評価できる
144
+ # Evaluate output quality without any reference text
145
+ judge = LLMJudgeMetric(judge_model="gpt-4o-mini")
146
+
147
+ questions = ["日本の首都はどこですか?"]
148
+ answers = ["日本の首都は東京です。政治・経済・文化の中心地として機能しています。"]
149
+
150
+ result = judge.compute(answers, questions)
151
+ print(result)
152
+ # EvalResult(metric=llm_judge, score=0.9000)
153
+ ```
154
+
155
+ ### 複数の評価指標を一括実行 / Run multiple metrics at once
156
+
157
+ ```python
158
+ from llm_eval import BaseEvaluator, BLEUMetric, ROUGEMetric, SemanticSimilarityMetric
159
+
160
+ evaluator = BaseEvaluator(metrics=[
161
+ BLEUMetric(),
162
+ ROUGEMetric(),
163
+ SemanticSimilarityMetric(),
164
+ ])
165
+
166
+ results = evaluator.evaluate(predictions, references)
167
+ for metric_name, result in results.items():
168
+ print(f"{metric_name}: {result.score:.4f}")
169
+ ```
170
+
171
+ ### LLMプロバイダとの連携 / Use with LLM providers
172
+
173
+ ```python
174
+ from llm_eval import OpenAIProvider, GenerationConfig, BLEUMetric
175
+
176
+ # LLMで回答を生成してそのまま評価する
177
+ # Generate predictions from LLM and evaluate directly
178
+ provider = OpenAIProvider(
179
+ model="gpt-4o-mini",
180
+ config=GenerationConfig(temperature=0.0, max_tokens=256),
181
+ )
182
+
183
+ questions = ["機械学習とは何ですか?", "ニューラルネットワークを説明してください。"]
184
+ predictions = provider.generate_batch(questions)
185
+
186
+ references = [
187
+ "機械学習はデータから学習するAIの一分野です。",
188
+ "ニューラルネットワークは人間の脳を模倣した計算システムです。",
189
+ ]
190
+
191
+ result = BLEUMetric().compute(predictions, references)
192
+ print(result)
193
+ ```
194
+
195
+ ### ベンチマークデータセットの利用 / Load benchmark datasets
196
+
197
+ ```python
198
+ from llm_eval import DatasetLoader
199
+
200
+ # 組み込みベンチマークを使う / Use built-in benchmarks
201
+ squad = DatasetLoader.load_squad(max_samples=50)
202
+ print(squad)
203
+ # EvalDataset(name=squad, size=50)
204
+
205
+ # 自前データを使う / Use your own data
206
+ dataset = DatasetLoader.from_dict(
207
+ name="my_dataset",
208
+ questions=["質問1", "質問2"],
209
+ references=["回答1", "回答2"],
210
+ )
211
+ ```
212
+
213
+ ---
214
+
215
+ ## 対応評価指標 / Supported Metrics
216
+
217
+ | 指標 / Metric | 正解テキスト / Reference | 適したタスク / Best for |
218
+ |--------------|------------------------|------------------------|
219
+ | BLEU | 必要 / Yes | 翻訳・固定フォーマット生成 / Translation, fixed-format generation |
220
+ | ROUGE | 必要 / Yes | 要約 / Summarization |
221
+ | Semantic Similarity | 必要 / Yes | 言い換えを含むタスク / Paraphrase-heavy tasks |
222
+ | LLM-as-a-Judge | 不要 / No | 自由記述生成 / Open-ended generation |
223
+
224
+ ---
225
+
226
+ ## 対応プロバイダ / Supported Providers
227
+
228
+ | プロバイダ / Provider | インストール / Install | モデル例 / Models |
229
+ |----------------------|----------------------|------------------|
230
+ | OpenAI | `pip install llm-evaluation-toolkit[openai]` | gpt-4o, gpt-4o-mini |
231
+ | Anthropic | `pip install llm-evaluation-toolkit[anthropic]` | claude-opus-4-6, claude-haiku-4-5 |
232
+
233
+ ---
234
+
235
+ ## 環境変数 / Environment Variables
236
+
237
+ ```bash
238
+ OPENAI_API_KEY=your_openai_api_key
239
+ ANTHROPIC_API_KEY=your_anthropic_api_key
240
+ ```
241
+
242
+ ---
243
+
244
+ ## 開発環境のセットアップ / Development Setup
245
+
246
+ ```bash
247
+ git clone https://github.com/swoswoyuu1156/llm-evaluation-toolkit.git
248
+ cd llm-evaluation-toolkit
249
+ python -m venv .venv
250
+
251
+ # Mac/Linux
252
+ source .venv/bin/activate
253
+ # Windows
254
+ .venv\Scripts\activate
255
+
256
+ pip install -e ".[dev]"
257
+
258
+ # テスト実行 / Run tests
259
+ pytest tests/ -v --cov=src/llm_eval
260
+
261
+ # リント実行 / Run linter
262
+ ruff check src/ tests/
263
+ ```
264
+
265
+ ---
266
+
267
+ ## コントリビューション / Contributing
268
+
269
+ コントリビューションを歓迎します!まず [CONTRIBUTING.md](CONTRIBUTING.md) をご確認ください。
270
+
271
+ *Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) first.*
272
+
273
+ ---
274
+
275
+ ## ライセンス / License
276
+
277
+ MIT License — 詳細は [LICENSE](LICENSE) をご確認ください。
278
+
279
+ *MIT License — see [LICENSE](LICENSE) for details.*
@@ -0,0 +1,17 @@
1
+ llm_eval/__init__.py,sha256=WZGF2gsReDmuW8tTozYzRP2CsW32wxVSoh9FXGpEHrE,780
2
+ llm_eval/datasets.py,sha256=stNm2vvSZf9y5sz6qNYxGXowAe458levjSy35Qg4gOc,3528
3
+ llm_eval/types.py,sha256=J1nFoWtPsi1fwRmxF_UpIKHzXOkYcNlW1-mybLa30FI,418
4
+ llm_eval/evaluators/__init__.py,sha256=P6Ayk7a3DkB8YPSBm1v2IvWJQszqNwIVbxJ4xxSH0Eg,841
5
+ llm_eval/metrics/__init__.py,sha256=k57U6Y1JBDf6-uS5tgTL6W0ggEAotMh4VbYumJ6-PyM,373
6
+ llm_eval/metrics/base.py,sha256=Z4bFvy9n6LpG78-UXUaL4SpO8x73q95TotLecEIVuKs,491
7
+ llm_eval/metrics/bleu.py,sha256=X7YqDYZcS2lHESuIhb8FX-r9K3hHZ3zDQNn-uJQvIDs,1826
8
+ llm_eval/metrics/judge.py,sha256=edwJxALUNtwu0LABS3F-QeM0MQNpUF-Y8phLnEFKPm0,4115
9
+ llm_eval/metrics/rouge.py,sha256=CEt-bAvKQTAXdxlU1z76bxt9wePlKDSluyaUZ05EQ-o,1762
10
+ llm_eval/metrics/semantic.py,sha256=FtqrWYQkr_juOezlOs-p4aYvaKuf6Z4CHZkZt5PDxjI,2729
11
+ llm_eval/providers/__init__.py,sha256=O_ommltS7L6GyI__ff8DDDVCXhWY5kxDp2eI4aAu42o,1067
12
+ llm_eval/providers/anthropic_provider.py,sha256=Qhh0bgmzr3-kTG0PVr4POuBWTfGbF17ch_oloDh4esQ,1115
13
+ llm_eval/providers/openai_provider.py,sha256=XYhUed104Vfvrlo1vHqXJbi_pZG6buzXqdN3bqgpQuc,1184
14
+ llm_evaluation_toolkit-0.1.0.dist-info/METADATA,sha256=06QyqOU_lJsq-9EJAQESmcRCjjO2GaMiiDD6AIFUHbg,10275
15
+ llm_evaluation_toolkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
16
+ llm_evaluation_toolkit-0.1.0.dist-info/licenses/LICENSE,sha256=o38RZRbQzW-aKIgfjXQ3tNCZoSDUNvY3-OC15qE6baE,1070
17
+ llm_evaluation_toolkit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 swoswoyuu1156
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.