llm-evaluation-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. llm_evaluation_toolkit-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  2. llm_evaluation_toolkit-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
  3. llm_evaluation_toolkit-0.1.0/.github/pull_request_template.md +15 -0
  4. llm_evaluation_toolkit-0.1.0/.github/workflows/ci.yml +41 -0
  5. llm_evaluation_toolkit-0.1.0/.github/workflows/release.yml +42 -0
  6. llm_evaluation_toolkit-0.1.0/.gitignore +234 -0
  7. llm_evaluation_toolkit-0.1.0/CHANGELOG.md +16 -0
  8. llm_evaluation_toolkit-0.1.0/CODE_OF_CONDUCT.md +0 -0
  9. llm_evaluation_toolkit-0.1.0/CONTRIBUTING.md +91 -0
  10. llm_evaluation_toolkit-0.1.0/LICENSE +21 -0
  11. llm_evaluation_toolkit-0.1.0/PKG-INFO +279 -0
  12. llm_evaluation_toolkit-0.1.0/README.md +219 -0
  13. llm_evaluation_toolkit-0.1.0/examples/basic_usage.py +72 -0
  14. llm_evaluation_toolkit-0.1.0/pyproject.toml +60 -0
  15. llm_evaluation_toolkit-0.1.0/src/llm_eval/__init__.py +30 -0
  16. llm_evaluation_toolkit-0.1.0/src/llm_eval/datasets.py +120 -0
  17. llm_evaluation_toolkit-0.1.0/src/llm_eval/evaluators/__init__.py +30 -0
  18. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/__init__.py +13 -0
  19. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/base.py +23 -0
  20. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/bleu.py +61 -0
  21. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/judge.py +123 -0
  22. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/rouge.py +55 -0
  23. llm_evaluation_toolkit-0.1.0/src/llm_eval/metrics/semantic.py +78 -0
  24. llm_evaluation_toolkit-0.1.0/src/llm_eval/providers/__init__.py +36 -0
  25. llm_evaluation_toolkit-0.1.0/src/llm_eval/providers/anthropic_provider.py +37 -0
  26. llm_evaluation_toolkit-0.1.0/src/llm_eval/providers/openai_provider.py +38 -0
  27. llm_evaluation_toolkit-0.1.0/src/llm_eval/types.py +15 -0
  28. llm_evaluation_toolkit-0.1.0/tests/__init__.py +0 -0
  29. llm_evaluation_toolkit-0.1.0/tests/conftest.py +28 -0
  30. llm_evaluation_toolkit-0.1.0/tests/test_evaluators.py +39 -0
  31. llm_evaluation_toolkit-0.1.0/tests/test_metrics.py +138 -0
  32. llm_evaluation_toolkit-0.1.0/tests/test_types.py +53 -0
@@ -0,0 +1,30 @@
1
+ ---
2
+ name: Bug report
3
+ about: バグの報告
4
+ title: '[BUG] '
5
+ labels: bug
6
+ ---
7
+
8
+ ## バグの説明
9
+ <!-- 何が起きているか簡潔に説明してください -->
10
+
11
+ ## 再現手順
12
+ 1.
13
+ 2.
14
+ 3.
15
+
16
+ ## 期待される動作
17
+ <!-- 本来どう動くべきか -->
18
+
19
+ ## 実際の動作
20
+ <!-- 実際に何が起きているか -->
21
+
22
+ ## 環境
23
+ - OS:
24
+ - Python version:
25
+ - llm-evaluation-toolkit version:
26
+
27
+ ## エラーメッセージ
28
+ ```
29
+ エラーをここに貼り付け
30
+ ```
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: Feature request
3
+ about: 新機能の提案
4
+ title: '[FEAT] '
5
+ labels: enhancement
6
+ ---
7
+
8
+ ## 解決したい問題
9
+ <!-- どんな課題があるか -->
10
+
11
+ ## 提案する機能
12
+ <!-- どんな機能があれば解決できるか -->
13
+
14
+ ## 代替案
15
+ <!-- 他に考えられる解決方法があれば -->
@@ -0,0 +1,15 @@
1
+ ## 変更内容
2
+ <!-- このPRで何を変更したか -->
3
+
4
+ ## 変更の種類
5
+ - [ ] バグ修正
6
+ - [ ] 新機能
7
+ - [ ] ドキュメント更新
8
+ - [ ] リファクタリング
9
+
10
+ ## テスト
11
+ - [ ] 既存のテストが全てパスする
12
+ - [ ] 新しいテストを追加した
13
+
14
+ ## 関連Issue
15
+ closes #
@@ -0,0 +1,41 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+ branches: [ main, develop ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.9", "3.10", "3.11"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Run linter
30
+ run: |
31
+ ruff check src/ tests/
32
+
33
+ - name: Run tests
34
+ run: |
35
+ pytest tests/ -v --cov=src/llm_eval --cov-report=xml --cov-fail-under=70
36
+
37
+ - name: Upload coverage report
38
+ uses: codecov/codecov-action@v4
39
+ with:
40
+ file: ./coverage.xml
41
+ fail_ci_if_error: false
@@ -0,0 +1,42 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*.*.*"
7
+
8
+ jobs:
9
+ release:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ contents: write
13
+ env:
14
+ FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.11"
23
+
24
+ - name: Install build tools
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install build twine
28
+
29
+ - name: Build package
30
+ run: python -m build
31
+
32
+ - name: Publish to PyPI
33
+ env:
34
+ TWINE_USERNAME: __token__
35
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
36
+ run: twine upload dist/*
37
+
38
+ - name: Create GitHub Release
39
+ uses: softprops/action-gh-release@v2
40
+ with:
41
+ files: dist/*
42
+ generate_release_notes: true
@@ -0,0 +1,234 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+
220
+ # Virtual environment
221
+ .venv/
222
+
223
+ # Build artifacts
224
+ dist/
225
+ *.egg-info/
226
+
227
+ # Coverage
228
+ .coverage
229
+ htmlcov/
230
+
231
+ # Environment variables
232
+ .env
233
+
234
+ .venv_test/
@@ -0,0 +1,16 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+
5
+ ## [0.1.0] - 2024-01-01
6
+
7
+ ### Added
8
+ - `BLEUMetric` for translation and generation evaluation
9
+ - `ROUGEMetric` for summarization evaluation
10
+ - `SemanticSimilarityMetric` using sentence-transformers
11
+ - `LLMJudgeMetric` for reference-free evaluation
12
+ - `OpenAIProvider` and `AnthropicProvider` for LLM integration
13
+ - `DatasetLoader` with SQuAD and CNN/DailyMail support
14
+ - `BaseEvaluator` for running multiple metrics at once
15
+ - GitHub Actions CI/CD pipeline
16
+ - Support for Python 3.9, 3.10, 3.11
File without changes
@@ -0,0 +1,91 @@
1
+ # llm-evaluation-toolkit への貢献ガイド
2
+ # Contributing to llm-evaluation-toolkit
3
+
4
+ 貢献に興味を持っていただきありがとうございます!
5
+
6
+ *Thank you for your interest in contributing!*
7
+
8
+ ---
9
+
10
+ ## はじめに / Getting Started
11
+
12
+ 1. リポジトリをフォークする / Fork the repository
13
+ 2. `develop` ブランチからフィーチャーブランチを作成する / Create a feature branch from `develop`
14
+ ```bash
15
+ git checkout develop
16
+ git checkout -b feat/your-feature-name
17
+ ```
18
+ 3. 変更を加える / Make your changes
19
+ 4. テストとリントを実行する / Run tests and linter
20
+ ```bash
21
+ ruff check src/ tests/
22
+ pytest tests/ -v --cov=src/llm_eval
23
+ ```
24
+ 5. コミットしてプッシュする / Commit and push
25
+ ```bash
26
+ git commit -m "feat: your feature description"
27
+ git push origin feat/your-feature-name
28
+ ```
29
+ 6. `develop` ブランチへのPull Requestを作成する / Open a Pull Request to `develop`
30
+
31
+ ---
32
+
33
+ ## 新しい評価指標を追加する / Adding a New Metric
34
+
35
+ 1. `src/llm_eval/metrics/your_metric.py` を作成する / Create `src/llm_eval/metrics/your_metric.py`
36
+ 2. `BaseMetric` を継承して `compute()` を実装する / Inherit from `BaseMetric` and implement `compute()`
37
+ 3. `src/llm_eval/metrics/__init__.py` からエクスポートする / Export from `src/llm_eval/metrics/__init__.py`
38
+ 4. `tests/test_metrics.py` にテストを追加する / Add tests in `tests/test_metrics.py`
39
+
40
+ ```python
41
+ from __future__ import annotations
42
+
43
+ from llm_eval.metrics.base import BaseMetric
44
+ from llm_eval.types import EvalResult
45
+
46
+
47
+ class YourMetric(BaseMetric):
48
+ def __init__(self):
49
+ super().__init__(name="your_metric")
50
+
51
+ def compute(
52
+ self,
53
+ predictions: list[str],
54
+ references: list[str],
55
+ ) -> EvalResult:
56
+ # ここに実装を書く / your implementation here
57
+ return EvalResult(metric_name=self.name, score=0.0)
58
+ ```
59
+
60
+ ---
61
+
62
+ ## コミットメッセージの形式 / Commit Message Format
63
+
64
+ | プレフィックス / Prefix | 用途 / Usage |
65
+ |------------------------|-------------|
66
+ | `feat:` | 新機能 / New feature |
67
+ | `fix:` | バグ修正 / Bug fix |
68
+ | `test:` | テスト追加 / Adding tests |
69
+ | `docs:` | ドキュメント更新 / Documentation |
70
+ | `ci:` | CI/CD関連 / CI/CD changes |
71
+ | `chore:` | メンテナンス / Maintenance |
72
+
73
+ ---
74
+
75
+ ## コードスタイル / Code Style
76
+
77
+ このプロジェクトは `ruff` を使用しています。コミット前に必ず実行してください。
78
+
79
+ *This project uses `ruff` for linting. Run before committing:*
80
+
81
+ ```bash
82
+ ruff check src/ tests/
83
+ ```
84
+
85
+ ---
86
+
87
+ ## 質問・提案 / Questions & Suggestions
88
+
89
+ バグ報告や機能提案は [Issues](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/issues) からお気軽にどうぞ。
90
+
91
+ *For bug reports and feature requests, feel free to open an [Issue](https://github.com/swoswoyuu1156/llm-evaluation-toolkit/issues).*
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 swoswoyuu1156
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.