meta-evaluator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. meta_evaluator-0.1.0/.env.example +53 -0
  2. meta_evaluator-0.1.0/.github/workflows/ci.yml +32 -0
  3. meta_evaluator-0.1.0/.github/workflows/publish.yml +24 -0
  4. meta_evaluator-0.1.0/.gitignore +37 -0
  5. meta_evaluator-0.1.0/.pre-commit-config.yaml +27 -0
  6. meta_evaluator-0.1.0/.python-version +1 -0
  7. meta_evaluator-0.1.0/CLAUDE.md +94 -0
  8. meta_evaluator-0.1.0/LICENSE +21 -0
  9. meta_evaluator-0.1.0/PKG-INFO +349 -0
  10. meta_evaluator-0.1.0/README.md +304 -0
  11. meta_evaluator-0.1.0/docker/Dockerfile +21 -0
  12. meta_evaluator-0.1.0/docker/Dockerfile.airbase +25 -0
  13. meta_evaluator-0.1.0/docker/docker-compose.yml +36 -0
  14. meta_evaluator-0.1.0/docs/annotation_guide/annotation.md +216 -0
  15. meta_evaluator-0.1.0/docs/annotation_guide/deployment.md +167 -0
  16. meta_evaluator-0.1.0/docs/assets/example_accuracy_scores.png +0 -0
  17. meta_evaluator-0.1.0/docs/assets/example_aggregate_winning_rates.png +0 -0
  18. meta_evaluator-0.1.0/docs/assets/example_score_report_console.png +0 -0
  19. meta_evaluator-0.1.0/docs/assets/example_score_report_html.png +0 -0
  20. meta_evaluator-0.1.0/docs/assets/simple_annotation_interface.png +0 -0
  21. meta_evaluator-0.1.0/docs/assets/simple_annotation_interface_deprecated.png +0 -0
  22. meta_evaluator-0.1.0/docs/assets/simple_annotation_interface_mobile.jpg +0 -0
  23. meta_evaluator-0.1.0/docs/guides/base.md +104 -0
  24. meta_evaluator-0.1.0/docs/guides/evaldata.md +151 -0
  25. meta_evaluator-0.1.0/docs/guides/evaltask.md +367 -0
  26. meta_evaluator-0.1.0/docs/guides/judges_load.md +320 -0
  27. meta_evaluator-0.1.0/docs/guides/judges_run.md +201 -0
  28. meta_evaluator-0.1.0/docs/guides/results.md +189 -0
  29. meta_evaluator-0.1.0/docs/guides/scoring.md +789 -0
  30. meta_evaluator-0.1.0/docs/index.md +42 -0
  31. meta_evaluator-0.1.0/docs/stylesheets/extra.css +37 -0
  32. meta_evaluator-0.1.0/docs/tutorial.md +354 -0
  33. meta_evaluator-0.1.0/examples/rabakbench/run_evaluation.py +148 -0
  34. meta_evaluator-0.1.0/examples/rabakbench/run_human_annotation.py +83 -0
  35. meta_evaluator-0.1.0/examples/rejection/data/sample_rejection.csv +235 -0
  36. meta_evaluator-0.1.0/examples/rejection/judges.yaml +50 -0
  37. meta_evaluator-0.1.0/examples/rejection/logs/.gitkeep +0 -0
  38. meta_evaluator-0.1.0/examples/rejection/project_dir/.gitkeep +0 -0
  39. meta_evaluator-0.1.0/examples/rejection/prompt.md +23 -0
  40. meta_evaluator-0.1.0/examples/rejection/run_evaluation.py +152 -0
  41. meta_evaluator-0.1.0/examples/rejection/run_evaluation_sync.py +141 -0
  42. meta_evaluator-0.1.0/examples/rejection/run_human_annotation.py +67 -0
  43. meta_evaluator-0.1.0/examples/rejection/run_scoring_only.py +284 -0
  44. meta_evaluator-0.1.0/mkdocs.yml +68 -0
  45. meta_evaluator-0.1.0/pyproject.toml +86 -0
  46. meta_evaluator-0.1.0/src/meta_evaluator/__init__.py +19 -0
  47. meta_evaluator-0.1.0/src/meta_evaluator/annotator/exceptions.py +60 -0
  48. meta_evaluator-0.1.0/src/meta_evaluator/annotator/interface/__init__.py +7 -0
  49. meta_evaluator-0.1.0/src/meta_evaluator/annotator/interface/streamlit_app.py +815 -0
  50. meta_evaluator-0.1.0/src/meta_evaluator/annotator/interface/streamlit_session_manager.py +316 -0
  51. meta_evaluator-0.1.0/src/meta_evaluator/annotator/launcher/__init__.py +7 -0
  52. meta_evaluator-0.1.0/src/meta_evaluator/annotator/launcher/entry_point.py +26 -0
  53. meta_evaluator-0.1.0/src/meta_evaluator/annotator/launcher/streamlit_launcher.py +248 -0
  54. meta_evaluator-0.1.0/src/meta_evaluator/common/__init__.py +1 -0
  55. meta_evaluator-0.1.0/src/meta_evaluator/common/async_utils.py +32 -0
  56. meta_evaluator-0.1.0/src/meta_evaluator/common/error_constants.py +12 -0
  57. meta_evaluator-0.1.0/src/meta_evaluator/common/models.py +26 -0
  58. meta_evaluator-0.1.0/src/meta_evaluator/data/__init__.py +20 -0
  59. meta_evaluator-0.1.0/src/meta_evaluator/data/dataloader.py +219 -0
  60. meta_evaluator-0.1.0/src/meta_evaluator/data/eval_data.py +913 -0
  61. meta_evaluator-0.1.0/src/meta_evaluator/data/exceptions.py +177 -0
  62. meta_evaluator-0.1.0/src/meta_evaluator/data/serialization.py +26 -0
  63. meta_evaluator-0.1.0/src/meta_evaluator/eval_task/__init__.py +8 -0
  64. meta_evaluator-0.1.0/src/meta_evaluator/eval_task/eval_task.py +299 -0
  65. meta_evaluator-0.1.0/src/meta_evaluator/eval_task/exceptions.py +28 -0
  66. meta_evaluator-0.1.0/src/meta_evaluator/eval_task/serialization.py +20 -0
  67. meta_evaluator-0.1.0/src/meta_evaluator/judge/__init__.py +5 -0
  68. meta_evaluator-0.1.0/src/meta_evaluator/judge/async_evaluator.py +716 -0
  69. meta_evaluator-0.1.0/src/meta_evaluator/judge/enums.py +34 -0
  70. meta_evaluator-0.1.0/src/meta_evaluator/judge/exceptions.py +129 -0
  71. meta_evaluator-0.1.0/src/meta_evaluator/judge/judge.py +646 -0
  72. meta_evaluator-0.1.0/src/meta_evaluator/judge/models.py +304 -0
  73. meta_evaluator-0.1.0/src/meta_evaluator/judge/serialization.py +22 -0
  74. meta_evaluator-0.1.0/src/meta_evaluator/judge/sync_evaluator.py +667 -0
  75. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/__init__.py +5 -0
  76. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/base.py +712 -0
  77. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/exceptions.py +345 -0
  78. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/judge.py +1067 -0
  79. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/scoring.py +1134 -0
  80. meta_evaluator-0.1.0/src/meta_evaluator/meta_evaluator/serialization.py +20 -0
  81. meta_evaluator-0.1.0/src/meta_evaluator/results/__init__.py +41 -0
  82. meta_evaluator-0.1.0/src/meta_evaluator/results/base.py +582 -0
  83. meta_evaluator-0.1.0/src/meta_evaluator/results/enums.py +29 -0
  84. meta_evaluator-0.1.0/src/meta_evaluator/results/exceptions.py +134 -0
  85. meta_evaluator-0.1.0/src/meta_evaluator/results/human_results.py +344 -0
  86. meta_evaluator-0.1.0/src/meta_evaluator/results/judge_results.py +617 -0
  87. meta_evaluator-0.1.0/src/meta_evaluator/results/models.py +188 -0
  88. meta_evaluator-0.1.0/src/meta_evaluator/results/serialization.py +50 -0
  89. meta_evaluator-0.1.0/src/meta_evaluator/scores/__init__.py +24 -0
  90. meta_evaluator-0.1.0/src/meta_evaluator/scores/base_scorer.py +149 -0
  91. meta_evaluator-0.1.0/src/meta_evaluator/scores/base_scoring_result.py +68 -0
  92. meta_evaluator-0.1.0/src/meta_evaluator/scores/enums.py +16 -0
  93. meta_evaluator-0.1.0/src/meta_evaluator/scores/exceptions.py +52 -0
  94. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/__init__.py +15 -0
  95. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/agreement/alt_test.py +729 -0
  96. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/agreement/iaa.py +164 -0
  97. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/classification/classification_scorer.py +276 -0
  98. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/text_comparison/semantic_similarity.py +336 -0
  99. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics/text_comparison/text_similarity.py +193 -0
  100. meta_evaluator-0.1.0/src/meta_evaluator/scores/metrics_config.py +136 -0
  101. meta_evaluator-0.1.0/src/meta_evaluator/scores/utils.py +93 -0
  102. meta_evaluator-0.1.0/src/meta_evaluator/scores_reporting/__init__.py +5 -0
  103. meta_evaluator-0.1.0/src/meta_evaluator/scores_reporting/score_report.py +231 -0
  104. meta_evaluator-0.1.0/temp +0 -0
  105. meta_evaluator-0.1.0/tests/__init__.py +1 -0
  106. meta_evaluator-0.1.0/tests/annotator/conftest.py +405 -0
  107. meta_evaluator-0.1.0/tests/annotator/interface/test_streamlit_app.py +1555 -0
  108. meta_evaluator-0.1.0/tests/annotator/interface/test_streamlit_session_manager.py +602 -0
  109. meta_evaluator-0.1.0/tests/annotator/launcher/test_streamlit_launcher.py +350 -0
  110. meta_evaluator-0.1.0/tests/annotator/test_annotator_integration.py +315 -0
  111. meta_evaluator-0.1.0/tests/annotator/test_human_results.py +666 -0
  112. meta_evaluator-0.1.0/tests/conftest.py +296 -0
  113. meta_evaluator-0.1.0/tests/data/__init__.py +1 -0
  114. meta_evaluator-0.1.0/tests/data/conftest.py +404 -0
  115. meta_evaluator-0.1.0/tests/data/test_dataloader.py +269 -0
  116. meta_evaluator-0.1.0/tests/data/test_eval_data.py +564 -0
  117. meta_evaluator-0.1.0/tests/data/test_sample_eval_data.py +611 -0
  118. meta_evaluator-0.1.0/tests/judge/__init__.py +1 -0
  119. meta_evaluator-0.1.0/tests/judge/conftest.py +520 -0
  120. meta_evaluator-0.1.0/tests/judge/test_judge.py +510 -0
  121. meta_evaluator-0.1.0/tests/judge/test_judge_async_evaluation.py +858 -0
  122. meta_evaluator-0.1.0/tests/judge/test_judge_results.py +535 -0
  123. meta_evaluator-0.1.0/tests/judge/test_judge_sync_evaluation.py +822 -0
  124. meta_evaluator-0.1.0/tests/meta_evaluator/__init__.py +1 -0
  125. meta_evaluator-0.1.0/tests/meta_evaluator/conftest.py +992 -0
  126. meta_evaluator-0.1.0/tests/meta_evaluator/test_metaevaluator.py +1238 -0
  127. meta_evaluator-0.1.0/tests/meta_evaluator/test_metaevaluator_integration.py +583 -0
  128. meta_evaluator-0.1.0/tests/meta_evaluator/test_metaevaluator_judges.py +1782 -0
  129. meta_evaluator-0.1.0/tests/meta_evaluator/test_metaevaluator_scoring.py +1358 -0
  130. meta_evaluator-0.1.0/tests/scores/__init__.py +1 -0
  131. meta_evaluator-0.1.0/tests/scores/conftest.py +644 -0
  132. meta_evaluator-0.1.0/tests/scores/test_agreement.py +342 -0
  133. meta_evaluator-0.1.0/tests/scores/test_classification.py +97 -0
  134. meta_evaluator-0.1.0/tests/scores/test_custom_scorer.py +177 -0
  135. meta_evaluator-0.1.0/tests/scores/test_metrics_config.py +153 -0
  136. meta_evaluator-0.1.0/tests/scores/test_text_comparison.py +110 -0
  137. meta_evaluator-0.1.0/tests/scores_reporting/test_score_report.py +168 -0
  138. meta_evaluator-0.1.0/uv.lock +2025 -0
@@ -0,0 +1,53 @@
1
+ # Environment Variables for LLM Providers
2
+ # All variables are optional - only set the ones for providers you plan to use
3
+ # For detailed setup instructions, see: https://docs.litellm.ai/docs/providers
4
+
5
+ # ================================================================================
6
+ # OpenAI
7
+ # ================================================================================
8
+ # Required for OpenAI models (gpt-4, gpt-3.5-turbo, etc.)
9
+ export OPENAI_API_KEY=sk-your-openai-api-key-here
10
+
11
+ # ================================================================================
12
+ # Azure OpenAI
13
+ # ================================================================================
14
+ # Required for Azure OpenAI models
15
+ export AZURE_API_BASE=https://your-resource-name.openai.azure.com/
16
+ export AZURE_API_KEY=your-azure-api-key-here
17
+ export AZURE_API_VERSION=2024-02-15-preview
18
+
19
+ # ================================================================================
20
+ # Anthropic (Claude)
21
+ # ================================================================================
22
+ # Required for Claude models (claude-3-opus, claude-3-sonnet, etc.)
23
+ export ANTHROPIC_API_KEY=sk-ant-your-anthropic-api-key-here
24
+ export ANTHROPIC_DEFAULT_MODEL=claude-3-sonnet
25
+
26
+ # ================================================================================
27
+ # AWS Bedrock
28
+ # ================================================================================
29
+ # Standard AWS credentials for Bedrock models
30
+ export AWS_ACCESS_KEY_ID=your-aws-access-key-id
31
+ export AWS_SECRET_ACCESS_KEY=your-aws-secret-access-key
32
+ export AWS_SESSION_TOKEN=your-aws-session-token
33
+
34
+ # Alternative: Bearer token for Bedrock
35
+ export AWS_BEARER_TOKEN_BEDROCK=your-bedrock-bearer-token
36
+
37
+ # ================================================================================
38
+ # Other Providers
39
+ # ================================================================================
40
+ # Hugging Face models
41
+ export HF_TOKEN=hf_your-hugging-face-token
42
+
43
+ # OpenRouter (provides access to multiple models)
44
+ export OPENROUTER_API_KEY=sk-or-your-openrouter-key
45
+
46
+ # xAI (Grok models)
47
+ export XAI_API_KEY=xai-your-api-key-here
48
+
49
+ # Groq (fast inference)
50
+ export GROQ_API_KEY=gsk_your-groq-api-key
51
+
52
+ # Fireworks AI
53
+ export FIREWORKS_AI_API_KEY=your-fireworks-api-key
@@ -0,0 +1,32 @@
1
+ # .github/workflows/ci.yml
2
+ name: Code Quality
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ pre-commit:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ # 1 — Check out the code
11
+ - uses: actions/checkout@v4
12
+
13
+ # 2 — Install uv, but turn off its built-in cache
14
+ - uses: astral-sh/setup-uv@v3
15
+ with:
16
+ enable-cache: false # we will manage caching explicitly
17
+
18
+ # 3 — Restore (or create) a cache keyed to lock-file + pyproject
19
+ - name: Restore uv cache
20
+ uses: actions/cache@v4
21
+ with:
22
+ path: ~/.cache/uv # default uv cache dir
23
+ key: uv-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }}
24
+
25
+ # 4 — Install all declared dependencies every run
26
+ - name: Sync dependencies
27
+ run: uv sync --group dev --extra all
28
+
29
+ # 5 — Run the same hooks you use locally
30
+ - name: Run pre-commit hooks
31
+ run: uv run pre-commit run --all-files
32
+
@@ -0,0 +1,24 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ name: Build and publish to PyPI
10
+ runs-on: ubuntu-latest
11
+ environment: pypi
12
+ permissions:
13
+ id-token: write
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v4
19
+
20
+ - name: Build package
21
+ run: uv build
22
+
23
+ - name: Publish to PyPI
24
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,37 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .env
12
+ node_modules/
13
+ package-lock.json
14
+ plan.md
15
+
16
+ # OS generated files
17
+ .DS_Store
18
+
19
+ # MkDocs build output
20
+ site/
21
+
22
+ # Data directories
23
+ */examples/demo_annotations/
24
+
25
+ # Ignore all examples except tracked ones
26
+ examples/*
27
+ !examples/rejection/
28
+ !examples/rabakbench/run_evaluation.py
29
+ !examples/rabakbench/run_human_annotation.py
30
+
31
+ # Project directories - ignore contents but keep structure
32
+ **/project_dir/*
33
+ !**/project_dir/.gitkeep
34
+
35
+ # Logs
36
+ **/logs/*
37
+ !**/logs/.gitkeep
@@ -0,0 +1,27 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: ruff-check
5
+ name: ruff check
6
+ entry: uv run ruff check --preview --fix
7
+ language: system
8
+ types: [python]
9
+ pass_filenames: false
10
+ - id: ruff-format
11
+ name: ruff format
12
+ entry: uv run ruff format
13
+ language: system
14
+ types: [python]
15
+ pass_filenames: false
16
+ - id: pyright
17
+ name: pyright
18
+ entry: uv run pyright
19
+ language: system
20
+ types: [python]
21
+ pass_filenames: false
22
+ - id: pytest
23
+ name: pytest
24
+ entry: uv run pytest -k "not integration"
25
+ language: system
26
+ types: [python]
27
+ pass_filenames: false
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,94 @@
1
+ # Claude Development Notes
2
+
3
+ ## App Development Guide
4
+
5
+ This is a Python application to develop a MetaEvaluator.
6
+ Given an evaluation task and dataset, the MetaEvaluator gathers results from LLM Judges and Human annotators, and calculates alignment metrics to measure the performance of different LLM-as-a-Judge systems.
7
+
8
+ ## Logging
9
+
10
+ For each main class, initialize a logger object with the main class name, and utilize the same logger through the class methods.
11
+
12
+ ## Error Handling
13
+
14
+ Always use custom exceptions. Define an exceptions file in each main functionality group, and define all custom exceptions within the file.
15
+ Before implementing specific error messaging within each exception class, refer to the common utility (meta_evaluator.common.error_constants.py) and utilize existing error messages where applicable.
16
+
17
+ ## Linting and Code Quality
18
+
19
+ After every task, you must run both:
20
+ ```bash
21
+ uv tool run ruff check --preview --fix
22
+ uv tool run ruff format .
23
+ ```
24
+
25
+ Always fix all errors from the ruff check. If there are unfixable errors, document them and ask for guidance.
26
+
27
+ ## Type Checking
28
+
29
+ Streamlit is an optional dependency (`ui` extra). To ensure pyright can resolve all imports, sync with extras before type checking:
30
+ ```bash
31
+ uv sync --extra all
32
+ ```
33
+
34
+ After every task, run type checking:
35
+ ```bash
36
+ uv run pyright
37
+ ```
38
+
39
+ If there are type errors that cannot be resolved, document them and ask for guidance before proceeding.
40
+
41
+ ## Testing
42
+
43
+ Always look for existing pytest fixtures in the corresponding conftest.py files before implementing new fixtures.
44
+ All pytest fixtures should be defined in the corresponding conftest.py files with clear documentation.
45
+
46
+ To run tests, always use:
47
+ ```bash
48
+ uv run pytest
49
+ ```
50
+
51
+ You can use regular pytest options after `uv run pytest`, for example:
52
+ ```bash
53
+ uv run pytest tests/specific_test.py
54
+ uv run pytest -v
55
+ uv run pytest -k "test_name"
56
+ ```
57
+
58
+ After every task, run tests excluding integration tests (integration tests require external services and are slower):
59
+ ```bash
60
+ uv run pytest -m "not integration"
61
+ ```
62
+
63
+ ## Task Completion Workflow
64
+
65
+ At the end of every task, run in order:
66
+
67
+ 1. **Linting and formatting:**
68
+ ```bash
69
+ uv tool run ruff check --preview --fix
70
+ uv tool run ruff format .
71
+ ```
72
+
73
+ 2. **Type checking:**
74
+ ```bash
75
+ uv run pyright
76
+ ```
77
+
78
+ 3. **Testing:**
79
+ ```bash
80
+ uv run pytest -m "not integration"
81
+ ```
82
+
83
+ Ensure your last command is always `uv tool run ruff format .`
84
+
85
+ ## Packaging
86
+
87
+ - The package version is defined in `src/meta_evaluator/__init__.py` (`__version__`) and read by hatchling via `[tool.hatch.version]`.
88
+ - Optional dependency groups: `docs` (mkdocs), `ui` (streamlit), `all` (everything). Core deps include matplotlib.
89
+ - Publishing is automated via `.github/workflows/publish.yml` when a GitHub release is published, using trusted publishing (OIDC).
90
+
91
+ ## Additional Reminders (VERY IMPORTANT!)
92
+
93
+ - If you have any questions or doubts about the instructions, ask me before you begin.
94
+ - Do NOT make any assumptions about what the user is asking. ALWAYS CLARIFY.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GovTech Singapore AI Practice - Responsible AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,349 @@
1
+ Metadata-Version: 2.4
2
+ Name: meta-evaluator
3
+ Version: 0.1.0
4
+ Summary: Evaluate LLM-as-a-Judge systems by measuring alignment between judge outputs and human annotations
5
+ Project-URL: Homepage, https://github.com/govtech-responsibleai/meta-evaluator
6
+ Project-URL: Repository, https://github.com/govtech-responsibleai/meta-evaluator
7
+ Project-URL: Issues, https://github.com/govtech-responsibleai/meta-evaluator/issues
8
+ Author: aipractice
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: alignment,evaluation,judge,llm,meta-evaluation,nlp
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.13
20
+ Requires-Dist: beartype>=0.21.0
21
+ Requires-Dist: boto3>=1.40.10
22
+ Requires-Dist: botocore>=1.40.10
23
+ Requires-Dist: google-auth>=2.40.3
24
+ Requires-Dist: instructor>=1.8.3
25
+ Requires-Dist: litellm>=1.65.1
26
+ Requires-Dist: matplotlib>=3.5.0
27
+ Requires-Dist: openai>=1.82.1
28
+ Requires-Dist: polars>=1.30.0
29
+ Requires-Dist: pydantic>=2.11.5
30
+ Requires-Dist: python-dotenv>=1.1.0
31
+ Requires-Dist: pyyaml>=6.0.0
32
+ Requires-Dist: scikit-learn>=1.5.0
33
+ Requires-Dist: scipy>=1.9.0
34
+ Requires-Dist: tqdm>=4.66.0
35
+ Provides-Extra: all
36
+ Requires-Dist: mkdocs-material>=9.6.17; extra == 'all'
37
+ Requires-Dist: mkdocs>=1.6.1; extra == 'all'
38
+ Requires-Dist: streamlit>=1.50.0; extra == 'all'
39
+ Provides-Extra: docs
40
+ Requires-Dist: mkdocs-material>=9.6.17; extra == 'docs'
41
+ Requires-Dist: mkdocs>=1.6.1; extra == 'docs'
42
+ Provides-Extra: ui
43
+ Requires-Dist: streamlit>=1.50.0; extra == 'ui'
44
+ Description-Content-Type: text/markdown
45
+
46
+ # MetaEvaluator
47
+
48
+ Evaluate LLM-as-a-Judge systems by measuring alignment between judge outputs and human annotations.
49
+
50
+ ## Overview
51
+
52
+ MetaEvaluator helps you assess LLM judges by:
53
+ - 🤖 **Running multiple judges** (OpenAI, Anthropic, Google, AWS, etc.) using **LiteLLM integration**
54
+ - 👥 **Collecting human annotations** through a built-in Streamlit interface
55
+ - 📊 **Computing alignment metrics** (Accuracy, Cohen's Kappa, Alt-Test, text/semantic similarity) and **generating comprehensive reports** with visualizations and statistical analysis
56
+
57
+ ## Installation
58
+
59
+ 1. **Install the package:**
60
+ ```bash
61
+ # Requires Python 3.13+
62
+ pip install meta-evaluator
63
+ ```
64
+
65
+ **Optional dependencies:**
66
+ ```bash
67
+ pip install meta-evaluator[ui] # streamlit for human annotation interface
68
+ pip install meta-evaluator[docs] # mkdocs for documentation
69
+ pip install meta-evaluator[all] # all optional dependencies
70
+ ```
71
+
72
+ Or install directly from GitHub:
73
+ ```bash
74
+ pip install git+https://github.com/govtech-responsibleai/meta-evaluator
75
+ ```
76
+
77
+ 2. **Set up environment variables:**
78
+ You can either:
79
+ - Copy the [.env.example](https://github.com/govtech-responsibleai/meta-evaluator/blob/main/.env.example) file from the GitHub repo, replace with your API keys, and use `dotenv.load_dotenv()` in your script
80
+ - Set the environment variables directly in your shell
81
+
82
+ See [LiteLLM providers documentation](https://docs.litellm.ai/docs/providers) for all supported providers.
83
+
84
+ 3. **(Optional) For developers: clone the repository and set up dev tools:**
85
+ ```bash
86
+ git clone https://github.com/govtech-responsibleai/meta-evaluator
87
+ cd meta-evaluator
88
+ uv sync
89
+ uv run pre-commit install
90
+ ```
91
+
92
+ ## Getting Started
93
+
94
+ See our [**Tutorial**](docs/tutorial.md) for a complete walkthrough, or check out the full example at: [`examples/rejection/run_evaluation.py`](examples/rejection/run_evaluation.py)
95
+ The sections below provide an overview of the main components.
96
+
97
+ ### 1. Initialize MetaEvaluator
98
+ Start by creating a MetaEvaluator instance:
99
+
100
+ ```python
101
+ from meta_evaluator import MetaEvaluator
102
+
103
+ # Create new project
104
+ evaluator = MetaEvaluator(project_dir="my_project")
105
+ ```
106
+
107
+ ### 2. Load Data
108
+ Load your evaluation datasets from CSV, JSON, or Parquet files:
109
+
110
+ ```python
111
+ from meta_evaluator.data import DataLoader
112
+
113
+ data = DataLoader.load_csv(
114
+ name="evaluation_data",
115
+ file_path="data/samples.csv"
116
+ )
117
+ evaluator.add_data(data)
118
+ ```
119
+
120
+ ### 3. Define Task
121
+ Define what and how to evaluate using EvalTask:
122
+
123
+ ```python
124
+ from meta_evaluator.eval_task import EvalTask
125
+
126
+ task = EvalTask(
127
+ task_schemas={
128
+ "rejection": ["rejection", "not rejection"], # Classification (required by default)
129
+ "explanation": None, # Free-form text (not required by default)
130
+ },
131
+ # required_tasks not specified - only classification tasks required by default
132
+ prompt_columns=["prompt"], # Context columns
133
+ response_columns=["llm_response"], # What to evaluate
134
+ answering_method="structured", # JSON output parsing
135
+ structured_outputs_fallback=True # Fallback support
136
+ )
137
+ evaluator.add_eval_task(task)
138
+ ```
139
+
140
+ ### 4. Collect Human Annotations
141
+ Collect human ground truth using the built-in Streamlit interface:
142
+
143
+ ```python
144
+ # Launch annotation interface
145
+ evaluator.launch_annotator(port=8501)
146
+ ```
147
+
148
+ For deployment options including remote access (ngrok) and Docker deployment for classified data, see the [Deployment Guide](docs/annotation_guide/deployment.md).
149
+
150
+ ### 5. Judge Configuration & Prompt Templates
151
+ Configure multiple LLM judges using YAML and template-based prompts:
152
+
153
+ ```yaml
154
+ judges:
155
+ - id: gpt_4_judge
156
+ llm_client: openai
157
+ model: gpt-4o-mini
158
+ prompt_file: ./prompt.md # Filepath relative to YAML file
159
+ temperature: 0.0 # Optional: sampling temperature
160
+
161
+ - id: claude_judge
162
+ llm_client: anthropic
163
+ model: claude-3-5-haiku-latest
164
+ prompt_file: ./prompt.md # Filepath relative to YAML file
165
+
166
+ - id: hf_judge
167
+ llm_client: huggingface/together
168
+ model: meta-llama/Llama-3.3-70B-Instruct
169
+ prompt_file: ./prompt.md # Filepath relative to YAML file
170
+ extra_headers: # Optional: e.g. bill to a HuggingFace organisation
171
+ X-HF-Bill-To: your-org-name
172
+ ```
173
+
174
+ **Prompt Template System**: MetaEvaluator uses a template-based system where you can define placeholders in your `prompt.md` files using curly braces (`{variable_name}`). These variables are automatically substituted with columns found in your dataset during evaluation.
175
+
176
+ ```markdown
177
+ ## Instructions:
178
+ Evaluate whether the given response is a rejection.
179
+
180
+ ## To Evaluate:
181
+ Prompt: {prompt}
182
+ Response: {llm_response}
183
+ ```
184
+
185
+ ### 6. Run LLM Judges
186
+ Load and run your configured judges:
187
+
188
+ ```python
189
+ # Load judges and run evaluation
190
+ evaluator.load_judges_from_yaml("judges.yaml", async_mode=True)
191
+ evaluator.run_judges_async(
192
+ skip_duplicates=True,
193
+ consistency=1, # Optional: run each judge N times and aggregate results
194
+ )
195
+ ```
196
+
197
+ ### 7. Scoring
198
+ Compare judge performance against human annotations:
199
+
200
+ ```python
201
+ # Configure metrics
202
+ from meta_evaluator.scores import MetricConfig, MetricsConfig
203
+ from meta_evaluator.scores.metrics import (
204
+ ClassificationScorer, CohensKappaScorer, SemanticSimilarityScorer
205
+ )
206
+
207
+ config = MetricsConfig(
208
+ metrics=[
209
+ MetricConfig(
210
+ scorer=ClassificationScorer("accuracy"),
211
+ task_names=["rejection"],
212
+ task_strategy="single",
213
+ annotator_aggregation="majority_vote" # Use consensus approach
214
+ ),
215
+ MetricConfig(
216
+ scorer=SemanticSimilarityScorer(), # This metric requires OPENAI_API_KEY
217
+ task_names=["explanation"],
218
+ task_strategy="single",
219
+ annotator_aggregation="individual_average" # Individual averaging
220
+ ),
221
+ ]
222
+ )
223
+
224
+ # Add metrics configuration and run comparison
225
+ evaluator.add_metrics_config(config) # Creates evaluator.score_report automatically
226
+ evaluator.compare_async()
227
+
228
+ # Generate summary report
229
+ evaluator.score_report.save("score_report.html", format="html") # Save HTML report
230
+ evaluator.score_report.save("score_report.csv", format="csv") # Save CSV report
231
+ evaluator.score_report.print() # Print to console
232
+ ```
233
+
234
+ ## External Data Loading
235
+
236
+ MetaEvaluator supports loading pre-existing judge and human annotation results for scoring-only workflows. This is useful when you:
237
+ - Have results from previous evaluation runs
238
+ - Want to compute metrics on externally generated judge/human data
239
+ - Need to re-run scoring with different metrics without re-evaluating
240
+
241
+ ### Loading a Single Set of External Judge Results
242
+ ```python
243
+ # Load external judge results from CSV
244
+ evaluator.add_external_judge_results(
245
+ file_path="path/to/judge1_results.csv",
246
+ judge_id="external_judge_1",
247
+ llm_client="openai",
248
+ model_used="gpt-4",
249
+ run_id="external_run_1"
250
+ )
251
+ ```
252
+
253
+ **Required CSV columns for judge results:**
254
+ - `original_id`: Unique identifier for each sample
255
+ - Task columns matching your `EvalTask.task_schemas`
256
+
257
+ ### Loading a Single Set of External Annotation Results
258
+ ```python
259
+ # Load external human annotations from CSV
260
+
261
+ evaluator.add_external_annotation_results(
262
+ file_path="path/to/human_results_1.csv",
263
+ annotator_id="annotator_1",
264
+ run_id="human_run_1"
265
+ )
266
+ ```
267
+
268
+ **Required CSV columns for human results:**
269
+ - `original_id`: Unique identifier for each sample
270
+ - Task columns matching your `EvalTask.task_schemas`
271
+
272
+ For detailed data format requirements and examples, see the [Results Guide](docs/guides/results.md#external-data-loading).
273
+
274
+ ## Available Metrics
275
+
276
+ MetaEvaluator supports comprehensive alignment metrics for evaluating judge performance:
277
+
278
+ ### Classification Metrics
279
+ - **Accuracy/F1/Recall/Precision**: Classification metrics between judge and human labels
280
+ - **Cohen's Kappa**: Inter-rater agreement accounting for chance agreement
281
+ - **Alt-Test**: Statistical significance testing with leave-one-annotator-out methodology
282
+
283
+ ### Text Similarity Metrics
284
+ - **Text Similarity**: String-based similarity using sequence matching algorithms
285
+ - **Semantic Similarity**: OpenAI embedding-based semantic similarity (requires API key)
286
+
287
+ ### Custom Metrics
288
+ - **Custom Scorers**: Implement domain-specific metrics by extending `BaseScorer`
289
+
290
+ See [Scoring Guide](docs/guides/scoring.md) for detailed usage examples and configuration options.
291
+
292
+ ## Documentation
293
+
294
+ Comprehensive documentation is available in the `docs/` directory:
295
+
296
+ - **[Tutorial](docs/tutorial.md)** - Complete walkthrough
297
+ - **[Data Loading](docs/guides/evaldata.md)** - Load and manage evaluation datasets
298
+ - **[Task Definition](docs/guides/evaltask.md)** - Define evaluation schemas and parsing methods
299
+ - **[Judge Configuration](docs/guides/judges_load.md)** - Set up LLM judges with YAML
300
+ - **[Running Evaluations](docs/guides/judges_run.md)** - Execute judge evaluations
301
+ - **[Scoring & Metrics](docs/guides/scoring.md)** - Compute alignment metrics
302
+ - **[Human Annotations](docs/annotation_guide/annotation.md)** - Collect human ground truth
303
+ - **[Deployment Guide for Annotation Platform](docs/annotation_guide/deployment.md)** - Deployment options (local, ngrok, Docker)
304
+
305
+ ## Project Structure (automatically generated)
306
+
307
+ ```
308
+ project_dir/
309
+ ├── data/ # Serialized evaluation data
310
+ ├── results/ # Judge evaluation results
311
+ ├── annotations/ # Human annotation data
312
+ └── scores/ # Computed alignment metrics
313
+ ├── classification_accuracy/ # Detailed accuracy results
314
+ ├── cohens_kappa/ # Detailed kappa results
315
+ ├── alt_test/ # Detailed alt-test results
316
+ └── text_similarity/ # Detailed similarity results
317
+ ```
318
+
319
+ ## Examples
320
+
321
+ See the `examples/` directory for complete working examples:
322
+
323
+ ### Rejection Detection Evaluation
324
+ - **[`examples/rejection/run_evaluation.py`](examples/rejection/run_evaluation.py)** - Complete async evaluation with multiple metrics
325
+ - **[`examples/rejection/run_human_annotation.py`](examples/rejection/run_human_annotation.py)** - Launch human annotation interface
326
+ - **[`examples/rejection/data/sample_rejection.csv`](examples/rejection/data/sample_rejection.csv)** - Sample rejection detection dataset
327
+ - **[`examples/rejection/judges.yaml`](examples/rejection/judges.yaml)** - Judge configuration example
328
+ - **[`examples/rejection/prompt.md`](examples/rejection/prompt.md)** - Evaluation prompt template
329
+
330
+ ### Docker Templates
331
+ - **[`docker/Dockerfile`](docker/Dockerfile)** - Basic Dockerfile template
332
+ - **[`docker/docker-compose.yml`](docker/docker-compose.yml)** - Docker compose template
333
+
334
+ ### RabakBench Evaluation (data not included)
335
+ - **[`examples/rabakbench/run_evaluation.py`](examples/rabakbench/run_evaluation.py)** - Complete async evaluation with multiple metrics
336
+ - **[`examples/rabakbench/run_human_annotation.py`](examples/rabakbench/run_human_annotation.py)** - Launch human annotation interface
337
+
338
+ ### Scoring-Only Evaluation (load in external results)
339
+ - **[`examples/rejection/run_scoring_only.py`](examples/rejection/run_scoring_only.py)** - Load external judge/human results and run scoring without re-evaluation
340
+
341
+ ## Development Commands
342
+
343
+ **Requirements:** [uv](https://docs.astral.sh/uv/) package manager
344
+
345
+ - **Run linting:** `uv tool run ruff check --preview --fix`
346
+ - **Run formatting:** `uv tool run ruff format .`
347
+ - **Run type checking:** `uv run pyright`
348
+ - **Run tests:** `uv run pytest --skip-integration`
349
+