judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
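Taken together, the listing shows a near-complete restructuring of the SDK between 0.0.11 and 0.22.2: the judgeval.common package (tracer, logger, utils, exceptions), the judgment_client and run_evaluation modules, the dataset client under judgeval/data/datasets, and all local scorer implementations are removed, while new top-level api, tracer, trainer, prompt, dataset, and evaluation packages are added. As a rough orientation only, the sketch below lists the 0.22.2 import paths that the bundled README (shown next) actually confirms; everything else in the comments is a module path taken from this file list, whose public symbols are not visible in the diff.

```python
# 0.0.11 modules that no longer exist in 0.22.2 (per the file list above):
#   judgeval/common/tracer.py, judgeval/common/logger.py, judgeval/common/exceptions.py,
#   judgeval/judgment_client.py, judgeval/run_evaluation.py,
#   judgeval/data/datasets/*, judgeval/scorers/judgeval_scorers/local_implementations/*

# 0.22.2 imports confirmed by the README packaged in METADATA below:
from judgeval.tracer import Tracer, wrap                    # tracing moved out of judgeval.common
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer          # API-backed scorer
from judgeval.scorers.example_scorer import ExampleScorer   # base class for custom scorers

# New 0.22.2 packages whose APIs are not shown anywhere in this diff; module paths only:
#   judgeval.api, judgeval.dataset, judgeval.evaluation, judgeval.trainer,
#   judgeval.prompt, judgeval.exceptions, judgeval.cli
```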
judgeval-0.22.2.dist-info/METADATA ADDED
@@ -0,0 +1,265 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.22.2
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Requires-Dist: boto3>=1.40.11
+Requires-Dist: click<8.2.0
+Requires-Dist: dotenv
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: litellm>=1.75.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+Requires-Dist: opentelemetry-sdk>=1.36.0
+Requires-Dist: orjson>=3.9.0
+Requires-Dist: typer>=0.9.0
+Provides-Extra: s3
+Requires-Dist: boto3>=1.40.11; extra == 's3'
+Provides-Extra: trainer
+Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<a href="https://judgmentlabs.ai/">
+<picture>
+<source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
+<img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
+</picture>
+</a>
+
+<br>
+
+## Agent Behavior Monitoring (ABM)
+
+Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
+
+[](https://docs.judgmentlabs.ai/documentation)
+[](https://app.judgmentlabs.ai/register)
+[](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+
+[](https://x.com/JudgmentLabs)
+[](https://www.linkedin.com/company/judgmentlabs)
+
+</div>
+
+
+</table>
+
+## [NEW] 🎆 Agent Reinforcement Learning
+
+Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
+
+Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
+
+```python
+await trainer.train(
+    agent_function=your_agent_function, # entry point to your agent
+    scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
+    prompts=training_prompts # Tasks
+)
+```
+
+**That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
+
+👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
+
+
+You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
+
+
+## Judgeval Overview
+
+Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
+
+Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
+
+## 📚 Cookbooks
+
+| Try Out | Notebook | Description |
+|:---------|:-----|:------------|
+| RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
+| Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
+| Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
+| Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
+
+You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
+
+You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
+
+## Why Judgeval?
+
+🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
+
+⚙️ **Custom Evaluators**: No restriction to only monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate to our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
+
+🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
+
+📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
+<!-- Add link to Bucketing docs once we have it -->
+<!--
+TODO: Once we have trainer code docs, plug in here
+-->
+
+🧪 **Run experiments on your agents**: Compare test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
+
+<!--
+Use this once we have AI PM features:
+
+**Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
+
+-->
+
+## 🛠️ Quickstart
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
+
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
+
+**If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
+
+### Start monitoring with Judgeval
+
+```python
+from judgeval.tracer import Tracer, wrap
+from judgeval.data import Example
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI()) # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"
+
+@judgment.observe(span_type="function")
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": task}]
+    )
+
+    judgment.async_evaluate( # trigger online monitoring
+        scorer=AnswerRelevancyScorer(threshold=0.5), # swap with any scorer
+        example=Example(input=task, actual_output=response), # customize to your data
+        model="gpt-5",
+    )
+    return response.choices[0].message.content
+
+run_agent("What is the capital of the United States?")
+```
+
+Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
+
+
+
+
+### Customizable Scorers Over Agent Behavior
+
+Judgeval's strongest suit is the full customization over the types of scorers you can run online monitoring with. No restrictions to only single-prompt LLM judges or prefab scorers - if you can express your scorer
+in python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
+
+
+First, create a behavior scorer in a file called `helpfulness_scorer.py`:
+
+```python
+from judgeval.data import Example
+from judgeval.scorers.example_scorer import ExampleScorer
+
+# Define custom example class
+class QuestionAnswer(Example):
+    question: str
+    answer: str
+
+# Define a server-hosted custom scorer
+class HelpfulnessScorer(ExampleScorer):
+    name: str = "Helpfulness Scorer"
+    server_hosted: bool = True # Enable server hosting
+    async def a_score_example(self, example: QuestionAnswer):
+        # Custom scoring logic for agent behavior
+        # Can be an arbitrary combination of code and LLM calls
+        if len(example.answer) > 10 and "?" not in example.answer:
+            self.reason = "Answer is detailed and provides helpful information"
+            return 1.0
+        else:
+            self.reason = "Answer is too brief or unclear"
+            return 0.0
+```
+
+Then deploy your scorer to Judgment's infrastructure:
+
+```bash
+echo "pydantic" > requirements.txt
+uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
+```
+
+Now you can instrument your agent with monitoring and online evaluation:
+
+```python
+from judgeval.tracer import Tracer, wrap
+from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
+from openai import OpenAI
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI()) # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_task(question: str) -> str: # replace with your prompt engineering
+    return f"Please answer the following question: {question}"
+
+@judgment.observe(span_type="tool")
+def answer_question(prompt: str) -> str: # replace with your LLM system calls
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return response.choices[0].message.content
+
+@judgment.observe(span_type="function")
+def run_agent(question: str) -> str:
+    task = format_task(question)
+    answer = answer_question(task)
+
+    # Add online evaluation with server-hosted scorer
+    judgment.async_evaluate(
+        scorer=HelpfulnessScorer(),
+        example=QuestionAnswer(question=question, answer=answer),
+        sampling_rate=0.9 # Evaluate 90% of agent runs
+    )
+
+    return answer
+
+if __name__ == "__main__":
+    result = run_agent("What is the capital of the United States?")
+    print(result)
+```
+
+Congratulations! Your online eval result should look like this:
+
+
+
+You can now run any online scorer in a secure Firecracker microVMs with no latency impact on your applications.
+
+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
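The custom-scorer example in the README keeps all of its logic inside a_score_example, so it can be smoke-tested locally with plain asyncio before being uploaded with the CLI. This is a minimal sketch, assuming the helpfulness_scorer.py module from the README above is importable and that ExampleScorer carries the reason attribute the example assigns:

```python
import asyncio

# Classes defined in the README's helpfulness_scorer.py example above
from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer


async def main() -> None:
    scorer = HelpfulnessScorer()
    example = QuestionAnswer(
        question="What is the capital of the United States?",
        answer="The capital of the United States is Washington, D.C.",
    )
    # a_score_example returns 1.0 or 0.0 and records an explanation on scorer.reason
    score = await scorer.a_score_example(example)
    print(score, scorer.reason)


if __name__ == "__main__":
    asyncio.run(main())
```

This only exercises the scoring logic itself; server-hosted online evaluation still requires the upload_scorer step shown in the README.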
judgeval-0.22.2.dist-info/RECORD ADDED
@@ -0,0 +1,112 @@
+judgeval/__init__.py,sha256=RRiBbXUj7M1VW3NqFvMZlXyI72duh3VA5bfIWqPmKNw,6670
+judgeval/cli.py,sha256=T9nKO9eHMOiLCgxaxuihqtRHsG_dMT06sW6X873MmnI,2209
+judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
+judgeval/env.py,sha256=uFggNNKmfDaa5dmZMwwXVIDdHAHe524jDWUpByV4hm4,1879
+judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
+judgeval/logger.py,sha256=VP5blbsJ53mvJbNHfBf5p2KrARUrkrErpPkB-__Hh3U,1562
+judgeval/version.py,sha256=j1d7CQ2JT0bsK7bGd5vCKR0rT4ebA9YYUF2-5heFZd8,74
+judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
+judgeval/api/__init__.py,sha256=dGZm9KtgLMnmbiyDEJ_D7suuVqmsibR_Cd0YZRJ7qHI,15210
+judgeval/api/api_types.py,sha256=PJ5ZQWuvCl5GXFzhcpOw6Iuktr50lo5BaILmZcAKWfc,10085
+judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
+judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
+judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
+judgeval/data/judgment_types.py,sha256=7RsrB2FvnsRRtnqMMfQzAMMn9oNvA076hbE2tmzKNXc,18874
+judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
+judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
+judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
+judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
+judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
+judgeval/dataset/__init__.py,sha256=s7HuBH_TQOLZ1arqaY2QRiSp-4mI_fF_9OykK_1QbsI,8858
+judgeval/evaluation/__init__.py,sha256=e9H4h73MINpcBlBYpkXiUaoCdWxnzvaYK0Ob0awY-kM,13064
+judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
+judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
+judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
+judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
+judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
+judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
+judgeval/prompt/__init__.py,sha256=Qgrd8u4WniaOjbRAoEFEeMnTmaqIGx5ZGX_U85iqhs0,11010
+judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
+judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
+judgeval/scorers/api_scorer.py,sha256=jPBQUBs_T3Xq33QoIbIXDzUaXinz56qeDfo96dfdX0g,2036
+judgeval/scorers/base_scorer.py,sha256=hsMuqdW8QtW5n9JzruXyaZC7im2K2sSmz1RDkbMisJ4,2702
+judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
+judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
+judgeval/scorers/score.py,sha256=xquM59SCtNeuAsrBsHFgBQk3CHp4-bms4oFs24xfcU0,7176
+judgeval/scorers/utils.py,sha256=dDxPKVjKa1lsMXNhZ8-aJFG3qk1usAH1JnKeC3vBQbU,304
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFyWdr1Wc8dh-aQ1nrK-mbd9W0MT4VyzLT5CbJ2-Q,450
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=WhSkPs8tWyQ_cS-y-VTzrKAPlizKp-6zi_DmfgW4AgM,10773
+judgeval/tracer/__init__.py,sha256=5OR0mxzrsWkh-tkT53WzrwtZ1EBIidx-rYXeO5nuWLc,39621
+judgeval/tracer/constants.py,sha256=tLR5ClDaNlNg_MAv2XRdk62uQW4KyBnWaNbG_YYblTc,55
+judgeval/tracer/keys.py,sha256=mYBo_X6-rC9xfiI-WpjHlO7rUtcMORtQXCQyO1F3Ycc,2387
+judgeval/tracer/managers.py,sha256=JiUjX_evToxcuogKVcE6qpJkSvYOxAXCU4_z_hWXJOw,5199
+judgeval/tracer/utils.py,sha256=xWha5iwC733wCf2HKbNqzxOPS1ovO1OymWIUFLz-UpQ,537
+judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwovZxUKJXg,1295
+judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
+judgeval/tracer/exporters/store.py,sha256=pA_KINcm0amO0WEDYmMFU05SSsMOgJ5ogIRaevSX1sk,1885
+judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
+judgeval/tracer/llm/__init__.py,sha256=ENxApieKSktYrIviofXWP9GU0WnhBm0Q9mlGe_m_gMY,139
+judgeval/tracer/llm/config.py,sha256=J8-bTL82bgDqdTJPN-Px3Epvoa9FG7L-X329kitwBTc,2525
+judgeval/tracer/llm/constants.py,sha256=IWa3CMes8wIt_UG7jrGEOztg2sHz54fdOMWIOOr-dz8,172
+judgeval/tracer/llm/providers.py,sha256=VAimkmChOOjhC1cUv-0iG8pa5PhOw1HIOyt3zrIrbcM,628
+judgeval/tracer/llm/llm_anthropic/__init__.py,sha256=HG0gIlTgaRt-Y0u1ERPQ19pUgb4YHkTh7tZQPeyR4oM,80
+judgeval/tracer/llm/llm_anthropic/config.py,sha256=ICfKODPQvZsRxpK4xWQ-YE79pmWJTmY2wryddxpNdpM,153
+judgeval/tracer/llm/llm_anthropic/messages.py,sha256=T7dApxJCsOWEpquYSZICACwTioZG3ZcxHdJjvF04T2E,15474
+judgeval/tracer/llm/llm_anthropic/messages_stream.py,sha256=DKlZZnfK_yv_tEMwF2XxvsjgUjOFI3c5JUMQwERNV7k,12188
+judgeval/tracer/llm/llm_anthropic/wrapper.py,sha256=JILcyC4NvjXZSqlFoZp-VB-JsCYZkQPMFEYaB4AysrA,1849
+judgeval/tracer/llm/llm_google/__init__.py,sha256=otBZETsAfVZjtZaN5N36Ln0kw-I9jVB4tFGrV6novHo,74
+judgeval/tracer/llm/llm_google/config.py,sha256=S3yCAE9oHbXjLVYiz5mGD16yIgXMBBUu5UN4lBjoCNQ,162
+judgeval/tracer/llm/llm_google/generate_content.py,sha256=w1rIh1cTBYnkfBQTL4qHntwsKfBcSrf2VSS2y-BOMRU,4030
+judgeval/tracer/llm/llm_google/wrapper.py,sha256=jqaMXGoM9dlPBbCFadMI5EqFrNHzBt0h9VkNn7KPVLk,901
+judgeval/tracer/llm/llm_openai/__init__.py,sha256=CyzwhY0-zmqWKlEno7JPBcvO7G_hI8dp6-_5_KEzFqg,74
+judgeval/tracer/llm/llm_openai/beta_chat_completions.py,sha256=IXw-Gu-WUxQ-gaBUIe-aAKOn1Pakn_RFl0b1C_1toP8,7326
+judgeval/tracer/llm/llm_openai/chat_completions.py,sha256=U086NgaaLFiyvAYrgJncC-obaaSbG2r_3ehquNlVTDQ,17637
+judgeval/tracer/llm/llm_openai/config.py,sha256=NE0ixKhd4WVeAVjY8jNTncuKYH6R4MQDLPmcCsd3zWY,144
+judgeval/tracer/llm/llm_openai/responses.py,sha256=CCGYz35gn3jJOYE2anyR49OR2XhSDwy3dEsISbzMO8Q,18137
+judgeval/tracer/llm/llm_openai/utils.py,sha256=fpy9war8dyke25qHxGW2Yo028RA4Siq0RBLA4G63yUw,1480
+judgeval/tracer/llm/llm_openai/wrapper.py,sha256=Z5Ndib228yd1pXEQ4xIu7_CJHxpW_t0ofZAC6FLc5eU,2055
+judgeval/tracer/llm/llm_together/__init__.py,sha256=MEnsF77IgFD4h73hNCMpo-9a1PHHdm-OxPlOalXOMac,78
+judgeval/tracer/llm/llm_together/chat_completions.py,sha256=RySsK3tqG0NpJHPlVQ705bXxIfseSQUhvIoS-sz4rOg,14380
+judgeval/tracer/llm/llm_together/config.py,sha256=jCJY0KQcHJZZJk2vq038GKIDUMusqgvRjQ0B6OV5uEc,150
+judgeval/tracer/llm/llm_together/wrapper.py,sha256=HFqy_MabQeSq8oj2diZhEuk1SDt_hDfk5MFdPn9MFhg,1733
+judgeval/tracer/processors/__init__.py,sha256=BdOOPOD1RfMI5YHW76DNPKR07EAev-JxoolZ3KaXNNU,7100
+judgeval/trainer/__init__.py,sha256=nJo913vFdss3E_PR-M1OUjznS0SYgNZ-MP-Y_6Mj5PA,437
+judgeval/trainer/base_trainer.py,sha256=Lxm6OxJpifonLKofNIRG3TU7n_jZWQZ0I_f_jwtb_WU,4018
+judgeval/trainer/config.py,sha256=7ZSwr6p7vq0MRadh9axm6XB-RAotdWqULZ5yDl0xGbQ,4340
+judgeval/trainer/console.py,sha256=SvokkFEU-K1vLV4Rd1m6YJJ7HyYwTr4Azdzwx_JPZUY,4351
+judgeval/trainer/fireworks_trainer.py,sha256=_B-fWovdhIpxh1RbXU0W5BlFGc9ZzuYtFw7CBtKTRO8,16074
+judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
+judgeval/trainer/trainer.py,sha256=twLEHNaomelTg6ZYG6veI9OpB3wzhPCtPVQMTnDZWx4,2626
+judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
+judgeval/utils/file_utils.py,sha256=vq-n5WZEZjVbZ5S9QTkW8nSH6Pvw-Jx0ttsQ1t0wnPQ,3140
+judgeval/utils/guards.py,sha256=_DaKZxvjD10J97Ze2paHhbCiV2MpDz3FZQmNwaL5k0w,945
+judgeval/utils/meta.py,sha256=RAqZuvOlymqMwFoS0joBW_r65lcN9bY8BpNYHoytKps,773
+judgeval/utils/project.py,sha256=kGpYmp6QGTD6h-GjQ-ovT7kBmGnyb99MWDJmRGFQHOg,527
+judgeval/utils/serialize.py,sha256=WbforbVFGINuk68T2YtWhj-ECMC6rWol3g5dxz9nsm8,6265
+judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
+judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
+judgeval/utils/version_check.py,sha256=se4Ft8rjcl5u7fHMxSGQpka844V2AcZpOYl6StLWTio,1081
+judgeval/utils/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/utils/decorators/dont_throw.py,sha256=Q4xlx6RUnQdNjKM0A_X0FEeLBF_71rMKnKdRbVnX82o,989
+judgeval/utils/decorators/use_once.py,sha256=8mgj5VK9v08VOOWX2Bstc0CezNsOVUKMIv7N2R83E8s,288
+judgeval/utils/wrappers/README.md,sha256=-Jyagu6NPH92ty8pTMbzRLVJZzufULrjxcyohXgsGMc,76
+judgeval/utils/wrappers/__init__.py,sha256=iAcpjCOkYqoe6z2utrS_3yZLmdQPD1Y64MMefai8h0Y,546
+judgeval/utils/wrappers/immutable_wrap_async.py,sha256=a0LWyEa235tPpfjN3W0A516_GWDL13uhumMVzsMpgW4,1909
+judgeval/utils/wrappers/immutable_wrap_async_iterator.py,sha256=hfVxBoFE6m6I0g0KcSLJXyfVv6pfZuoJuHyLJHtLjjg,2268
+judgeval/utils/wrappers/immutable_wrap_sync.py,sha256=_gOUaPK4Le-pifWCZOH4lDvY-cLfYoC0fy7DTNIG-0A,1823
+judgeval/utils/wrappers/immutable_wrap_sync_iterator.py,sha256=aDC4HpLp4l9A3aFLS0cTCkien-xGgQRU04F7P1pJ6w8,2229
+judgeval/utils/wrappers/mutable_wrap_async.py,sha256=stHISOUCGFUJXY8seXmxUo4ZpMF4LErSBIz0HlWR7Bo,2941
+judgeval/utils/wrappers/mutable_wrap_sync.py,sha256=t5jygAQ1vqhy8s1GfiLeYygYgaLTgfoYASN47U5JiPs,2888
+judgeval/utils/wrappers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/utils/wrappers/utils.py,sha256=j18vaa6JWDw2s3nQy1z5PfV_9Xxio-bVARaHG_0XyL0,1228
+judgeval-0.22.2.dist-info/METADATA,sha256=9F5AvYGpPCC9BQQYj3-4UQ1jVR1mc06L3nDMfYaH_Uw,11483
+judgeval-0.22.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.22.2.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.22.2.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.22.2.dist-info/RECORD,,
judgeval/clients.py DELETED
@@ -1,39 +0,0 @@
-import os
-from dotenv import load_dotenv
-from openai import OpenAI
-from langfuse import Langfuse
-from typing import Optional
-from together import Together, AsyncTogether
-
-PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
-load_dotenv(dotenv_path=PATH_TO_DOTENV)
-
-# Initialize required clients
-langfuse = Langfuse(
-    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-    host=os.getenv("LANGFUSE_HOST"),
-)
-
-# Initialize optional OpenAI client
-client: Optional['OpenAI'] = None
-if os.getenv("OPENAI_API_KEY"):
-    try:
-        from openai import OpenAI
-        client = OpenAI()
-    except ImportError:
-        # openai package not installed
-        pass
-
-# Initialize optional Together clients
-together_client: Optional['Together'] = None
-async_together_client: Optional['AsyncTogether'] = None
-
-# Only initialize Together clients if API key is available
-if os.getenv("TOGETHERAI_API_KEY"):
-    try:
-        together_client = Together(api_key=os.getenv("TOGETHERAI_API_KEY"))
-        async_together_client = AsyncTogether(api_key=os.getenv("TOGETHERAI_API_KEY"))
-    except Exception:
-        pass
-
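Because this module built its clients at import time, 0.0.11 exposed client, together_client, and async_together_client as module-level globals that were None whenever the corresponding API keys were missing, and callers had to guard for that. A minimal sketch of that calling pattern against the deleted module (the model name is a placeholder); 0.22.2 drops the module-level registry and instead wraps provider clients at the call site with wrap(...), as the new README shows:

```python
# 0.0.11-era usage of the deleted judgeval.clients module
from judgeval.clients import client  # Optional[OpenAI]; None unless OPENAI_API_KEY was set at import

if client is not None:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Hello"}],
    )
    print(completion.choices[0].message.content)
else:
    print("OPENAI_API_KEY not set; judgeval.clients.client was left as None")
```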
judgeval/common/__init__.py DELETED
@@ -1,8 +0,0 @@
-from judgeval.common.utils import (
-    get_chat_completion,
-    aget_chat_completion,
-    get_completion_multiple_models,
-    aget_completion_multiple_models
-)
-
-__all__ = ["get_chat_completion", "aget_chat_completion", "get_completion_multiple_models", "aget_completion_multiple_models"]
judgeval/common/exceptions.py DELETED
@@ -1,28 +0,0 @@
-"""
-Common Exceptions in Judgeval
-"""
-
-
-class MissingTestCaseParamsError(Exception):
-    pass
-
-
-class JudgmentAPIError(Exception):
-    """
-    Exception raised when an error occurs while executing a Judgment API request
-    """
-
-    def __init__(self, message: str):
-        super().__init__(message)
-        self.message = message
-
-
-class InvalidJudgeModelError(Exception):
-    """
-    Exception raised when an invalid judge model is provided
-    """
-
-    def __init__(self, message: str):
-        super().__init__(message)
-        self.message = message
-
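Each of these exception types simply stored the message it was raised with, so handling them came down to catching the class and reading .message. A minimal sketch against the deleted module; 0.22.2 introduces a separate judgeval/exceptions.py whose contents are not included in this diff:

```python
# 0.0.11-era error handling around a judgeval operation
from judgeval.common.exceptions import JudgmentAPIError, InvalidJudgeModelError


def run_safely(operation):
    """Run a callable that may raise judgeval's API or judge-model errors."""
    try:
        return operation()
    except JudgmentAPIError as e:
        print(f"Judgment API request failed: {e.message}")
    except InvalidJudgeModelError as e:
        print(f"Invalid judge model: {e.message}")
```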
judgeval/common/logger.py DELETED
@@ -1,189 +0,0 @@
-import logging
-from logging.handlers import RotatingFileHandler
-import sys
-from pathlib import Path
-from datetime import datetime
-from contextlib import contextmanager
-
-# Global variables
-logger = None
-class LoggingState:
-    enabled = False
-    path = None
-
-LOGGING_STATE = LoggingState()
-
-# Add these as module-level variables
-current_example_id = None
-current_timestamp = None
-
-
-@contextmanager
-def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int = 1024 * 1024, backup_count: int = 5):
-    """
-    Context manager to temporarily enable logging for a specific block of code.
-    """
-    global logger
-    LOGGING_STATE.enabled = True
-    LOGGING_STATE.path = path
-    # Initialize logger if not already initialized
-    if logger is None:
-        logger = _initialize_logger(name=name, path=path, max_bytes=max_bytes, backup_count=backup_count)
-    try:
-        logger.info("Logging enabled")
-        yield
-    finally:
-        logger.info("Logging disabled")
-        LOGGING_STATE.enabled = False
-        LOGGING_STATE.path = None
-
-def _initialize_logger(
-    name: str = "judgeval",
-    max_bytes: int = 1024 * 1024, # 1MB
-    backup_count: int = 5,
-    path: str = "./logs" # Added path parameter with default
-) -> logging.Logger:
-    """
-    Initialize the global logger instance if it doesn't exist.
-    Returns the global logger instance.
-    """
-    global logger
-
-    log_dir = Path(path)
-    log_dir.mkdir(exist_ok=True, parents=True)
-    log_file = log_dir / f"{name}.log"
-    if log_file.exists():
-        log_file.unlink() # Delete existing log file
-
-    if logger is not None:
-        return logger
-
-    # Create logs directory if it doesn't exist
-    log_dir = Path(path)
-    log_dir.mkdir(exist_ok=True)
-
-    # Create formatter
-    formatter = logging.Formatter(
-        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-
-    # Create a custom formatter that includes example info when available
-    class ExampleFormatter(logging.Formatter):
-        def format(self, record):
-            if current_example_id is not None and current_timestamp is not None:
-                record.example_id = current_example_id
-                record.timestamp = current_timestamp
-                return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
-                                         datefmt='%Y-%m-%d %H:%M:%S').format(record)
-            return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-                                     datefmt='%Y-%m-%d %H:%M:%S').format(record)
-
-    # Use the custom formatter
-    console_handler = logging.StreamHandler(sys.stdout)
-    console_handler.setFormatter(ExampleFormatter())
-    console_handler.setLevel(logging.DEBUG)
-
-    log_filename = f"{name}.log"
-    file_handler = RotatingFileHandler(
-        log_dir / log_filename,
-        maxBytes=max_bytes,
-        backupCount=backup_count,
-        mode='a'
-    )
-    file_handler.setFormatter(ExampleFormatter())
-    file_handler.setLevel(logging.DEBUG)
-
-    # Get logger
-    logger = logging.getLogger(name)
-    logger.setLevel(logging.DEBUG)
-
-    # Prevent adding handlers multiple times
-    if not logger.handlers:
-        logger.addHandler(console_handler)
-        logger.addHandler(file_handler)
-
-    return logger
-
-# Initialize the global logger when module is imported
-# logger = _initialize_logger()
-
-def log_if_enabled(func):
-    """Decorator to check if logging is enabled before executing logging statements"""
-    def wrapper(*args, **kwargs):
-        if LOGGING_STATE.enabled:
-            return func(*args, **kwargs)
-    return wrapper
-
-@log_if_enabled
-def debug(msg: str, example_idx: int = None):
-    """Log debug message if logging is enabled"""
-    logger.debug(msg)
-
-@log_if_enabled
-def info(msg: str, example_idx: int = None):
-    """Log info message if logging is enabled"""
-    logger.info(msg)
-
-@log_if_enabled
-def warning(msg: str, example_idx: int = None):
-    """Log warning message if logging is enabled"""
-    logger.warning(msg)
-
-@log_if_enabled
-def error(msg: str, example_idx: int = None):
-    """Log error message if logging is enabled"""
-    logger.error(msg)
-
-def create_example_handler(
-    timestamp: str,
-    example_idx: int,
-    path: str = "./logs" # Added path parameter with default
-) -> RotatingFileHandler:
-    """Creates a file handler for a specific example"""
-    debug(f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}")
-    log_dir = Path(path) / "examples"
-    log_dir.mkdir(exist_ok=True, parents=True)
-
-    formatter = logging.Formatter(
-        fmt='%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-
-    # Create a unique file for each example
-    file_handler = RotatingFileHandler(
-        log_dir / f"{timestamp}_example_{example_idx}.log",
-        maxBytes=1024 * 1024, # 1MB
-        backupCount=5,
-        mode='a'
-    )
-    file_handler.setFormatter(formatter)
-    file_handler.setLevel(logging.DEBUG)
-    info(f"Created example handler for example {example_idx}")
-    return file_handler
-
-@contextmanager
-def example_logging_context(timestamp: str, example_idx: int):
-    """Context manager for example-specific logging"""
-    if not LOGGING_STATE.enabled:
-        yield
-        return
-
-    global current_example_id, current_timestamp
-
-    debug(f"Entering example logging context for example {example_idx}")
-    current_example_id = example_idx
-    current_timestamp = timestamp
-
-    handler = create_example_handler(timestamp, example_idx, path=LOGGING_STATE.path)
-    if handler:
-        logger.addHandler(handler)
-    try:
-        yield
-    finally:
-        current_example_id = None
-        current_timestamp = None
-        if handler:
-            logger.removeHandler(handler)
-            handler.close()
-            debug(f"Closed example handler for example {example_idx}")