judgeval 0.0.41__py3-none-any.whl → 0.0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +3 -1
- judgeval/common/tracer.py +967 -111
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +5 -0
- judgeval/data/trace.py +2 -1
- judgeval/integrations/langgraph.py +218 -34
- judgeval/rules.py +60 -50
- judgeval/run_evaluation.py +36 -26
- judgeval/utils/alerts.py +8 -0
- {judgeval-0.0.41.dist-info → judgeval-0.0.43.dist-info}/METADATA +35 -46
- {judgeval-0.0.41.dist-info → judgeval-0.0.43.dist-info}/RECORD +13 -13
- {judgeval-0.0.41.dist-info → judgeval-0.0.43.dist-info}/WHEEL +0 -0
- {judgeval-0.0.41.dist-info → judgeval-0.0.43.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -100,9 +100,9 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
|
|
100
100
|
raise JudgmentAPIError(error_message)
|
101
101
|
return response_data
|
102
102
|
|
103
|
-
def execute_api_trace_eval(trace_run: TraceRun) ->
|
103
|
+
def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
|
104
104
|
"""
|
105
|
-
Executes an evaluation of a list of `
|
105
|
+
Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
|
106
106
|
"""
|
107
107
|
|
108
108
|
try:
|
@@ -146,46 +146,47 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
|
|
146
146
|
"""
|
147
147
|
# No merge required
|
148
148
|
if not local_results and api_results:
|
149
|
-
return api_results
|
149
|
+
return [result.model_copy() for result in api_results]
|
150
150
|
if not api_results and local_results:
|
151
|
-
return local_results
|
151
|
+
return [result.model_copy() for result in local_results]
|
152
152
|
|
153
153
|
if len(api_results) != len(local_results):
|
154
154
|
# Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
|
155
155
|
raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
|
156
156
|
|
157
|
+
# Create a copy of api_results to avoid modifying the input
|
158
|
+
merged_results = [result.model_copy() for result in api_results]
|
159
|
+
|
157
160
|
# Each ScoringResult in api and local have all the same fields besides `scorers_data`
|
158
|
-
for
|
159
|
-
if not (
|
161
|
+
for merged_result, local_result in zip(merged_results, local_results):
|
162
|
+
if not (merged_result.data_object and local_result.data_object):
|
160
163
|
raise ValueError("Data object is None in one of the results.")
|
161
|
-
if
|
164
|
+
if merged_result.data_object.input != local_result.data_object.input:
|
162
165
|
raise ValueError("The API and local results are not aligned.")
|
163
|
-
if
|
166
|
+
if merged_result.data_object.actual_output != local_result.data_object.actual_output:
|
164
167
|
raise ValueError("The API and local results are not aligned.")
|
165
|
-
if
|
168
|
+
if merged_result.data_object.expected_output != local_result.data_object.expected_output:
|
166
169
|
raise ValueError("The API and local results are not aligned.")
|
167
|
-
if
|
170
|
+
if merged_result.data_object.context != local_result.data_object.context:
|
168
171
|
raise ValueError("The API and local results are not aligned.")
|
169
|
-
if
|
172
|
+
if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
|
170
173
|
raise ValueError("The API and local results are not aligned.")
|
171
|
-
if
|
174
|
+
if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
|
172
175
|
raise ValueError("The API and local results are not aligned.")
|
173
|
-
if
|
176
|
+
if merged_result.data_object.tools_called != local_result.data_object.tools_called:
|
174
177
|
raise ValueError("The API and local results are not aligned.")
|
175
|
-
if
|
178
|
+
if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
|
176
179
|
raise ValueError("The API and local results are not aligned.")
|
177
180
|
|
178
|
-
|
179
181
|
# Merge ScorerData from the API and local scorers together
|
180
|
-
api_scorer_data =
|
182
|
+
api_scorer_data = merged_result.scorers_data
|
181
183
|
local_scorer_data = local_result.scorers_data
|
182
184
|
if api_scorer_data is None and local_scorer_data is not None:
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
api_result.scorers_data = api_scorer_data + local_scorer_data
|
185
|
+
merged_result.scorers_data = local_scorer_data
|
186
|
+
elif api_scorer_data is not None and local_scorer_data is not None:
|
187
|
+
merged_result.scorers_data = api_scorer_data + local_scorer_data
|
187
188
|
|
188
|
-
return
|
189
|
+
return merged_results
|
189
190
|
|
190
191
|
|
191
192
|
def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
|
@@ -405,8 +406,15 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
|
|
405
406
|
)
|
406
407
|
if function and tracer:
|
407
408
|
new_traces: List[Trace] = []
|
408
|
-
|
409
|
-
tracer
|
409
|
+
|
410
|
+
# Handle case where tracer is actually a callback handler
|
411
|
+
actual_tracer = tracer
|
412
|
+
if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
|
413
|
+
# This is a callback handler, get the underlying tracer
|
414
|
+
actual_tracer = tracer.tracer
|
415
|
+
|
416
|
+
actual_tracer.offline_mode = True
|
417
|
+
actual_tracer.traces = []
|
410
418
|
for example in examples:
|
411
419
|
if example.input:
|
412
420
|
if isinstance(example.input, str):
|
@@ -417,19 +425,21 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
|
|
417
425
|
raise ValueError(f"Input must be string or dict, got {type(example.input)}")
|
418
426
|
else:
|
419
427
|
result = run_with_spinner("Running agent function: ", function)
|
420
|
-
|
428
|
+
|
429
|
+
|
430
|
+
for i, trace in enumerate(actual_tracer.traces):
|
421
431
|
# We set the root-level trace span with the expected tools of the Trace
|
422
432
|
trace = Trace(**trace)
|
423
433
|
trace.trace_spans[0].expected_tools = examples[i].expected_tools
|
424
434
|
new_traces.append(trace)
|
425
435
|
trace_run.traces = new_traces
|
426
|
-
|
436
|
+
actual_tracer.traces = []
|
427
437
|
|
428
438
|
# Execute evaluation using Judgment API
|
429
439
|
info("Starting API evaluation")
|
430
440
|
try: # execute an EvaluationRun with just JudgmentScorers
|
431
441
|
debug("Sending request to Judgment API")
|
432
|
-
response_data:
|
442
|
+
response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
|
433
443
|
scoring_results = [ScoringResult(**result) for result in response_data["results"]]
|
434
444
|
info(f"Received {len(scoring_results)} results from API")
|
435
445
|
except JudgmentAPIError as e:
|
judgeval/utils/alerts.py
CHANGED
@@ -20,12 +20,20 @@ class AlertResult(BaseModel):
|
|
20
20
|
status: Status of the alert (triggered or not)
|
21
21
|
conditions_result: List of condition evaluation results
|
22
22
|
metadata: Dictionary containing example_id, timestamp, and other metadata
|
23
|
+
notification: Optional notification configuration for triggered alerts
|
24
|
+
combine_type: The combination type used ("all" or "any")
|
25
|
+
project_id: Optional project identifier
|
26
|
+
trace_span_id: Optional trace span identifier
|
23
27
|
"""
|
24
28
|
rule_name: str
|
25
29
|
rule_id: Optional[str] = None # The unique identifier of the rule
|
26
30
|
status: AlertStatus
|
27
31
|
conditions_result: List[Dict[str, Any]] = []
|
28
32
|
metadata: Dict[str, Any] = {}
|
33
|
+
notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
|
34
|
+
combine_type: Optional[str] = None # "all" or "any"
|
35
|
+
project_id: Optional[str] = None # Project identifier
|
36
|
+
trace_span_id: Optional[str] = None # Trace span identifier
|
29
37
|
|
30
38
|
@property
|
31
39
|
def example_id(self) -> Optional[str]:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: judgeval
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.43
|
4
4
|
Summary: Judgeval Package
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
|
|
18
18
|
Requires-Dist: langchain-huggingface
|
19
19
|
Requires-Dist: langchain-openai
|
20
20
|
Requires-Dist: litellm==1.61.15
|
21
|
+
Requires-Dist: matplotlib>=3.10.3
|
21
22
|
Requires-Dist: nest-asyncio
|
22
23
|
Requires-Dist: openai
|
23
24
|
Requires-Dist: pandas
|
@@ -31,37 +32,37 @@ Description-Content-Type: text/markdown
|
|
31
32
|
<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
|
32
33
|
<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
|
33
34
|
|
34
|
-
|
35
|
+
<br>
|
36
|
+
<div style="font-size: 1.5em;">
|
37
|
+
Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
|
38
|
+
</div>
|
35
39
|
|
36
|
-
|
40
|
+
## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
|
37
41
|
|
38
|
-
|
42
|
+
[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
|
39
43
|
|
40
|
-
|
44
|
+
We're hiring! Join us in our mission to unleash optimized agents.
|
41
45
|
|
42
46
|
[](https://x.com/JudgmentLabs)
|
43
47
|
[](https://www.linkedin.com/company/judgmentlabs)
|
44
48
|
[](https://discord.gg/ZCnSXYug)
|
45
49
|
|
46
|
-
|
50
|
+
<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
|
47
51
|
|
48
|
-
|
52
|
+
</div>
|
49
53
|
|
50
|
-
Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
|
51
54
|
|
52
|
-
Judgeval
|
55
|
+
Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
|
53
56
|
|
54
|
-
|
57
|
+
Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
|
58
|
+
> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
|
55
59
|
|
56
60
|
Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
|
57
61
|
|
58
62
|
## 📋 Table of Contents
|
59
|
-
- [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
|
60
|
-
- [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
|
61
|
-
- [📋 Table of Contents](#-table-of-contents)
|
62
63
|
- [✨ Features](#-features)
|
63
64
|
- [🛠️ Installation](#️-installation)
|
64
|
-
- [🏁
|
65
|
+
- [🏁 Quickstarts](#-quickstarts)
|
65
66
|
- [🛰️ Tracing](#️-tracing)
|
66
67
|
- [📝 Offline Evaluations](#-offline-evaluations)
|
67
68
|
- [📡 Online Evaluations](#-online-evaluations)
|
@@ -69,12 +70,6 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
|
|
69
70
|
- [Key Features](#key-features)
|
70
71
|
- [Getting Started](#getting-started)
|
71
72
|
- [📚 Cookbooks](#-cookbooks)
|
72
|
-
- [Sample Agents](#sample-agents)
|
73
|
-
- [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
|
74
|
-
- [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
|
75
|
-
- [Custom Evaluators](#custom-evaluators)
|
76
|
-
- [🔍 PII Detection](#-pii-detection)
|
77
|
-
- [📧 Cold Email Generation](#-cold-email-generation)
|
78
73
|
- [💻 Development with Cursor](#-development-with-cursor)
|
79
74
|
- [⭐ Star Us on GitHub](#-star-us-on-github)
|
80
75
|
- [❤️ Contributors](#️-contributors)
|
@@ -86,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
|
|
86
81
|
|
87
82
|
| | |
|
88
83
|
|:---|:---:|
|
89
|
-
| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
|
90
|
-
| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>
|
91
|
-
| <h3>📡 Monitoring</h3>
|
92
|
-
| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets
|
93
|
-
| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
|
84
|
+
| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
|
85
|
+
| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
|
86
|
+
| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
|
87
|
+
| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
|
94
88
|
|
95
89
|
## 🛠️ Installation
|
96
90
|
|
@@ -100,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
|
|
100
94
|
pip install judgeval
|
101
95
|
```
|
102
96
|
|
103
|
-
Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
|
97
|
+
Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
|
104
98
|
|
105
|
-
|
99
|
+
```bash
|
100
|
+
export JUDGMENT_API_KEY=...
|
101
|
+
export JUDGMENT_ORG_ID=...
|
102
|
+
```
|
106
103
|
|
107
|
-
|
104
|
+
**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
|
108
105
|
|
109
|
-
|
106
|
+
## 🏁 Quickstarts
|
110
107
|
|
111
108
|
### 🛰️ Tracing
|
112
109
|
|
113
|
-
Track your agent execution with full observability with just a few lines of code.
|
114
110
|
Create a file named `traces.py` with the following code:
|
115
111
|
|
116
112
|
```python
|
@@ -135,12 +131,15 @@ def main():
|
|
135
131
|
|
136
132
|
main()
|
137
133
|
```
|
134
|
+
You'll see your trace exported to the Judgment Platform:
|
135
|
+
|
136
|
+
<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
|
137
|
+
|
138
138
|
|
139
139
|
[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
|
140
140
|
|
141
141
|
### 📝 Offline Evaluations
|
142
142
|
|
143
|
-
You can evaluate your agent's execution to measure quality metrics such as hallucination.
|
144
143
|
Create a file named `evaluate.py` with the following code:
|
145
144
|
|
146
145
|
```python evaluate.py
|
@@ -156,7 +155,7 @@ example = Example(
|
|
156
155
|
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
|
157
156
|
)
|
158
157
|
|
159
|
-
scorer = FaithfulnessScorer(threshold=0.5)
|
158
|
+
scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
|
160
159
|
results = client.run_evaluation(
|
161
160
|
examples=[example],
|
162
161
|
scorers=[scorer],
|
@@ -205,6 +204,8 @@ def main():
|
|
205
204
|
main()
|
206
205
|
```
|
207
206
|
|
207
|
+
You should see an evaluation attached to your trace on the Judgment Platform.
|
208
|
+
|
208
209
|
[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
|
209
210
|
|
210
211
|
## 🏢 Self-Hosting
|
@@ -229,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
|
|
229
230
|
|
230
231
|
### Sample Agents
|
231
232
|
|
232
|
-
####
|
233
|
-
A
|
234
|
-
|
235
|
-
#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
|
236
|
-
A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
|
237
|
-
|
238
|
-
### Custom Evaluators
|
239
|
-
|
240
|
-
#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
|
241
|
-
Detecting and evaluating Personal Identifiable Information (PII) leakage.
|
242
|
-
|
243
|
-
#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
|
244
|
-
|
245
|
-
Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
|
233
|
+
#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
|
234
|
+
A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
|
246
235
|
|
247
236
|
## 💻 Development with Cursor
|
248
237
|
When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
|
@@ -1,29 +1,29 @@
|
|
1
1
|
judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
|
2
2
|
judgeval/clients.py,sha256=EiTmvvWksTPyWIuMC9jz06SPY2vFzokIJUIGoScpisA,989
|
3
|
-
judgeval/constants.py,sha256=
|
3
|
+
judgeval/constants.py,sha256=MmkgNXdwQOyYSVJc_I8EjX12OWZdFEzjaqXduRowuU4,6033
|
4
4
|
judgeval/evaluation_run.py,sha256=KNGtaGAwD18pDNOKF7PCMlLnQe9SpRLTs0XWFMrCiLc,6684
|
5
5
|
judgeval/judgment_client.py,sha256=JO3AkU-disPHQVK5g1SM-bs_EUSy8QZ3AaAj_Q2ag6s,24968
|
6
|
-
judgeval/rules.py,sha256=
|
7
|
-
judgeval/run_evaluation.py,sha256=
|
6
|
+
judgeval/rules.py,sha256=LLojqmiKzQ90jAczccfaOoc3b9LBJCWX0hZ7p439no8,21110
|
7
|
+
judgeval/run_evaluation.py,sha256=JI-BCyEVKW61JJ4qxFMk1ww4tams-1g_0aaCE4cHrU8,50252
|
8
8
|
judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
|
9
9
|
judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
|
10
10
|
judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
|
11
11
|
judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
|
12
|
-
judgeval/common/s3_storage.py,sha256=
|
13
|
-
judgeval/common/tracer.py,sha256=
|
14
|
-
judgeval/common/utils.py,sha256=
|
12
|
+
judgeval/common/s3_storage.py,sha256=UZZzQ8CP9_42SKDoKpPncJx8CL5Dchh4jFlKxDKi-cs,3938
|
13
|
+
judgeval/common/tracer.py,sha256=jbXtgBgrfGH-zxW6Kf4VDpq8ot-yb0ggaC5isCQFGvw,128882
|
14
|
+
judgeval/common/utils.py,sha256=l2nvm3-LeeScZ02H9TB2AcJh1gJSK1lNdi1Tu0p_fNQ,34276
|
15
15
|
judgeval/data/__init__.py,sha256=GX_GloDtBB35mv3INWbSTP2r9cwCU2IeIYjzRT0SAd8,530
|
16
16
|
judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
|
17
17
|
judgeval/data/example.py,sha256=jcK78ff-TKNl9Qtxvbd1g61crpo-s4fWHaqyMIbQNq0,6877
|
18
18
|
judgeval/data/result.py,sha256=KfU9lhAKG_Xo2eGDm2uKVVRZpf177IDASg1cIwedJwE,3184
|
19
19
|
judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
|
20
20
|
judgeval/data/tool.py,sha256=eEEvGDNNYWhcQiI6cjDv3rO1VoOJJS5LWGS76Gb_gtY,1813
|
21
|
-
judgeval/data/trace.py,sha256=
|
21
|
+
judgeval/data/trace.py,sha256=5HSJbCMvNTF4O8D_364dGv2cs-0oa4rCQcYR_hS5FG4,4881
|
22
22
|
judgeval/data/trace_run.py,sha256=fiB5Z5il9U9XqvksdA2DbLNd96U_Wrz8K00RuFJBy38,2324
|
23
23
|
judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
|
24
24
|
judgeval/data/datasets/dataset.py,sha256=pq9-A1mg2Brpjg1TufDU_eLo9sQhX0nw-UTGaf3jCXA,12952
|
25
25
|
judgeval/data/datasets/eval_dataset_client.py,sha256=LJ1bf1sZAC4ZBCRTQ1Y4VrJuNSslYBQ1y9YKuhYxwqY,15176
|
26
|
-
judgeval/integrations/langgraph.py,sha256=
|
26
|
+
judgeval/integrations/langgraph.py,sha256=Ogk3MFE116WfRV4w_2c6mp3d27Uea7vmLstltML8VBM,31963
|
27
27
|
judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
|
28
28
|
judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
|
29
29
|
judgeval/judges/litellm_judge.py,sha256=DhB6px9ELZL3gbMb2w4FkBliuTlaCVIcjE8v149G6NM,2425
|
@@ -60,9 +60,9 @@ judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne
|
|
60
60
|
judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
|
61
61
|
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=O9xq2Cxcg16pFNZwHTb_MDJ5ehFab6oDiiNtC47AnY4,2584
|
62
62
|
judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
|
63
|
-
judgeval/utils/alerts.py,sha256=
|
63
|
+
judgeval/utils/alerts.py,sha256=7HO42fEskQpwocUU-lq6EX4LGPzpxbIhaiJ5pkH31-I,3278
|
64
64
|
judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
|
65
|
-
judgeval-0.0.
|
66
|
-
judgeval-0.0.
|
67
|
-
judgeval-0.0.
|
68
|
-
judgeval-0.0.
|
65
|
+
judgeval-0.0.43.dist-info/METADATA,sha256=nA8kVqJDfwTJTzV31R3dbgkt4VTbcWTc_WRxyuWZZtQ,55748
|
66
|
+
judgeval-0.0.43.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
67
|
+
judgeval-0.0.43.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
68
|
+
judgeval-0.0.43.dist-info/RECORD,,
|
File without changes
|
File without changes
|