judgeval 0.0.41__py3-none-any.whl → 0.0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -100,9 +100,9 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
100
100
  raise JudgmentAPIError(error_message)
101
101
  return response_data
102
102
 
103
- def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
103
+ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
104
104
  """
105
- Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
105
+ Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
106
106
  """
107
107
 
108
108
  try:
@@ -146,46 +146,47 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
146
146
  """
147
147
  # No merge required
148
148
  if not local_results and api_results:
149
- return api_results
149
+ return [result.model_copy() for result in api_results]
150
150
  if not api_results and local_results:
151
- return local_results
151
+ return [result.model_copy() for result in local_results]
152
152
 
153
153
  if len(api_results) != len(local_results):
154
154
  # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
155
155
  raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
156
156
 
157
+ # Create a copy of api_results to avoid modifying the input
158
+ merged_results = [result.model_copy() for result in api_results]
159
+
157
160
  # Each ScoringResult in api and local have all the same fields besides `scorers_data`
158
- for api_result, local_result in zip(api_results, local_results):
159
- if not (api_result.data_object and local_result.data_object):
161
+ for merged_result, local_result in zip(merged_results, local_results):
162
+ if not (merged_result.data_object and local_result.data_object):
160
163
  raise ValueError("Data object is None in one of the results.")
161
- if api_result.data_object.input != local_result.data_object.input:
164
+ if merged_result.data_object.input != local_result.data_object.input:
162
165
  raise ValueError("The API and local results are not aligned.")
163
- if api_result.data_object.actual_output != local_result.data_object.actual_output:
166
+ if merged_result.data_object.actual_output != local_result.data_object.actual_output:
164
167
  raise ValueError("The API and local results are not aligned.")
165
- if api_result.data_object.expected_output != local_result.data_object.expected_output:
168
+ if merged_result.data_object.expected_output != local_result.data_object.expected_output:
166
169
  raise ValueError("The API and local results are not aligned.")
167
- if api_result.data_object.context != local_result.data_object.context:
170
+ if merged_result.data_object.context != local_result.data_object.context:
168
171
  raise ValueError("The API and local results are not aligned.")
169
- if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
172
+ if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
170
173
  raise ValueError("The API and local results are not aligned.")
171
- if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
174
+ if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
172
175
  raise ValueError("The API and local results are not aligned.")
173
- if api_result.data_object.tools_called != local_result.data_object.tools_called:
176
+ if merged_result.data_object.tools_called != local_result.data_object.tools_called:
174
177
  raise ValueError("The API and local results are not aligned.")
175
- if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
178
+ if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
176
179
  raise ValueError("The API and local results are not aligned.")
177
180
 
178
-
179
181
  # Merge ScorerData from the API and local scorers together
180
- api_scorer_data = api_result.scorers_data
182
+ api_scorer_data = merged_result.scorers_data
181
183
  local_scorer_data = local_result.scorers_data
182
184
  if api_scorer_data is None and local_scorer_data is not None:
183
- api_result.scorers_data = local_scorer_data
184
-
185
- if api_scorer_data is not None and local_scorer_data is not None:
186
- api_result.scorers_data = api_scorer_data + local_scorer_data
185
+ merged_result.scorers_data = local_scorer_data
186
+ elif api_scorer_data is not None and local_scorer_data is not None:
187
+ merged_result.scorers_data = api_scorer_data + local_scorer_data
187
188
 
188
- return api_results
189
+ return merged_results
189
190
 
190
191
 
191
192
  def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
@@ -405,8 +406,15 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
405
406
  )
406
407
  if function and tracer:
407
408
  new_traces: List[Trace] = []
408
- tracer.offline_mode = True
409
- tracer.traces = []
409
+
410
+ # Handle case where tracer is actually a callback handler
411
+ actual_tracer = tracer
412
+ if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
413
+ # This is a callback handler, get the underlying tracer
414
+ actual_tracer = tracer.tracer
415
+
416
+ actual_tracer.offline_mode = True
417
+ actual_tracer.traces = []
410
418
  for example in examples:
411
419
  if example.input:
412
420
  if isinstance(example.input, str):
@@ -417,19 +425,21 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
417
425
  raise ValueError(f"Input must be string or dict, got {type(example.input)}")
418
426
  else:
419
427
  result = run_with_spinner("Running agent function: ", function)
420
- for i, trace in enumerate(tracer.traces):
428
+
429
+
430
+ for i, trace in enumerate(actual_tracer.traces):
421
431
  # We set the root-level trace span with the expected tools of the Trace
422
432
  trace = Trace(**trace)
423
433
  trace.trace_spans[0].expected_tools = examples[i].expected_tools
424
434
  new_traces.append(trace)
425
435
  trace_run.traces = new_traces
426
- tracer.traces = []
436
+ actual_tracer.traces = []
427
437
 
428
438
  # Execute evaluation using Judgment API
429
439
  info("Starting API evaluation")
430
440
  try: # execute an EvaluationRun with just JudgmentScorers
431
441
  debug("Sending request to Judgment API")
432
- response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
442
+ response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
433
443
  scoring_results = [ScoringResult(**result) for result in response_data["results"]]
434
444
  info(f"Received {len(scoring_results)} results from API")
435
445
  except JudgmentAPIError as e:
judgeval/utils/alerts.py CHANGED
@@ -20,12 +20,20 @@ class AlertResult(BaseModel):
20
20
  status: Status of the alert (triggered or not)
21
21
  conditions_result: List of condition evaluation results
22
22
  metadata: Dictionary containing example_id, timestamp, and other metadata
23
+ notification: Optional notification configuration for triggered alerts
24
+ combine_type: The combination type used ("all" or "any")
25
+ project_id: Optional project identifier
26
+ trace_span_id: Optional trace span identifier
23
27
  """
24
28
  rule_name: str
25
29
  rule_id: Optional[str] = None # The unique identifier of the rule
26
30
  status: AlertStatus
27
31
  conditions_result: List[Dict[str, Any]] = []
28
32
  metadata: Dict[str, Any] = {}
33
+ notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
34
+ combine_type: Optional[str] = None # "all" or "any"
35
+ project_id: Optional[str] = None # Project identifier
36
+ trace_span_id: Optional[str] = None # Trace span identifier
29
37
 
30
38
  @property
31
39
  def example_id(self) -> Optional[str]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.41
3
+ Version: 0.0.43
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
18
18
  Requires-Dist: langchain-huggingface
19
19
  Requires-Dist: langchain-openai
20
20
  Requires-Dist: litellm==1.61.15
21
+ Requires-Dist: matplotlib>=3.10.3
21
22
  Requires-Dist: nest-asyncio
22
23
  Requires-Dist: openai
23
24
  Requires-Dist: pandas
@@ -31,37 +32,37 @@ Description-Content-Type: text/markdown
31
32
  <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
32
33
  <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
33
34
 
34
- **Build monitoring & evaluation pipelines for complex agents**
35
+ <br>
36
+ <div style="font-size: 1.5em;">
37
+ Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
38
+ </div>
35
39
 
36
- <img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
40
+ ## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
37
41
 
38
- <br>
42
+ [Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
39
43
 
40
- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
44
+ We're hiring! Join us in our mission to unleash optimized agents.
41
45
 
42
46
  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
43
47
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
44
48
  [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)
45
49
 
46
- </div>
50
+ <img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
47
51
 
48
- ## Judgeval: open-source testing, monitoring, and optimization for AI agents
52
+ </div>
49
53
 
50
- Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
51
54
 
52
- Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
55
+ Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
53
56
 
54
- We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://docs.judgmentlabs.ai/getting-started) to get started.
57
+ Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
58
+ > 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
55
59
 
56
60
  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
57
61
 
58
62
  ## 📋 Table of Contents
59
- - [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
60
- - [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
61
- - [📋 Table of Contents](#-table-of-contents)
62
63
  - [✨ Features](#-features)
63
64
  - [🛠️ Installation](#️-installation)
64
- - [🏁 Get Started](#-get-started)
65
+ - [🏁 Quickstarts](#-quickstarts)
65
66
  - [🛰️ Tracing](#️-tracing)
66
67
  - [📝 Offline Evaluations](#-offline-evaluations)
67
68
  - [📡 Online Evaluations](#-online-evaluations)
@@ -69,12 +70,6 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
69
70
  - [Key Features](#key-features)
70
71
  - [Getting Started](#getting-started)
71
72
  - [📚 Cookbooks](#-cookbooks)
72
- - [Sample Agents](#sample-agents)
73
- - [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
74
- - [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
75
- - [Custom Evaluators](#custom-evaluators)
76
- - [🔍 PII Detection](#-pii-detection)
77
- - [📧 Cold Email Generation](#-cold-email-generation)
78
73
  - [💻 Development with Cursor](#-development-with-cursor)
79
74
  - [⭐ Star Us on GitHub](#-star-us-on-github)
80
75
  - [❤️ Contributors](#️-contributors)
@@ -86,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
86
81
 
87
82
  | | |
88
83
  |:---|:---:|
89
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
90
- | <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
91
- | <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
92
- | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
93
- | <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
84
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
85
+ | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
86
+ | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
87
+ | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
94
88
 
95
89
  ## 🛠️ Installation
96
90
 
@@ -100,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
100
94
  pip install judgeval
101
95
  ```
102
96
 
103
- Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
97
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
104
98
 
105
- **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
99
+ ```bash
100
+ export JUDGMENT_API_KEY=...
101
+ export JUDGMENT_ORG_ID=...
102
+ ```
106
103
 
107
- ## 🏁 Get Started
104
+ **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
108
105
 
109
- Here's how you can quickly start using Judgeval:
106
+ ## 🏁 Quickstarts
110
107
 
111
108
  ### 🛰️ Tracing
112
109
 
113
- Track your agent execution with full observability with just a few lines of code.
114
110
  Create a file named `traces.py` with the following code:
115
111
 
116
112
  ```python
@@ -135,12 +131,15 @@ def main():
135
131
 
136
132
  main()
137
133
  ```
134
+ You'll see your trace exported to the Judgment Platform:
135
+
136
+ <p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
137
+
138
138
 
139
139
  [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
140
140
 
141
141
  ### 📝 Offline Evaluations
142
142
 
143
- You can evaluate your agent's execution to measure quality metrics such as hallucination.
144
143
  Create a file named `evaluate.py` with the following code:
145
144
 
146
145
  ```python evaluate.py
@@ -156,7 +155,7 @@ example = Example(
156
155
  retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
157
156
  )
158
157
 
159
- scorer = FaithfulnessScorer(threshold=0.5)
158
+ scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
160
159
  results = client.run_evaluation(
161
160
  examples=[example],
162
161
  scorers=[scorer],
@@ -205,6 +204,8 @@ def main():
205
204
  main()
206
205
  ```
207
206
 
207
+ You should see an evaluation attached to your trace on the Judgment Platform.
208
+
208
209
  [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
209
210
 
210
211
  ## 🏢 Self-Hosting
@@ -229,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
229
230
 
230
231
  ### Sample Agents
231
232
 
232
- #### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
233
- A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
234
-
235
- #### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
236
- A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
237
-
238
- ### Custom Evaluators
239
-
240
- #### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
241
- Detecting and evaluating Personal Identifiable Information (PII) leakage.
242
-
243
- #### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
244
-
245
- Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
233
+ #### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
234
+ A multi-agent system, augmented with tool calls, designed for general-purpose tasks such as financial research and math. It is traced and evaluated on Faithfulness (factual adherence to the retrieval context).
246
235
 
247
236
  ## 💻 Development with Cursor
248
237
  When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
@@ -1,29 +1,29 @@
1
1
  judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
2
2
  judgeval/clients.py,sha256=EiTmvvWksTPyWIuMC9jz06SPY2vFzokIJUIGoScpisA,989
3
- judgeval/constants.py,sha256=xuO-Und5c0-K3yTRn2fAkwyY2uTf8b7dGd39CPVqkSQ,5661
3
+ judgeval/constants.py,sha256=MmkgNXdwQOyYSVJc_I8EjX12OWZdFEzjaqXduRowuU4,6033
4
4
  judgeval/evaluation_run.py,sha256=KNGtaGAwD18pDNOKF7PCMlLnQe9SpRLTs0XWFMrCiLc,6684
5
5
  judgeval/judgment_client.py,sha256=JO3AkU-disPHQVK5g1SM-bs_EUSy8QZ3AaAj_Q2ag6s,24968
6
- judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
7
- judgeval/run_evaluation.py,sha256=MshtOGvWm_eGj2JamEtiMWvPjdCwrKTp9WcAUrBm2Fs,49673
6
+ judgeval/rules.py,sha256=LLojqmiKzQ90jAczccfaOoc3b9LBJCWX0hZ7p439no8,21110
7
+ judgeval/run_evaluation.py,sha256=JI-BCyEVKW61JJ4qxFMk1ww4tams-1g_0aaCE4cHrU8,50252
8
8
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
9
9
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
10
10
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
11
11
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
12
- judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
13
- judgeval/common/tracer.py,sha256=rYNmyB3Z955xfnKmlase6gub8Xf5xz6nQefONs_Td5U,90870
14
- judgeval/common/utils.py,sha256=sWdHfqgiF6AnKTQNmeUBfoEsddXgInI5M24t2-QYexk,34271
12
+ judgeval/common/s3_storage.py,sha256=UZZzQ8CP9_42SKDoKpPncJx8CL5Dchh4jFlKxDKi-cs,3938
13
+ judgeval/common/tracer.py,sha256=jbXtgBgrfGH-zxW6Kf4VDpq8ot-yb0ggaC5isCQFGvw,128882
14
+ judgeval/common/utils.py,sha256=l2nvm3-LeeScZ02H9TB2AcJh1gJSK1lNdi1Tu0p_fNQ,34276
15
15
  judgeval/data/__init__.py,sha256=GX_GloDtBB35mv3INWbSTP2r9cwCU2IeIYjzRT0SAd8,530
16
16
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
17
17
  judgeval/data/example.py,sha256=jcK78ff-TKNl9Qtxvbd1g61crpo-s4fWHaqyMIbQNq0,6877
18
18
  judgeval/data/result.py,sha256=KfU9lhAKG_Xo2eGDm2uKVVRZpf177IDASg1cIwedJwE,3184
19
19
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
20
20
  judgeval/data/tool.py,sha256=eEEvGDNNYWhcQiI6cjDv3rO1VoOJJS5LWGS76Gb_gtY,1813
21
- judgeval/data/trace.py,sha256=S9IQunatke-Kcxi2-qXg3CtbmxBk8VGBDJzWshx7zJg,4798
21
+ judgeval/data/trace.py,sha256=5HSJbCMvNTF4O8D_364dGv2cs-0oa4rCQcYR_hS5FG4,4881
22
22
  judgeval/data/trace_run.py,sha256=fiB5Z5il9U9XqvksdA2DbLNd96U_Wrz8K00RuFJBy38,2324
23
23
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
24
24
  judgeval/data/datasets/dataset.py,sha256=pq9-A1mg2Brpjg1TufDU_eLo9sQhX0nw-UTGaf3jCXA,12952
25
25
  judgeval/data/datasets/eval_dataset_client.py,sha256=LJ1bf1sZAC4ZBCRTQ1Y4VrJuNSslYBQ1y9YKuhYxwqY,15176
26
- judgeval/integrations/langgraph.py,sha256=L9zPPWVLGL2HWuwHPqM5Kic4S7EfQ_Y1Y3YKBJNfGCA,23004
26
+ judgeval/integrations/langgraph.py,sha256=Ogk3MFE116WfRV4w_2c6mp3d27Uea7vmLstltML8VBM,31963
27
27
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
28
28
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
29
29
  judgeval/judges/litellm_judge.py,sha256=DhB6px9ELZL3gbMb2w4FkBliuTlaCVIcjE8v149G6NM,2425
@@ -60,9 +60,9 @@ judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne
60
60
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
61
61
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=O9xq2Cxcg16pFNZwHTb_MDJ5ehFab6oDiiNtC47AnY4,2584
62
62
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
63
- judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
63
+ judgeval/utils/alerts.py,sha256=7HO42fEskQpwocUU-lq6EX4LGPzpxbIhaiJ5pkH31-I,3278
64
64
  judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
65
- judgeval-0.0.41.dist-info/METADATA,sha256=-sO68MUEmN3s4ji7Vf1gTuPv60R7Ny6bMcuuKlFSSI8,57358
66
- judgeval-0.0.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
67
- judgeval-0.0.41.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
68
- judgeval-0.0.41.dist-info/RECORD,,
65
+ judgeval-0.0.43.dist-info/METADATA,sha256=nA8kVqJDfwTJTzV31R3dbgkt4VTbcWTc_WRxyuWZZtQ,55748
66
+ judgeval-0.0.43.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
67
+ judgeval-0.0.43.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
68
+ judgeval-0.0.43.dist-info/RECORD,,