judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval-0.22.2.dist-info/METADATA ADDED
@@ -0,0 +1,265 @@
+ Metadata-Version: 2.4
+ Name: judgeval
+ Version: 0.22.2
+ Summary: Judgeval Package
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+ License-Expression: Apache-2.0
+ License-File: LICENSE.md
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.10
+ Requires-Dist: boto3>=1.40.11
+ Requires-Dist: click<8.2.0
+ Requires-Dist: dotenv
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: litellm>=1.75.0
+ Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+ Requires-Dist: opentelemetry-sdk>=1.36.0
+ Requires-Dist: orjson>=3.9.0
+ Requires-Dist: typer>=0.9.0
+ Provides-Extra: s3
+ Requires-Dist: boto3>=1.40.11; extra == 's3'
+ Provides-Extra: trainer
+ Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+
+ <a href="https://judgmentlabs.ai/">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
+ <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
+ </picture>
+ </a>
+
+ <br>
+
+ ## Agent Behavior Monitoring (ABM)
+
+ Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
+
+ [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
+ [![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
+ [![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+
+ [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
+ [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
+
+ </div>
+
+
+ ## [NEW] 🎆 Agent Reinforcement Learning
+
+ Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
+
+ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
+
+ ```python
+ await trainer.train(
+     agent_function=your_agent_function,  # entry point to your agent
+     scorers=[RewardScorer()],  # Custom scorer you define based on task criteria, acts as reward
+     prompts=training_prompts  # Tasks
+ )
+ ```
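Here `RewardScorer`, `your_agent_function`, and `training_prompts` are user-defined. As a non-authoritative sketch, a reward scorer built on the `ExampleScorer` interface shown in the quickstart further down this README might look like the following (the `TaskOutcome` example class is hypothetical):

```python
# Hypothetical reward scorer for the snippet above; a sketch only, assuming the
# ExampleScorer interface shown later in this README.
from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer

class TaskOutcome(Example):  # hypothetical schema for whatever your agent returns
    prompt: str
    answer: str

class RewardScorer(ExampleScorer):
    name: str = "Reward Scorer"

    async def a_score_example(self, example: TaskOutcome):
        # Replace with your task criteria; the returned float acts as the reward.
        self.reason = "Non-empty answer" if example.answer else "Empty answer"
        return 1.0 if example.answer else 0.0
```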
+
+ **That's it!** Judgeval automatically manages trajectory collection and reward tagging, so your agent can learn from production data with minimal code changes.
+
+ 👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
+
+
+ You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
+
+
+ ## Judgeval Overview
+
+ Judgeval is an open-source framework for agent behavior monitoring. It offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production and test environments into improved agents. To get started, try running one of the notebooks below or dive deeper into our [docs](https://docs.judgmentlabs.ai/documentation).
+
+ Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
+
+ ## 📚 Cookbooks
+
+ | Try Out | Notebook | Description |
+ |:---------|:-----|:------------|
+ | RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
+ | Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
+ | Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
+ | Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
+
+ You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
+
+ You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
+
+ ## Why Judgeval?
+
+ 🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
+
+ ⚙️ **Custom Evaluators**: You aren't restricted to prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate with our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
+
+ 🚨 **Production Monitoring**: Run any custom scorer in a hosted, secure virtualized container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
+
+ 📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
+ <!-- Add link to Bucketing docs once we have it -->
+ <!--
+ TODO: Once we have trainer code docs, plug in here
+ -->
+
+ 🧪 **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
+
+ <!--
+ Use this once we have AI PM features:
+
+ **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
+
+ -->
+
+ ## 🛠️ Quickstart
+
+ Get started with Judgeval by installing our SDK using pip:
+
+ ```bash
+ pip install judgeval
+ ```
+
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
+
+ ```bash
+ export JUDGMENT_API_KEY=...
+ export JUDGMENT_ORG_ID=...
+ ```
+
+ **If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
+
+ ### Start monitoring with Judgeval
+
+ ```python
+ from judgeval.tracer import Tracer, wrap
+ from judgeval.data import Example
+ from judgeval.scorers import AnswerRelevancyScorer
+ from openai import OpenAI
+
+
+ judgment = Tracer(project_name="default_project")
+ client = wrap(OpenAI())  # tracks all LLM calls
+
+ @judgment.observe(span_type="tool")
+ def format_question(question: str) -> str:
+     # dummy tool
+     return f"Question: {question}"
+
+ @judgment.observe(span_type="function")
+ def run_agent(prompt: str) -> str:
+     task = format_question(prompt)
+     response = client.chat.completions.create(
+         model="gpt-5-mini",
+         messages=[{"role": "user", "content": task}]
+     )
+
+     judgment.async_evaluate(  # trigger online monitoring
+         scorer=AnswerRelevancyScorer(threshold=0.5),  # swap with any scorer
+         example=Example(input=task, actual_output=response),  # customize to your data
+         model="gpt-5",
+     )
+     return response.choices[0].message.content
+
+ run_agent("What is the capital of the United States?")
+ ```
+
+ Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
+
+ ![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
+
+
+ ### Customizable Scorers Over Agent Behavior
+
+ Judgeval's strongest suit is full customization of the scorers you run online monitoring with. You aren't limited to single-prompt LLM judges or prefab scorers: if you can express your scorer
+ in Python code, Judgeval can monitor it! Under the hood, Judgeval hosts your scorer in a secure virtualized container, enabling online monitoring for any scorer.
+
+
+ First, create a behavior scorer in a file called `helpfulness_scorer.py`:
+
+ ```python
+ from judgeval.data import Example
+ from judgeval.scorers.example_scorer import ExampleScorer
+
+ # Define custom example class
+ class QuestionAnswer(Example):
+     question: str
+     answer: str
+
+ # Define a server-hosted custom scorer
+ class HelpfulnessScorer(ExampleScorer):
+     name: str = "Helpfulness Scorer"
+     server_hosted: bool = True  # Enable server hosting
+     async def a_score_example(self, example: QuestionAnswer):
+         # Custom scoring logic for agent behavior
+         # Can be an arbitrary combination of code and LLM calls
+         if len(example.answer) > 10 and "?" not in example.answer:
+             self.reason = "Answer is detailed and provides helpful information"
+             return 1.0
+         else:
+             self.reason = "Answer is too brief or unclear"
+             return 0.0
+ ```
+
+ Then deploy your scorer to Judgment's infrastructure:
+
+ ```bash
+ echo "pydantic" > requirements.txt
+ uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
+ ```
+
+ Now you can instrument your agent with monitoring and online evaluation:
+
+ ```python
+ from judgeval.tracer import Tracer, wrap
+ from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
+ from openai import OpenAI
+
+ judgment = Tracer(project_name="default_project")
+ client = wrap(OpenAI())  # tracks all LLM calls
+
+ @judgment.observe(span_type="tool")
+ def format_task(question: str) -> str:  # replace with your prompt engineering
+     return f"Please answer the following question: {question}"
+
+ @judgment.observe(span_type="tool")
+ def answer_question(prompt: str) -> str:  # replace with your LLM system calls
+     response = client.chat.completions.create(
+         model="gpt-5-mini",
+         messages=[{"role": "user", "content": prompt}]
+     )
+     return response.choices[0].message.content
+
+ @judgment.observe(span_type="function")
+ def run_agent(question: str) -> str:
+     task = format_task(question)
+     answer = answer_question(task)
+
+     # Add online evaluation with server-hosted scorer
+     judgment.async_evaluate(
+         scorer=HelpfulnessScorer(),
+         example=QuestionAnswer(question=question, answer=answer),
+         sampling_rate=0.9  # Evaluate 90% of agent runs
+     )
+
+     return answer
+
+ if __name__ == "__main__":
+     result = run_agent("What is the capital of the United States?")
+     print(result)
+ ```
+
+ Congratulations! Your online eval result should look like this:
+
+ ![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
+
+ You can now run any online scorer in secure Firecracker microVMs with no latency impact on your applications.
+
+ ---
+
+ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
judgeval-0.22.2.dist-info/RECORD ADDED
@@ -0,0 +1,112 @@
+ judgeval/__init__.py,sha256=RRiBbXUj7M1VW3NqFvMZlXyI72duh3VA5bfIWqPmKNw,6670
+ judgeval/cli.py,sha256=T9nKO9eHMOiLCgxaxuihqtRHsG_dMT06sW6X873MmnI,2209
+ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
+ judgeval/env.py,sha256=uFggNNKmfDaa5dmZMwwXVIDdHAHe524jDWUpByV4hm4,1879
+ judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
+ judgeval/logger.py,sha256=VP5blbsJ53mvJbNHfBf5p2KrARUrkrErpPkB-__Hh3U,1562
+ judgeval/version.py,sha256=j1d7CQ2JT0bsK7bGd5vCKR0rT4ebA9YYUF2-5heFZd8,74
+ judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
+ judgeval/api/__init__.py,sha256=dGZm9KtgLMnmbiyDEJ_D7suuVqmsibR_Cd0YZRJ7qHI,15210
+ judgeval/api/api_types.py,sha256=PJ5ZQWuvCl5GXFzhcpOw6Iuktr50lo5BaILmZcAKWfc,10085
+ judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
+ judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
+ judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
+ judgeval/data/judgment_types.py,sha256=7RsrB2FvnsRRtnqMMfQzAMMn9oNvA076hbE2tmzKNXc,18874
+ judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
+ judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
+ judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
+ judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
+ judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
+ judgeval/dataset/__init__.py,sha256=s7HuBH_TQOLZ1arqaY2QRiSp-4mI_fF_9OykK_1QbsI,8858
+ judgeval/evaluation/__init__.py,sha256=e9H4h73MINpcBlBYpkXiUaoCdWxnzvaYK0Ob0awY-kM,13064
+ judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+ judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
+ judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
+ judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
+ judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
+ judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
+ judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
+ judgeval/prompt/__init__.py,sha256=Qgrd8u4WniaOjbRAoEFEeMnTmaqIGx5ZGX_U85iqhs0,11010
+ judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
+ judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
+ judgeval/scorers/api_scorer.py,sha256=jPBQUBs_T3Xq33QoIbIXDzUaXinz56qeDfo96dfdX0g,2036
+ judgeval/scorers/base_scorer.py,sha256=hsMuqdW8QtW5n9JzruXyaZC7im2K2sSmz1RDkbMisJ4,2702
+ judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
+ judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
+ judgeval/scorers/score.py,sha256=xquM59SCtNeuAsrBsHFgBQk3CHp4-bms4oFs24xfcU0,7176
+ judgeval/scorers/utils.py,sha256=dDxPKVjKa1lsMXNhZ8-aJFG3qk1usAH1JnKeC3vBQbU,304
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFyWdr1Wc8dh-aQ1nrK-mbd9W0MT4VyzLT5CbJ2-Q,450
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
+ judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
+ judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
+ judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=WhSkPs8tWyQ_cS-y-VTzrKAPlizKp-6zi_DmfgW4AgM,10773
+ judgeval/tracer/__init__.py,sha256=5OR0mxzrsWkh-tkT53WzrwtZ1EBIidx-rYXeO5nuWLc,39621
+ judgeval/tracer/constants.py,sha256=tLR5ClDaNlNg_MAv2XRdk62uQW4KyBnWaNbG_YYblTc,55
+ judgeval/tracer/keys.py,sha256=mYBo_X6-rC9xfiI-WpjHlO7rUtcMORtQXCQyO1F3Ycc,2387
+ judgeval/tracer/managers.py,sha256=JiUjX_evToxcuogKVcE6qpJkSvYOxAXCU4_z_hWXJOw,5199
+ judgeval/tracer/utils.py,sha256=xWha5iwC733wCf2HKbNqzxOPS1ovO1OymWIUFLz-UpQ,537
+ judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwovZxUKJXg,1295
+ judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
+ judgeval/tracer/exporters/store.py,sha256=pA_KINcm0amO0WEDYmMFU05SSsMOgJ5ogIRaevSX1sk,1885
+ judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
+ judgeval/tracer/llm/__init__.py,sha256=ENxApieKSktYrIviofXWP9GU0WnhBm0Q9mlGe_m_gMY,139
+ judgeval/tracer/llm/config.py,sha256=J8-bTL82bgDqdTJPN-Px3Epvoa9FG7L-X329kitwBTc,2525
+ judgeval/tracer/llm/constants.py,sha256=IWa3CMes8wIt_UG7jrGEOztg2sHz54fdOMWIOOr-dz8,172
+ judgeval/tracer/llm/providers.py,sha256=VAimkmChOOjhC1cUv-0iG8pa5PhOw1HIOyt3zrIrbcM,628
+ judgeval/tracer/llm/llm_anthropic/__init__.py,sha256=HG0gIlTgaRt-Y0u1ERPQ19pUgb4YHkTh7tZQPeyR4oM,80
+ judgeval/tracer/llm/llm_anthropic/config.py,sha256=ICfKODPQvZsRxpK4xWQ-YE79pmWJTmY2wryddxpNdpM,153
+ judgeval/tracer/llm/llm_anthropic/messages.py,sha256=T7dApxJCsOWEpquYSZICACwTioZG3ZcxHdJjvF04T2E,15474
+ judgeval/tracer/llm/llm_anthropic/messages_stream.py,sha256=DKlZZnfK_yv_tEMwF2XxvsjgUjOFI3c5JUMQwERNV7k,12188
+ judgeval/tracer/llm/llm_anthropic/wrapper.py,sha256=JILcyC4NvjXZSqlFoZp-VB-JsCYZkQPMFEYaB4AysrA,1849
+ judgeval/tracer/llm/llm_google/__init__.py,sha256=otBZETsAfVZjtZaN5N36Ln0kw-I9jVB4tFGrV6novHo,74
+ judgeval/tracer/llm/llm_google/config.py,sha256=S3yCAE9oHbXjLVYiz5mGD16yIgXMBBUu5UN4lBjoCNQ,162
+ judgeval/tracer/llm/llm_google/generate_content.py,sha256=w1rIh1cTBYnkfBQTL4qHntwsKfBcSrf2VSS2y-BOMRU,4030
+ judgeval/tracer/llm/llm_google/wrapper.py,sha256=jqaMXGoM9dlPBbCFadMI5EqFrNHzBt0h9VkNn7KPVLk,901
+ judgeval/tracer/llm/llm_openai/__init__.py,sha256=CyzwhY0-zmqWKlEno7JPBcvO7G_hI8dp6-_5_KEzFqg,74
+ judgeval/tracer/llm/llm_openai/beta_chat_completions.py,sha256=IXw-Gu-WUxQ-gaBUIe-aAKOn1Pakn_RFl0b1C_1toP8,7326
+ judgeval/tracer/llm/llm_openai/chat_completions.py,sha256=U086NgaaLFiyvAYrgJncC-obaaSbG2r_3ehquNlVTDQ,17637
+ judgeval/tracer/llm/llm_openai/config.py,sha256=NE0ixKhd4WVeAVjY8jNTncuKYH6R4MQDLPmcCsd3zWY,144
+ judgeval/tracer/llm/llm_openai/responses.py,sha256=CCGYz35gn3jJOYE2anyR49OR2XhSDwy3dEsISbzMO8Q,18137
+ judgeval/tracer/llm/llm_openai/utils.py,sha256=fpy9war8dyke25qHxGW2Yo028RA4Siq0RBLA4G63yUw,1480
+ judgeval/tracer/llm/llm_openai/wrapper.py,sha256=Z5Ndib228yd1pXEQ4xIu7_CJHxpW_t0ofZAC6FLc5eU,2055
+ judgeval/tracer/llm/llm_together/__init__.py,sha256=MEnsF77IgFD4h73hNCMpo-9a1PHHdm-OxPlOalXOMac,78
+ judgeval/tracer/llm/llm_together/chat_completions.py,sha256=RySsK3tqG0NpJHPlVQ705bXxIfseSQUhvIoS-sz4rOg,14380
+ judgeval/tracer/llm/llm_together/config.py,sha256=jCJY0KQcHJZZJk2vq038GKIDUMusqgvRjQ0B6OV5uEc,150
+ judgeval/tracer/llm/llm_together/wrapper.py,sha256=HFqy_MabQeSq8oj2diZhEuk1SDt_hDfk5MFdPn9MFhg,1733
+ judgeval/tracer/processors/__init__.py,sha256=BdOOPOD1RfMI5YHW76DNPKR07EAev-JxoolZ3KaXNNU,7100
+ judgeval/trainer/__init__.py,sha256=nJo913vFdss3E_PR-M1OUjznS0SYgNZ-MP-Y_6Mj5PA,437
+ judgeval/trainer/base_trainer.py,sha256=Lxm6OxJpifonLKofNIRG3TU7n_jZWQZ0I_f_jwtb_WU,4018
+ judgeval/trainer/config.py,sha256=7ZSwr6p7vq0MRadh9axm6XB-RAotdWqULZ5yDl0xGbQ,4340
+ judgeval/trainer/console.py,sha256=SvokkFEU-K1vLV4Rd1m6YJJ7HyYwTr4Azdzwx_JPZUY,4351
+ judgeval/trainer/fireworks_trainer.py,sha256=_B-fWovdhIpxh1RbXU0W5BlFGc9ZzuYtFw7CBtKTRO8,16074
+ judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
+ judgeval/trainer/trainer.py,sha256=twLEHNaomelTg6ZYG6veI9OpB3wzhPCtPVQMTnDZWx4,2626
+ judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
+ judgeval/utils/file_utils.py,sha256=vq-n5WZEZjVbZ5S9QTkW8nSH6Pvw-Jx0ttsQ1t0wnPQ,3140
+ judgeval/utils/guards.py,sha256=_DaKZxvjD10J97Ze2paHhbCiV2MpDz3FZQmNwaL5k0w,945
+ judgeval/utils/meta.py,sha256=RAqZuvOlymqMwFoS0joBW_r65lcN9bY8BpNYHoytKps,773
+ judgeval/utils/project.py,sha256=kGpYmp6QGTD6h-GjQ-ovT7kBmGnyb99MWDJmRGFQHOg,527
+ judgeval/utils/serialize.py,sha256=WbforbVFGINuk68T2YtWhj-ECMC6rWol3g5dxz9nsm8,6265
+ judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
+ judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
+ judgeval/utils/version_check.py,sha256=se4Ft8rjcl5u7fHMxSGQpka844V2AcZpOYl6StLWTio,1081
+ judgeval/utils/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/utils/decorators/dont_throw.py,sha256=Q4xlx6RUnQdNjKM0A_X0FEeLBF_71rMKnKdRbVnX82o,989
+ judgeval/utils/decorators/use_once.py,sha256=8mgj5VK9v08VOOWX2Bstc0CezNsOVUKMIv7N2R83E8s,288
+ judgeval/utils/wrappers/README.md,sha256=-Jyagu6NPH92ty8pTMbzRLVJZzufULrjxcyohXgsGMc,76
+ judgeval/utils/wrappers/__init__.py,sha256=iAcpjCOkYqoe6z2utrS_3yZLmdQPD1Y64MMefai8h0Y,546
+ judgeval/utils/wrappers/immutable_wrap_async.py,sha256=a0LWyEa235tPpfjN3W0A516_GWDL13uhumMVzsMpgW4,1909
+ judgeval/utils/wrappers/immutable_wrap_async_iterator.py,sha256=hfVxBoFE6m6I0g0KcSLJXyfVv6pfZuoJuHyLJHtLjjg,2268
+ judgeval/utils/wrappers/immutable_wrap_sync.py,sha256=_gOUaPK4Le-pifWCZOH4lDvY-cLfYoC0fy7DTNIG-0A,1823
+ judgeval/utils/wrappers/immutable_wrap_sync_iterator.py,sha256=aDC4HpLp4l9A3aFLS0cTCkien-xGgQRU04F7P1pJ6w8,2229
+ judgeval/utils/wrappers/mutable_wrap_async.py,sha256=stHISOUCGFUJXY8seXmxUo4ZpMF4LErSBIz0HlWR7Bo,2941
+ judgeval/utils/wrappers/mutable_wrap_sync.py,sha256=t5jygAQ1vqhy8s1GfiLeYygYgaLTgfoYASN47U5JiPs,2888
+ judgeval/utils/wrappers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/utils/wrappers/utils.py,sha256=j18vaa6JWDw2s3nQy1z5PfV_9Xxio-bVARaHG_0XyL0,1228
+ judgeval-0.22.2.dist-info/METADATA,sha256=9F5AvYGpPCC9BQQYj3-4UQ1jVR1mc06L3nDMfYaH_Uw,11483
+ judgeval-0.22.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.22.2.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+ judgeval-0.22.2.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.22.2.dist-info/RECORD,,
judgeval-0.22.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ judgeval = judgeval.cli:app
judgeval/clients.py DELETED
@@ -1,39 +0,0 @@
- import os
- from dotenv import load_dotenv
- from openai import OpenAI
- from langfuse import Langfuse
- from typing import Optional
- from together import Together, AsyncTogether
-
- PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
- load_dotenv(dotenv_path=PATH_TO_DOTENV)
-
- # Initialize required clients
- langfuse = Langfuse(
-     secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-     public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-     host=os.getenv("LANGFUSE_HOST"),
- )
-
- # Initialize optional OpenAI client
- client: Optional['OpenAI'] = None
- if os.getenv("OPENAI_API_KEY"):
-     try:
-         from openai import OpenAI
-         client = OpenAI()
-     except ImportError:
-         # openai package not installed
-         pass
-
- # Initialize optional Together clients
- together_client: Optional['Together'] = None
- async_together_client: Optional['AsyncTogether'] = None
-
- # Only initialize Together clients if API key is available
- if os.getenv("TOGETHERAI_API_KEY"):
-     try:
-         together_client = Together(api_key=os.getenv("TOGETHERAI_API_KEY"))
-         async_together_client = AsyncTogether(api_key=os.getenv("TOGETHERAI_API_KEY"))
-     except Exception:
-         pass
-
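These eagerly initialized module-level clients have no direct counterpart in 0.22.2. Judging by the quickstart earlier in this diff, clients are now constructed by the caller and wrapped for tracing; a minimal sketch of that pattern, assuming only the `Tracer`/`wrap` API shown above:

```python
# Sketch of the 0.22.2 pattern from the README above: wrap the client you
# construct yourself instead of relying on eagerly initialized module globals.
from openai import OpenAI
from judgeval.tracer import Tracer, wrap

judgment = Tracer(project_name="default_project")
client = wrap(OpenAI())  # chat.completions calls made through `client` are traced
```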
judgeval/common/__init__.py DELETED
@@ -1,8 +0,0 @@
- from judgeval.common.utils import (
-     get_chat_completion,
-     aget_chat_completion,
-     get_completion_multiple_models,
-     aget_completion_multiple_models
- )
-
- __all__ = ["get_chat_completion", "aget_chat_completion", "get_completion_multiple_models", "aget_completion_multiple_models"]
judgeval/common/exceptions.py DELETED
@@ -1,28 +0,0 @@
- """
- Common Exceptions in Judgeval
- """
-
-
- class MissingTestCaseParamsError(Exception):
-     pass
-
-
- class JudgmentAPIError(Exception):
-     """
-     Exception raised when an error occurs while executing a Judgment API request
-     """
-
-     def __init__(self, message: str):
-         super().__init__(message)
-         self.message = message
-
-
- class InvalidJudgeModelError(Exception):
-     """
-     Exception raised when an invalid judge model is provided
-     """
-
-     def __init__(self, message: str):
-         super().__init__(message)
-         self.message = message
-
judgeval/common/logger.py DELETED
@@ -1,189 +0,0 @@
- import logging
- from logging.handlers import RotatingFileHandler
- import sys
- from pathlib import Path
- from datetime import datetime
- from contextlib import contextmanager
-
- # Global variables
- logger = None
- class LoggingState:
-     enabled = False
-     path = None
-
- LOGGING_STATE = LoggingState()
-
- # Add these as module-level variables
- current_example_id = None
- current_timestamp = None
-
-
- @contextmanager
- def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int = 1024 * 1024, backup_count: int = 5):
-     """
-     Context manager to temporarily enable logging for a specific block of code.
-     """
-     global logger
-     LOGGING_STATE.enabled = True
-     LOGGING_STATE.path = path
-     # Initialize logger if not already initialized
-     if logger is None:
-         logger = _initialize_logger(name=name, path=path, max_bytes=max_bytes, backup_count=backup_count)
-     try:
-         logger.info("Logging enabled")
-         yield
-     finally:
-         logger.info("Logging disabled")
-         LOGGING_STATE.enabled = False
-         LOGGING_STATE.path = None
-
- def _initialize_logger(
-     name: str = "judgeval",
-     max_bytes: int = 1024 * 1024,  # 1MB
-     backup_count: int = 5,
-     path: str = "./logs"  # Added path parameter with default
- ) -> logging.Logger:
-     """
-     Initialize the global logger instance if it doesn't exist.
-     Returns the global logger instance.
-     """
-     global logger
-
-     log_dir = Path(path)
-     log_dir.mkdir(exist_ok=True, parents=True)
-     log_file = log_dir / f"{name}.log"
-     if log_file.exists():
-         log_file.unlink()  # Delete existing log file
-
-     if logger is not None:
-         return logger
-
-     # Create logs directory if it doesn't exist
-     log_dir = Path(path)
-     log_dir.mkdir(exist_ok=True)
-
-     # Create formatter
-     formatter = logging.Formatter(
-         fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-         datefmt='%Y-%m-%d %H:%M:%S'
-     )
-
-     # Create a custom formatter that includes example info when available
-     class ExampleFormatter(logging.Formatter):
-         def format(self, record):
-             if current_example_id is not None and current_timestamp is not None:
-                 record.example_id = current_example_id
-                 record.timestamp = current_timestamp
-                 return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
-                                          datefmt='%Y-%m-%d %H:%M:%S').format(record)
-             return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-                                      datefmt='%Y-%m-%d %H:%M:%S').format(record)
-
-     # Use the custom formatter
-     console_handler = logging.StreamHandler(sys.stdout)
-     console_handler.setFormatter(ExampleFormatter())
-     console_handler.setLevel(logging.DEBUG)
-
-     log_filename = f"{name}.log"
-     file_handler = RotatingFileHandler(
-         log_dir / log_filename,
-         maxBytes=max_bytes,
-         backupCount=backup_count,
-         mode='a'
-     )
-     file_handler.setFormatter(ExampleFormatter())
-     file_handler.setLevel(logging.DEBUG)
-
-     # Get logger
-     logger = logging.getLogger(name)
-     logger.setLevel(logging.DEBUG)
-
-     # Prevent adding handlers multiple times
-     if not logger.handlers:
-         logger.addHandler(console_handler)
-         logger.addHandler(file_handler)
-
-     return logger
-
- # Initialize the global logger when module is imported
- # logger = _initialize_logger()
-
- def log_if_enabled(func):
-     """Decorator to check if logging is enabled before executing logging statements"""
-     def wrapper(*args, **kwargs):
-         if LOGGING_STATE.enabled:
-             return func(*args, **kwargs)
-     return wrapper
-
- @log_if_enabled
- def debug(msg: str, example_idx: int = None):
-     """Log debug message if logging is enabled"""
-     logger.debug(msg)
-
- @log_if_enabled
- def info(msg: str, example_idx: int = None):
-     """Log info message if logging is enabled"""
-     logger.info(msg)
-
- @log_if_enabled
- def warning(msg: str, example_idx: int = None):
-     """Log warning message if logging is enabled"""
-     logger.warning(msg)
-
- @log_if_enabled
- def error(msg: str, example_idx: int = None):
-     """Log error message if logging is enabled"""
-     logger.error(msg)
-
- def create_example_handler(
-     timestamp: str,
-     example_idx: int,
-     path: str = "./logs"  # Added path parameter with default
- ) -> RotatingFileHandler:
-     """Creates a file handler for a specific example"""
-     debug(f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}")
-     log_dir = Path(path) / "examples"
-     log_dir.mkdir(exist_ok=True, parents=True)
-
-     formatter = logging.Formatter(
-         fmt='%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
-         datefmt='%Y-%m-%d %H:%M:%S'
-     )
-
-     # Create a unique file for each example
-     file_handler = RotatingFileHandler(
-         log_dir / f"{timestamp}_example_{example_idx}.log",
-         maxBytes=1024 * 1024,  # 1MB
-         backupCount=5,
-         mode='a'
-     )
-     file_handler.setFormatter(formatter)
-     file_handler.setLevel(logging.DEBUG)
-     info(f"Created example handler for example {example_idx}")
-     return file_handler
-
- @contextmanager
- def example_logging_context(timestamp: str, example_idx: int):
-     """Context manager for example-specific logging"""
-     if not LOGGING_STATE.enabled:
-         yield
-         return
-
-     global current_example_id, current_timestamp
-
-     debug(f"Entering example logging context for example {example_idx}")
-     current_example_id = example_idx
-     current_timestamp = timestamp
-
-     handler = create_example_handler(timestamp, example_idx, path=LOGGING_STATE.path)
-     if handler:
-         logger.addHandler(handler)
-     try:
-         yield
-     finally:
-         current_example_id = None
-         current_timestamp = None
-         if handler:
-             logger.removeHandler(handler)
-             handler.close()
-         debug(f"Closed example handler for example {example_idx}")