judgeval 0.0.26__tar.gz → 0.0.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.26 → judgeval-0.0.27}/PKG-INFO +1 -1
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/custom_scorers.mdx +29 -3
- {judgeval-0.0.26 → judgeval-0.0.27}/pyproject.toml +1 -1
- judgeval-0.0.27/src/demo/new_trace/example_complex_async.py +232 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/demo/travel_agent.py +1 -1
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/tracer.py +476 -161
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/constants.py +4 -2
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/__init__.py +0 -3
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval-0.0.27/src/judgeval/data/result.py +76 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judgment_client.py +47 -15
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/run_evaluation.py +20 -36
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/score.py +9 -11
- judgeval-0.0.26/src/judgeval/data/api_example.py +0 -98
- judgeval-0.0.26/src/judgeval/data/result.py +0 -98
- {judgeval-0.0.26 → judgeval-0.0.27}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/.gitignore +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/LICENSE.md +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/Pipfile +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/Pipfile.lock +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/README.md +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/README.md +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/development.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/comparison.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/execution_order.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/groundedness.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/favicon.svg +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/integration/langgraph.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/introduction.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/logo/light.svg +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/mint.json +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/pytest.ini +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/demo/demo.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/custom_scorers.mdx

@@ -4,6 +4,7 @@ description: ""
 ---
 
 If none of `judgeval`'s built-in scorers fit your evaluation criteria, you can easily build your own custom metric to be run through a `JudgevalScorer`.
+
 `JudgevalScorer`s are **automatically integrated** within `judgeval`'s infrastructure, so you can:
 - Run your own scorer with the same syntax as any other `judgeval` scorer.
 - Use `judgeval`'s batched evaluation infrastructure to execute **scalable evaluation runs**.
@@ -78,7 +79,6 @@ You can optionally set the self.reason attribute, depending on your preference.
 </Note>
 
 These methods are the core of your scorer, and you can implement them in any way you want. **Be creative!**
-Check out this list of examples our users have implemented if you need inspiration: TODO add link here
 
 #### Handling Errors
 If you want to handle errors gracefully, you can use a `try` block and in the `except` block, set the `self.error` attribute to the error message.
@@ -144,11 +144,37 @@ class SampleScorer(JudgevalScorer):
     def __name__(self):
         return "Sample Scorer"
 ```
-
 **Congratulations!** 🎉
 
 You've made your first custom judgeval scorer! Now that your scorer is implemented, you can run it on your own datasets
 just like any other `judgeval` scorer. Your scorer is fully integrated with `judgeval`'s infrastructure so you can view it on
 the [Judgment platform](/judgment/introduction) too.
 
-
+## Using a Custom Scorer
+
+Once you've implemented your custom scorer, you can use it in the same way as any other scorer in `judgeval`.
+They can be run in conjunction with other scorers in a single evaluation run!
+
+```python run_custom_scorer.py
+from judgeval import JudgmentClient
+from your_custom_scorer import SampleScorer
+
+client = JudgmentClient()
+sample_scorer = SampleScorer()
+
+results = client.run_evaluation(
+    examples=[example1],
+    scorers=[sample_scorer],
+    model="gpt-4o"
+)
+```
+
+## Real World Examples
+
+You can find some real world examples of how our community has used custom `JudgevalScorer`s to evaluate their LLM systems in our [cookbook repository](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/custom_scorers)!
+Here are some of our favorites:
+
+- [Code Style Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/code_style_scorer.py) - Evaluates code quality and style
+- [Cold Email Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py) - Evaluates the effectiveness of cold emails
+
+For more examples and detailed documentation on custom scorers, check out our [Custom Scorers Cookbook](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/README.md).
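Note: the "Handling Errors" hunk above describes wrapping scoring logic in a `try` block and recording failures on `self.error`, but the pattern itself is not shown in this diff. A minimal sketch of what it might look like; the `score_example` method name and the `score`/`success`/`threshold` attributes are assumptions modeled on the docs' `SampleScorer`, and only `self.error` and `self.reason` are confirmed by the hunk:

```python
from judgeval.scorers import JudgevalScorer  # import path assumed

class SafeScorer(JudgevalScorer):
    def score_example(self, example):  # method name assumed from the SampleScorer docs
        try:
            # hypothetical custom metric logic
            self.score = len(example.actual_output) / 100
            self.success = self.score >= self.threshold
        except Exception as e:
            # the pattern the hunk describes: record the message instead of raising,
            # so the evaluation run can report the failure rather than crash
            self.error = str(e)
            self.success = False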
judgeval-0.0.27/src/demo/new_trace/example_complex_async.py (new file)

@@ -0,0 +1,232 @@
+import asyncio
+import time
+import sys
+import os
+import functools
+from unittest.mock import MagicMock, patch
+from typing import Dict, Optional, List
+import uuid
+import json
+
+# Standard library imports needed for the new class
+import concurrent.futures
+import contextvars
+# Needed for partial in the executor
+
+# Add src directory to Python path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+# Import and mock necessary components before initializing the tracer
+from judgeval.common.tracer import Tracer, JudgmentClient, TraceClient, current_trace_var, TraceEntry, TraceManagerClient, TraceThreadPoolExecutor  # Import the new class
+
+# Initialize the tracer with test values
+tracer = Tracer(
+    project_name="complex_async_test"
+)
+
+# In this example, we'll use a single trace with spans for all function calls
+@tracer.observe(name="root_function")
+async def root_function():
+    print("Root function starting")
+
+    # Direct await call to level 2
+    result1 = await level2_function("direct")
+
+    # Parallel calls (gather) to level 2 functions
+    # These should be level 2 - direct children of root
+    # Create two truly parallel functions that both have root_function as parent
+    level2_parallel1_task = level2_parallel1("gather1")
+    level2_parallel2_task = level2_parallel2("gather2")
+
+    # Use trace_gather instead of asyncio.gather to preserve context
+    # This ensures parent-child relationships are maintained in parallel tasks
+    # result2, result3 = await trace_gather(level2_parallel1_task, level2_parallel2_task) # OLD
+    result2, result3 = await asyncio.gather(level2_parallel1_task, level2_parallel2_task)  # Use standard gather
+
+
+    print("Root function completed")
+    return f"Root results: {result1}, {result2}, {result3}"
+
+# Level 2 - Direct child of root
+# Using observe with same tracer - this will create spans in the parent trace
+@tracer.observe()
+async def level2_function(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 2 function with {param}")
+
+    # Call to level 3
+    result = await level3_function(f"{param}_child")
+
+    return f"level2:{result}"
+
+# Level 2 - First parallel function
+@tracer.observe()
+async def level2_parallel1(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 2 parallel 1 with {param}")
+
+    # This parallel function makes another parallel call to level 3 functions
+    # These should be direct children of level2_parallel1
+    # r1, r2 = await trace_gather( # OLD
+    r1, r2 = await asyncio.gather(  # Use standard gather
+        level3_parallel1(f"{param}_1"),
+        level3_parallel2(f"{param}_2")
+    )
+
+    return f"level2_parallel1:{r1},{r2}"
+
+# Level 2 - Second parallel function
+@tracer.observe()
+async def level2_parallel2(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 2 parallel 2 with {param}")
+
+    # Direct await to level 3
+    result = await level3_function(f"{param}_direct")
+
+    return f"level2_parallel2:{result}"
+
+# Level 3 - Child of level 2 direct
+@tracer.observe()
+async def level3_function(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 3 function with {param}")
+
+    # Call to level 4
+    result = await level4_function(f"{param}_deep")
+
+    return f"level3:{result}"
+
+# Level 3 - First parallel function called by level2_parallel1
+@tracer.observe()
+async def level3_parallel1(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 3 parallel 1 with {param}")
+
+    # This makes a nested gather call with level 4 functions
+    # results = await trace_gather( # OLD
+    results = await asyncio.gather(  # Use standard gather
+        level4_function(f"{param}_a"),
+        level4_function(f"{param}_b"),
+        level4_function(f"{param}_c")
+    )
+
+    return f"level3_p1:{','.join(results)}"
+
+# Level 3 - Second parallel function called by level2_parallel1
+@tracer.observe()
+async def level3_parallel2(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 3 parallel 2 with {param}")
+    await asyncio.sleep(0.1)
+
+    # Direct call to level 4
+    result = await level4_deep_function(f"{param}_deep")
+
+    return f"level3_p2:{result}"
+
+# Level 4 - Deepest regular function
+@tracer.observe()
+async def level4_function(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 4 function with {param}")
+    await asyncio.sleep(0.05)
+
+    return f"level4:{param}"
+
+# Level 4 - Deep function that calls level 5
+@tracer.observe()
+async def level4_deep_function(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 4 deep function with {param}")
+
+    # Call to level 5 (maximum depth)
+    result = await level5_function(f"{param}_final")
+    test = await fib(5)
+    return f"level4_deep:{result}"
+
+@tracer.observe()
+async def fib(n):
+    if n <= 1:
+        return n
+    return await fib(n-1) + await fib(n-2)
+
+# Level 5 - Deepest level
+@tracer.observe()
+async def level5_function(param):
+    # Capture this function in a span within the current trace
+    print(f"Level 5 function with {param}")
+    await asyncio.sleep(0.05)
+
+    return f"level5:{param}"
+
+# --- Synchronous ThreadPoolExecutor Test ---
+
+@tracer.observe(name="sync_child_task1")
+def sync_child_task1(param):
+    """A simple synchronous function to be run in a thread."""
+    print(f"SYNC CHILD 1: Received {param}. Sleeping...")
+    time.sleep(0.15)
+    result = f"Result from sync_child_task1 with {param}"
+    print("SYNC CHILD 1: Done.")
+    return result
+
+@tracer.observe(name="sync_child_task2")
+def sync_child_task2(param1, param2):
+    """Another simple synchronous function."""
+    print(f"SYNC CHILD 2: Received {param1} and {param2}. Sleeping...")
+    time.sleep(0.05)
+    result = f"Result from sync_child_task2 with {param1}, {param2}"
+    print("SYNC CHILD 2: Done.")
+    return result
+
+@tracer.observe(name="sync_parent_func")
+def sync_parent_func():
+    """This function uses TraceThreadPoolExecutor to run sync tasks."""
+    print("SYNC PARENT: Starting...")
+    results = []
+    # Use the TraceThreadPoolExecutor instead of the standard one
+    with TraceThreadPoolExecutor(max_workers=2) as executor:
+        print("SYNC PARENT: Submitting tasks to TraceThreadPoolExecutor...")
+        future1 = executor.submit(sync_child_task1, "data_for_task1")
+        future2 = executor.submit(sync_child_task2, "data1_for_task2", "data2_for_task2")
+
+        print("SYNC PARENT: Waiting for futures...")
+        # Wait for futures and collect results (demonstrates typical usage)
+        for future in concurrent.futures.as_completed([future1, future2]):
+            try:
+                results.append(future.result())
+            except Exception as exc:
+                print(f"SYNC PARENT: Generated an exception: {exc}")
+                results.append(f"Error: {exc}")
+
+    print("SYNC PARENT: Finished.")
+    return results
+
+# --- End Synchronous Test ---
+
+async def main():
+    # Run the root function which has deep nesting and nested parallel calls
+    start_time = time.time()
+    result_async = await root_function()
+    end_time = time.time()
+    print(f"\nAsync Final result: {result_async}")
+    print(f"Async Total execution time: {end_time - start_time:.2f} seconds")
+
+    print("\n" + "="*20 + " Starting Sync ThreadPool Test " + "="*20 + "\n")
+
+    # --- Run the synchronous thread pool test ---
+    # Note: We run this *outside* the async root_function's trace
+    # If we wanted it nested, we'd need @tracer.observe on main or call it from root_function
+    # For simplicity, let's trace it separately by calling it directly.
+    # The @tracer.observe on sync_parent_func will create its own root trace.
+    start_time_sync = time.time()
+    result_sync = sync_parent_func()  # This will be traced automatically
+    end_time_sync = time.time()
+    print(f"\nSync Final results: {result_sync}")
+    print(f"Sync Total execution time: {end_time_sync - start_time_sync:.2f} seconds")
+    # --- End synchronous test call ---
+
+if __name__ == "__main__":
+    # Run the complex async example
+    asyncio.run(main())
{judgeval-0.0.26 → judgeval-0.0.27}/src/demo/travel_agent.py

@@ -84,7 +84,7 @@ Key Information:
 ]
 
 client = wrap(openai.Client(api_key=os.getenv("OPENAI_API_KEY")))
-judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
+judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo", enable_evaluations=False, enable_monitoring=False)
 
 def populate_vector_db(collection, destinations_data):
     """