judgeval 0.0.12__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.12 → judgeval-0.0.13}/PKG-INFO +1 -1
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/api_reference/trace.mdx +11 -4
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/getting_started.mdx +43 -20
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/tracing.mdx +87 -62
- {judgeval-0.0.12 → judgeval-0.0.13}/pyproject.toml +1 -1
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/tracer.py +25 -2
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/constants.py +2 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/dataset.py +2 -1
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/eval_dataset_client.py +106 -9
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/example.py +13 -5
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judgment_client.py +29 -6
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/run_evaluation.py +16 -5
- {judgeval-0.0.12 → judgeval-0.0.13}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/.gitignore +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/LICENSE.md +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/Pipfile +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/Pipfile.lock +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/README.md +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/README.md +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/development.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/favicon.svg +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/trace_screenshot.png +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/introduction.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/logo/light.svg +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/mint.json +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/pytest.ini +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/ci_testing/ci_testing.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/ci_testing/travel_response.txt +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/custom_scorers/text2sql.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/new_bot/basic_bot.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/tools.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/basic_test.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/cstone_data.csv +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/data.csv +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/faithfulness_testing.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/galen_data.csv +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/playground.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/results.csv +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/tracer/__init__.py +0 -0
docs/api_reference/trace.mdx
@@ -12,27 +12,34 @@ The `Tracer` class is used to trace the execution of your LLM system.
 ```python
 from judgeval.common.tracer import Tracer
 
-tracer = Tracer()
+tracer = Tracer(project_name="my_project")
 ```
 
 <Note>
-The `Tracer` class is a singleton, so you only need to initialize it once in your application.
+The `Tracer` class is a singleton, so you only need to initialize it once in your application.
+The `project_name` enables you to group traces by workflow, keeping all your evaluations and
+observability tooling in one place.
 </Note>
 
-##
+## Explicitly exporting traces
 
 When using the `.trace()` context manager, you can control how your traces are exported to the Judgment platform by
 providing the `project_name` argument. This allows you to group traces by workflow, keeping all your evaluations and
 observability tooling in one place.
 
 ```python
-with tracer.trace(
+with tracer.trace(
+    name="my_workflow",
+    project_name="my_project",
+    overwrite=True
+) as trace:
     ...
 ```
 
 `.trace()` has the following args:
 - `name`: The name of the trace. Can be make unique to each workflow run by using a timestamp or other unique identifier.
 - `project_name`: The name of the project to use for the trace. Used to group traces by workflow.
+- `overwrite`: Whether to overwrite the trace with the same `name` if it already exists.
 
 The `trace()` context manager yields a `TraceClient` object.
 
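For orientation while reading the docs hunk above, here is a minimal sketch (illustrative, not taken from the package) that ties the three documented `.trace()` arguments together; the timestamped name simply follows the docs' suggestion for keeping each run unique.

```python
from datetime import datetime

from judgeval.common.tracer import Tracer

tracer = Tracer(project_name="my_project")

# A timestamp keeps the trace name unique per run, as the docs suggest;
# overwrite=True replaces an existing trace that shares the same name.
run_name = f"my_workflow_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

with tracer.trace(name=run_name, project_name="my_project", overwrite=True) as trace:
    ...  # workflow logic observed within this trace
```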
docs/getting_started.mdx
@@ -32,7 +32,7 @@ large-scale evaluations. [Contact us](mailto:contact@judgmentlabs.ai) if you're
 sensitive data that has to reside in your private VPCs.
 </Note>
 
-# Create
+# Create Your First Evaluation
 
 ```python sample_eval.py
 from judgeval import JudgmentClient
@@ -68,6 +68,48 @@ is a scorer that checks if the output is hallucinated relative to the retrieved
 To learn more about using the Judgment Client to run evaluations, click [here](/api_reference/judgment_client).
 </Tip>
 
+# Create Your First Trace
+
+Beyond experimentation, `judgeval` supports monitoring your LLM systems in **production**.
+Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
+Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.
+
+```python trace_example.py
+from judgeval.common.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
+
+@judgment.observe(span_type="function")
+def main():
+    res = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[{"role": "user", "content": f"{my_tool()}"}]
+    )
+    return res.choices[0].message.content
+```
+
+
+<div style={{display: 'flex', justifyContent: 'center'}}>
+
+</div>
+
+There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
+- Detecting hallucinations and other quality issues **before they reach your customers**
+- Automatically creating experimental datasets from your **real-world production cases** for future improvement/optimization
+- Track and create alerts on **any metric** (e.g. latency, cost, hallucination, etc.)
+
+<Tip>
+To learn more about `judgeval`'s tracing module, click [here](/tracing/introduction).
+</Tip>
+
+
+
 # Create Your First Scorer
 `judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made, classifier scorers, and custom scorers.
 
@@ -264,22 +306,3 @@ A `Project` keeps track of `Evaluation Run`s in your project. Each `Evaluation R
 You can try different models (e.g. `gpt-4o`, `claude-3-5-sonnet`, etc.) and prompt templates in each `Evaluation Run` to find the
 optimal setup for your LLM system.
 </Tip>
-
-## Monitoring LLM Systems in Production
-
-Beyond experimenting and measuring historical performance, `judgeval` supports monitoring your LLM systems in **production**.
-Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
-Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.
-
-<div style={{display: 'flex', justifyContent: 'center'}}>
-
-</div>
-
-There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
-- Detecting hallucinations and other quality issues **before they reach your customers**
-- Automatically creating experimental datasets from your **real-world production cases** for future improvement/optimization
-- Track and create alerts on **any metric** (e.g. latency, cost, hallucination, etc.)
-
-<Tip>
-To learn more about `judgeval`'s tracing module, click [here](/tracing/introduction).
-</Tip>
docs/monitoring/tracing.mdx
@@ -18,24 +18,25 @@ Using tracing, you can:
 
 ## Tracing Your Workflow ##
 
-Setting up tracing with `judgeval` takes
+Setting up tracing with `judgeval` takes two simple steps:
 
-### 1. Initialize a tracer with your API key
+### 1. Initialize a tracer with your API key and project name
 
 ```python
 from judgeval.common.tracer import Tracer
 
-judgment = Tracer() # loads from JUDGMENT_API_KEY env var
+judgment = Tracer(project_name="my_project") # loads from JUDGMENT_API_KEY env var
 ```
 
 <Note>
-The [Judgment tracer](/api_reference/trace) is a singleton object that should be shared across your application.
+The [Judgment tracer](/api_reference/trace) is a singleton object that should be shared across your application.
+Your project name will be used to organize your traces in one place on the Judgment platform.
 </Note>
 
 
 ### 2. Wrap your workflow components
 
-`judgeval` provides
+`judgeval` provides wrapping mechanisms for your workflow components:
 
 #### `wrap()` ####
 The `wrap()` function goes over your LLM client (e.g. OpenAI, Anthropic, etc.) and captures metadata surrounding your LLM calls, such as:
@@ -44,6 +45,14 @@ The `wrap()` function goes over your LLM client (e.g. OpenAI, Anthropic, etc.) a
 - Prompt/Completion
 - Model name
 
+Here's an example of using `wrap()` on an OpenAI client:
+```python
+from openai import OpenAI
+from judgeval.common.tracer import wrap
+
+client = wrap(OpenAI())
+```
+
 #### `@observe` ####
 The `@observe` decorator wraps your functions/tools and captures metadata surrounding your function calls, such as:
 - Latency
@@ -63,30 +72,20 @@ def my_tool():
 ```
 
 <Note>
-
-on
+`span_type` is a string that you can use to categorize and organize your trace spans.
+Span types are displayed on the trace UI to easily nagivate a visualization of your workflow.
+Common span types include `tool`, `function`, `retriever`, `database`, `web search`, etc.
 </Note>
 
-#### `context manager` ####
-
-In your main function (e.g. the one that executes the primary workflow logic), you can use the `with judgment.trace()` context manager to trace the entire workflow.
-
-The context manager can **save/print the state of the trace at any point in the workflow**.
-This is useful for debugging or exporting any state of your workflow to run an evaluation from!
-
-<Tip>
-The `with judgment.trace()` context manager detects any `@observe` decorated functions or wrapped LLM calls within the context and automatically captures their metadata.
-</Tip>
-
 
 #### Putting it all Together
-Here's a complete example of using
+Here's a complete example of using judgeval's tracing mechanisms:
 ```python
 from judgeval.common.tracer import Tracer, wrap
 from openai import OpenAI
 
 openai_client = wrap(OpenAI())
-judgment = Tracer() # loads from JUDGMENT_API_KEY env var
+judgment = Tracer(project_name="my_project") # loads from JUDGMENT_API_KEY env var
 
 @judgment.observe(span_type="tool")
 def my_tool():
@@ -101,28 +100,10 @@ def my_llm_call():
     )
     return res.choices[0].message.content
 
+@judgment.observe(span_type="function")
 def main():
-
-
-project_name="my_project"
-) as trace:
-res = my_llm_call()
-trace.save()
-trace.print()
-return res
-```
-
-The printed trace appears as follows on the terminal:
-```
-→ main_workflow (trace: main_workflow)
-→ my_llm_call (trace: my_llm_call)
-Input: {'args': [], 'kwargs': {}}
-→ my_tool (trace: my_tool)
-Input: {'args': [], 'kwargs': {}}
-Output: Hello world!
-← my_tool (0.000s)
-Output: Hello! How can I assist you today?
-← my_llm_call (0.789s)
+    res = my_llm_call()
+    return res
 ```
 
 And the trace will appear on the Judgment platform as follows:
@@ -142,32 +123,27 @@ To execute an asynchronous evaluation, you can use the `trace.async_evaluate()`
 
 ```python
 from judgeval.common.tracer import Tracer
-from judgeval.scorers import
+from judgeval.scorers import AnswerRelevancyScorer
 
-judgment = Tracer()
+judgment = Tracer(project_name="my_project")
 
+@judgment.observe(span_type="function")
 def main():
-
-
-
-)
-
-
-
-
-
-
-actual_output=res,
-retrieval_context=[retrieved_info],
-model="gpt-4o-mini",
-)
-return res
+    query = "What is the capital of France?"
+    res = ... # Your workflow logic
+
+    judgment.get_current_trace().async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        input="",
+        actual_output=res,
+        model="gpt-4o",
+    )
+    return res
 ```
 
 <Tip>
-
-
-for more information.
+Your async evaluations will be logged to the Judgment platform as part of the original trace and
+a new evaluation will be created on the Judgment platform.
 </Tip>
 
 ## Example: OpenAI Travel Agent
@@ -183,4 +159,53 @@ In this video, we'll walk through all of the topics covered in this guide by tra
 allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
 referrerpolicy="strict-origin-when-cross-origin"
 allowfullscreen
-></iframe>
+></iframe>
+
+
+## Advanced: Customizing Traces Using the Context Manager ##
+
+If you need to customize your tracing context, you can use the `with judgment.trace()` context manager.
+
+The context manager can **save/print the state of the trace at any point in the workflow**.
+This is useful for debugging or exporting any state of your workflow to run an evaluation from!
+
+<Tip>
+The `with judgment.trace()` context manager detects any `@observe` decorated functions or wrapped LLM calls within the context and automatically captures their metadata.
+</Tip>
+
+Here's an example of using the context manager to trace a workflow:
+```python
+from judgeval.common.tracer import Tracer, wrap
+from openai import OpenAI
+
+judgment = Tracer(project_name="my_project")
+client = wrap(OpenAI())
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
+
+def main():
+    with judgment.trace(name="my_workflow") as trace:
+        res = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[{"role": "user", "content": f"{my_tool()}"}]
+        )
+
+        trace.print() # prints the state of the trace to console
+        trace.save() # saves the current state of the trace to the Judgment platform
+
+        return res.choices[0].message.content
+```
+
+<Warning>
+The `with judgment.trace()` context manager should only be used if you need to customize the context
+over which you're tracing. In most cases, you should trace using the `@observe` decorator.
+</Warning>
+
+
+
+
+
+
+
src/judgeval/common/tracer.py
@@ -199,10 +199,11 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -225,6 +226,7 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -248,6 +250,7 @@ class TraceManagerClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -263,11 +266,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -576,6 +580,25 @@ class TraceClient:
 
         self.trace_manager_client.save_trace(trace_data, empty_save)
 
+
+        # Save trace data by making POST request to API
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.tracer.api_key}" # Bearer token format
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
         return self.trace_id, trace_data
 
     def delete(self):
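For callers, the practical effect of the new save path above is that a 400 from the save endpoint now surfaces as a `ValueError` pointing at a trace-name conflict. A hedged sketch of one way calling code might react; the names and the retry strategy are illustrative, not part of the package:

```python
from datetime import datetime

from judgeval.common.tracer import Tracer

judgment = Tracer(project_name="my_project")

def traced_run():
    try:
        with judgment.trace(name="nightly_run", project_name="my_project") as trace:
            result = ...  # workflow logic
            trace.save()  # raises ValueError on a 400, per the handling added above
            return result
    except ValueError:
        # Retry under a unique name; passing overwrite=True to .trace() is the
        # other option documented in this release.
        unique_name = f"nightly_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        with judgment.trace(name=unique_name, project_name="my_project") as trace:
            result = ...  # workflow logic
            trace.save()
            return result
```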
src/judgeval/constants.py
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
+JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
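As a quick sanity check, a small sketch of what the two new constants resolve to, mirroring the `ROOT_API` lookup shown in this hunk's context; the printed URLs assume no `JUDGMENT_API_URL` override is set.

```python
import os

# Mirrors the resolution shown in the hunk context above.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"

print(JUDGMENT_DATASETS_EXPORT_JSONL_API_URL)  # https://api.judgmentlabs.ai/datasets/export_jsonl/
print(JUDGMENT_DATASETS_EDIT_API_URL)          # https://api.judgmentlabs.ai/datasets/edit/
```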
src/judgeval/data/datasets/dataset.py
@@ -162,7 +162,8 @@ class EvalDataset:
                 "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
                 "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
                 "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
+                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
+                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
             }
             if row["example"]:
                 data["name"] = row["name"] if pd.notna(row["name"]) else None
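To make the CSV change concrete, a hedged sketch of a single row being folded into the dict the same way the hunk above does; the column names come from the hunk, the sample values are made up.

```python
import ast

import pandas as pd

# One illustrative row containing only the columns this hunk touches.
row = pd.Series({
    "additional_metadata": "{'source': 'csv'}",
    "tools_called": "search;summarize",
    "expected_tools": "search",
    "trace_id": None,
    "example_id": "3f1c0c1e-0000-0000-0000-000000000000",  # column consumed as of 0.0.13
})

data = {
    "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
    "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
    "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
    "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
    "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None,
}
print(data)
```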
src/judgeval/data/datasets/eval_dataset_client.py
@@ -1,5 +1,5 @@
 
-from typing import Optional
+from typing import Optional, List
 import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-    JUDGMENT_DATASETS_PULL_ALL_API_URL
+    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+    JUDGMENT_DATASETS_EDIT_API_URL,
+    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
@@ -23,7 +25,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -56,12 +58,16 @@ class EvalDatasetClient:
                 "ground_truths": [g.to_dict() for g in dataset.ground_truths],
                 "examples": [e.to_dict() for e in dataset.examples],
                 "overwrite": overwrite,
-                "judgment_api_key": dataset.judgment_api_key
+                # "judgment_api_key": dataset.judgment_api_key
             }
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    }
                 )
                 if response.status_code == 500:
                     error(f"Server error during push: {content.get('message')}")
@@ -115,13 +121,17 @@ class EvalDatasetClient:
             )
             request_body = {
                 "alias": alias,
-                "judgment_api_key": self.judgment_api_key
+                # "judgment_api_key": self.judgment_api_key
             }
 
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    }
                 )
                 response.raise_for_status()
             except requests.exceptions.RequestException as e:
@@ -169,13 +179,17 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
-                "judgment_api_key": self.judgment_api_key
+                # "judgment_api_key": self.judgment_api_key
             }
 
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                    json=request_body
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    }
                 )
                 response.raise_for_status()
             except requests.exceptions.RequestException as e:
@@ -191,3 +205,86 @@ class EvalDatasetClient:
             )
 
             return payload
+
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+
+        Mock request:
+        {
+            "alias": alias,
+            "examples": [...],
+            "ground_truths": [...],
+            "judgment_api_key": self.judgment_api_key
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
+                total=100,
+            )
+
+            content = {
+                "alias": alias,
+                "examples": [e.to_dict() for e in examples],
+                "ground_truths": [g.to_dict() for g in ground_truths],
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EDIT_API_URL,
+                    json=content
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error editing dataset: {str(e)}")
+                return False
+
+            info(f"Successfully edited dataset '{alias}'")
+            return True
+
+    def export_jsonl(self, alias: str) -> requests.Response:
+        """Export dataset in JSONL format from Judgment platform"""
+        debug(f"Exporting dataset with alias '{alias}' as JSONL")
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
+                total=100,
+            )
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
+                    json={"alias": alias},
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    },
+                    stream=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if err.response.status_code == 404:
+                    error(f"Dataset not found: {alias}")
+                else:
+                    error(f"HTTP error during export: {err}")
+                raise
+            except Exception as e:
+                error(f"Error during export: {str(e)}")
+                raise
+
+            info(f"Successfully exported dataset with alias '{alias}'")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return response
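A hedged usage sketch for the two methods added above. The import path follows the file layout in this diff, the `EvalDatasetClient(judgment_api_key=...)` constructor and the `Example(input=..., actual_output=...)` fields are assumptions, and the alias and file names are placeholders.

```python
from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="your-judgment-api-key")  # assumed constructor

# Append a freshly-created example to an existing dataset alias.
new_examples = [Example(input="What is the capital of France?", actual_output="Paris")]  # assumed fields
client.edit_dataset(alias="my_dataset", examples=new_examples, ground_truths=[])

# Stream the dataset back as JSONL and write it to disk; export_jsonl returns the
# raw streaming requests.Response per its signature above.
response = client.export_jsonl("my_dataset")
with open("my_dataset.jsonl", "wb") as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
```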
src/judgeval/data/example.py
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.
 
 
 from typing import TypeVar, Optional, Any, Dict, List
-from
+from uuid import uuid4
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime
+import time
 
 
 Input = TypeVar('Input')
@@ -33,15 +35,19 @@ class Example(BaseModel):
     tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     name: Optional[str] = None
-    example_id:
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if
-
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        super().__init__(**data)
+
 
     def to_dict(self):
         return {
@@ -55,6 +61,7 @@ class Example(BaseModel):
             "expected_tools": self.expected_tools,
             "name": self.name,
             "example_id": self.example_id,
+            "example_index": self.example_index,
             "timestamp": self.timestamp,
             "trace_id": self.trace_id
         }
@@ -71,6 +78,7 @@ class Example(BaseModel):
             f"expected_tools={self.expected_tools}, "
             f"name={self.name}, "
             f"example_id={self.example_id}, "
+            f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
             f"trace_id={self.trace_id})"
         )
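To round off the `Example` changes, a hedged sketch of the new defaults in use; `input` and `actual_output` are assumed field names, while `example_id`, `example_index`, and `timestamp` behave as shown in the hunks above.

```python
from judgeval.data import Example

# example_id is now auto-populated with a uuid4 string, and timestamp falls back to
# the current time formatted as "%Y%m%d_%H%M%S" when neither is supplied.
ex = Example(input="What is the capital of France?", actual_output="Paris")  # assumed fields
print(ex.example_id)     # e.g. 'd6f4c9a2-...' (random per instance)
print(ex.timestamp)      # e.g. '20250101_120000'
print(ex.example_index)  # None until an ordering is assigned explicitly

# Both can still be provided explicitly, e.g. when round-tripping through a CSV.
ex2 = Example(input="2 + 2?", actual_output="4", example_id="fixed-id", example_index=0)
print(ex2.example_id, ex2.example_index)
```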