judgeval 0.0.35__tar.gz → 0.0.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.35 → judgeval-0.0.36}/.github/workflows/ci.yaml +3 -3
- {judgeval-0.0.35 → judgeval-0.0.36}/PKG-INFO +1 -2
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/api_reference/judgment_client.mdx +46 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/api_reference/trace.mdx +8 -41
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/clustering/clustering.mdx +6 -2
- judgeval-0.0.36/docs/compliance/certifications.mdx +47 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/data_datasets.mdx +68 -0
- judgeval-0.0.36/docs/evaluation/experiment_comparisons.mdx +143 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/getting_started.mdx +44 -11
- judgeval-0.0.36/docs/images/experiment-comparison-page-2.png +0 -0
- judgeval-0.0.36/docs/images/experiment-page-comparison.png +0 -0
- judgeval-0.0.36/docs/images/experiment-popout-comparison.png +0 -0
- judgeval-0.0.36/docs/images/experiments-page-comparison-2.png +0 -0
- judgeval-0.0.36/docs/images/experiments-page-comparison.png +0 -0
- judgeval-0.0.36/docs/images/export-dataset.png +0 -0
- judgeval-0.0.36/docs/images/synth_data_button.png +0 -0
- judgeval-0.0.36/docs/images/synth_data_window.png +0 -0
- judgeval-0.0.36/docs/integration/langgraph.mdx +126 -0
- judgeval-0.0.36/docs/judgment_cli/installation.mdx +91 -0
- judgeval-0.0.36/docs/judgment_cli/self-hosting.mdx +190 -0
- judgeval-0.0.36/docs/judgment_cli/supabase-org-id.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/mint.json +35 -3
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/monitoring/tracing.mdx +71 -24
- judgeval-0.0.36/docs/monitoring/tracing_s3.mdx +60 -0
- judgeval-0.0.35/docs/mcp_server/mcp_server.mdx → judgeval-0.0.36/docs/optimization/osiris_agent.mdx +29 -17
- judgeval-0.0.36/docs/self_hosting/get_started.mdx +73 -0
- judgeval-0.0.36/docs/synthetic_data/synthetic_data.mdx +66 -0
- judgeval-0.0.36/docusaurus/my-website/sidebars.ts +1 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/pyproject.toml +2 -3
- judgeval-0.0.36/src/demo/demo.py +50 -0
- judgeval-0.0.36/src/demo/hehe.py +19 -0
- judgeval-0.0.36/src/demo/human_in_the_loop/human_in_the_loop.py +195 -0
- judgeval-0.0.36/src/demo/human_in_the_loop/test.yaml +17 -0
- judgeval-0.0.36/src/demo/langgraph_demo.py +269 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/travel_agent.py +20 -10
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/tracer.py +352 -118
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/constants.py +3 -2
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/datasets/dataset.py +3 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/datasets/eval_dataset_client.py +63 -3
- judgeval-0.0.36/src/judgeval/integrations/langgraph.py +1999 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judgment_client.py +8 -2
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/run_evaluation.py +67 -18
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/score.py +1 -0
- judgeval-0.0.36/src/output.txt +276 -0
- judgeval-0.0.36/src/test.py +249 -0
- judgeval-0.0.35/docs/integration/langgraph.mdx +0 -53
- judgeval-0.0.35/src/demo/demo.py +0 -54
- judgeval-0.0.35/src/judgeval/integrations/langgraph.py +0 -337
- judgeval-0.0.35/src/test.py +0 -143
- {judgeval-0.0.35 → judgeval-0.0.36}/.github/pull_request_template.md +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/.gitignore +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/LICENSE.md +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/Pipfile +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/Pipfile.lock +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/README.md +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/README.md +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/alerts/notifications.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/alerts/platform_notifications.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/alerts/rules.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/changelog/2025-04-21.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/development.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/data_sequences.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/comparison.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/default/summarization.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/favicon.svg +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/annotation_queue_ui.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/cluster.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/cluster_button.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/dashboard_annotation_queue_button.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/notifications_page.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/reports_modal.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/introduction.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/logo/light.svg +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/monitoring/annotations.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/pytest.ini +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/custom_scorer/main.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/custom_scorer/scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/dataset.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/demo2.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/new_bot/basic_bot.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/simple_trace.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/simplified_tracing/example_complex_async.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/streaming_anthropic_demo.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/streaming_openai_demo.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/demo/test.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/s3_storage.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/custom_example.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/sequence.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/data/sequence_run.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.0.35 → judgeval-0.0.36}/src/judgeval/version_check.py +0 -0

{judgeval-0.0.35 → judgeval-0.0.36}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.35
+Version: 0.0.36
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,7 +11,6 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
-Requires-Dist: boto3==1.38.3
 Requires-Dist: fastapi
 Requires-Dist: google-genai
 Requires-Dist: langchain

{judgeval-0.0.35 → judgeval-0.0.36}/docs/api_reference/judgment_client.mdx
@@ -31,6 +31,8 @@ const client = JudgmentClient.getInstance();
 
 ## Running an Evaluation
 
+### Example Level
+
 The `client.run_evaluation` (Python) or `client.evaluate` (Typescript) method is the primary method for executing evaluations.
 
 <CodeGroup>
@@ -99,3 +101,47 @@ In Judgment, **projects** are used to organize workflows, while **evaluation run
 are used to group versions of a workflow for comparative analysis of evaluations.
 As a result, you can think of projects as folders, and evaluation runs as sub-folders that contain evaluation results.
 </Tip>
+
+
+### Sequence Level
+
+The `client.run_sequence_evaluation` (Python) or `client.evaluateSequence` (Typescript) method is the primary method for executing sequence evaluations.
+
+<CodeGroup>
+```Python Python
+from judgeval import JudgmentClient
+from judgeval.data import Example, Sequence
+from judgeval.scorers import DerailmentScorer
+
+client = JudgmentClient()
+
+airlines_example = Example(
+    input="Which airlines fly to Paris?",
+    actual_output="Air France, Delta, and American Airlines offer direct flights."
+)
+airline_followup = Example(
+    input="Which airline is the best for a family of 4?",
+    actual_output="Delta is the best airline for a family of 4."
+)
+weather_example = Example(
+    input="What is the weather like in Texas?",
+    actual_output="It's sunny with a high of 75°F in Texas."
+)
+airline_sequence = Sequence(
+    name="Flight Details",
+    items=[airlines_example, airline_followup, weather_example]
+)
+
+results = client.run_sequence_evaluation(
+    sequences=[airline_sequence],
+    scorers=[DerailmentScorer(threshold=0.5)],
+    model="gpt-4o",
+    log_results=True,
+    override=True,
+)
+```
+</CodeGroup>
+
+The `run_sequence_evaluation` (Python) / `evaluateSequence` (Typescript) method accepts the same arguments as the `run_evaluation` (Python) / `evaluate` (Typescript) method, with the following changes to the arguments:
+
+- `sequences`: A list/array of [Sequence](/evaluation/data_examples) objects to evaluate (instead of 'examples')

{judgeval-0.0.35 → judgeval-0.0.36}/docs/api_reference/trace.mdx
@@ -92,34 +92,24 @@ The `TraceClient` object manages the context of a single trace context (or workf
 
 ## Tracing functions (`@observe` / `observe()`)
 
-
-
+Each intermediate function or coroutine you want to trace is wrapped with the `@judgment.observe()` decorator (Python) or the `tracer.observe()` higher-order function (Typescript).
 **If you use multiple decorators in Python**, the `@judgment.observe()` decorator should be the innermost decorator to preserve functionality.
 
-Here's an example using
+Here's an example using `observe`:
 
 <CodeGroup>
 ```Python Python
-#
-
-
-# Only need to observe the top-level function
-@judgment.observe(span_type="function")
-def main_workflow(query: str):
-    # All function calls inside will be automatically traced
-    result = my_tool(query)
-    return process_result(result)
+# Assume judgment = Tracer(...) exists
+from langchain.tools import tool # Example other decorator
 
-
+@tool
+@judgment.observe(span_type="tool")
 def my_tool(query: str):
+    # ... tool logic ...
     print(f"Tool executed with query: {query}")
    return "Tool result"
 
-#
-def process_result(result: str):
-    return f"Processed: {result}"
-
-# Calling main_workflow("some query") will trace the entire call stack
+# Calling my_tool("some query") will now be traced.
 ```
 ```Typescript Typescript
 // Assume tracer = Tracer.getInstance(...) exists
@@ -144,29 +134,6 @@ const observedMyTool = tracer.observe({ spanType: "tool" })(myTool);
 ```
 </CodeGroup>
 
-You can also disable deep tracing if you prefer manual control:
-
-<CodeGroup>
-```Python Python
-# Disable deep tracing
-judgment = Tracer(project_name="my_project")
-
-# Even with deep tracing globally enabled (default)
-# You can disable it for specific functions so judgment would not trace any functions this selective_function calls
-@judgment.observe(span_type="function", deep_tracing=False)
-def selective_function():
-    helper_function() # Won't be traced automatically
-    return "Done"
-```
-```Typescript Typescript
-// Disable deep tracing
-const judgment = Tracer.getInstance({
-  projectName: "my_project",
-  deepTracing: false // Disable automatic deep tracing
-});
-```
-</CodeGroup>
-
 The `span_type` / `spanType` argument can be used to categorize the type of span for observability purposes and will be displayed
 on the Judgment platform:
 

{judgeval-0.0.35 → judgeval-0.0.36}/docs/clustering/clustering.mdx
@@ -14,7 +14,9 @@ Clustering visualization helps you:
 - Explore data points to understand cluster characteristics
 - Compare results across different evaluation sets, traces, or datasets
 
-
+<Frame>
+  <img src="/images/cluster.png" alt="Clustering visualization example" />
+</Frame>
 
 ## Accessing Clustering Visualization
 
@@ -31,7 +33,9 @@ You can access clustering visualization in three different contexts:
 1. **Select a project**: Choose your project from the projects page of the platform website.
 2. **Choose data source**: From the project page, we can click into experiments, monitoring traces, or datasets and choose the option to cluster from within the respective pages. The visualization will display data from the specified source.
 
-
+<Frame>
+  <img src="/images/cluster_button.png" alt="Clustering visualization button location" />
+</Frame>
 
 ### Interacting with the Visualization
 

judgeval-0.0.36/docs/compliance/certifications.mdx
@@ -0,0 +1,47 @@
+---
+title: Security & Compliance
+---
+
+At Judgment Labs, we take security and compliance seriously. We maintain rigorous standards to protect our customers' data and ensure the highest level of service reliability.
+
+## SOC 2 Compliance
+
+### Type 1 Certification
+We have successfully completed our SOC 2 Type 1 audit, demonstrating our commitment to security, availability, and confidentiality. This certification verifies that our security controls are appropriately designed and implemented.
+
+<Note>
+View our [SOC 2 Type 1 Report](https://app.delve.co/judgment-labs)
+</Note>
+
+### Type 2 Certification (In Progress)
+We are currently undergoing our SOC 2 Type 2 audit, which will validate the operational effectiveness of our security controls over time. This comprehensive audit examines our systems and processes over an extended period to ensure consistent adherence to security protocols.
+
+<Note>
+The SOC 2 Type 2 audit is expected to be completed in the coming months. Once completed, the report will be available through our [Delve compliance portal](https://app.delve.co/judgment-labs).
+</Note>
+
+## HIPAA Compliance
+
+We maintain HIPAA compliance to ensure the security and privacy of protected health information (PHI). Our infrastructure and processes are designed to meet HIPAA's strict requirements for:
+- Data encryption
+- Access controls
+- Audit logging
+- Data backup and recovery
+- Security incident handling
+
+<Tip>
+Access our [HIPAA Compliance Report](https://app.delve.co/judgment-labs) through our compliance portal. If you're working with healthcare data, please contact our team at contact@judgmentlabs.ai to discuss your specific compliance needs.
+</Tip>
+
+## Our Commitment
+
+Our security and compliance certifications reflect our commitment to:
+- Protecting customer data
+- Maintaining system availability
+- Ensuring process integrity
+- Preserving confidentiality
+- Following industry best practices
+
+For detailed information about our security practices or compliance certifications, please:
+1. Visit our [Compliance Portal](https://app.delve.co/judgment-labs)
+2. Contact our security team at contact@judgmentlabs.ai

{judgeval-0.0.35 → judgeval-0.0.36}/docs/evaluation/data_datasets.mdx
@@ -279,6 +279,74 @@ const results = await client.evaluate({
 ```
 </CodeGroup>
 
+## Exporting Datasets
+
+You can export your datasets from the Judgment Platform UI for backup purposes or sharing with team members.
+
+### Export from Platform UI
+
+1. Navigate to your project in the [Judgment Platform](https://app.judgmentlabs.ai)
+2. Select the dataset you want to export
+3. Click the "Download Dataset" button in the top right
+4. The dataset will be downloaded as a JSON file
+
+<Frame>
+  <img src="/images/export-dataset.png" alt="Export Dataset" />
+</Frame>
+
+The exported JSON file contains the complete dataset information, including metadata and examples:
+
+```json
+{
+  "dataset_id": "f852eeee-87fa-4430-9571-5784e693326e",
+  "organization_id": "0fbb0aa8-a7b3-4108-b92a-cc6c6800d825",
+  "dataset_alias": "QA-Pairs",
+  "comments": null,
+  "source_file": null,
+  "created_at": "2025-04-23T22:38:11.709763+00:00",
+  "is_sequence": false,
+  "examples": [
+    {
+      "example_id": "119ee1f6-1046-41bc-bb89-d9fc704829dd",
+      "input": "How can I start meditating?",
+      "actual_output": null,
+      "expected_output": "Meditation is a wonderful way to relax and focus...",
+      "context": null,
+      "retrieval_context": null,
+      "additional_metadata": {
+        "synthetic": true
+      },
+      "tools_called": null,
+      "expected_tools": null,
+      "name": null,
+      "created_at": "2025-04-23T23:34:33.117479+00:00",
+      "dataset_id": "f852eeee-87fa-4430-9571-5784e693326e",
+      "eval_results_id": null,
+      "sequence_id": null,
+      "sequence_order": 0
+    },
+    // more examples...
+  ]
+}
+```
+
+Each example in the dataset contains:
+- `example_id`: Unique identifier for the example
+- `input`: The input query or prompt
+- `actual_output`: The response from your agent (if any)
+- `expected_output`: The expected response or ground truth
+- `context`: Additional context for the example
+- `retrieval_context`: Retrieved context used for RAG systems
+- `additional_metadata`: Custom metadata (e.g., whether the example is synthetic)
+- `tools_called`: Record of tools used in the response
+- `expected_tools`: Expected tool calls for the example
+- `created_at`: Timestamp of example creation
+- `sequence_order`: Order in sequence (if part of a sequence)
+
+<Note>
+When downloading datasets that contain sensitive information, make sure to follow your organization's data handling policies and store the exported files in secure locations.
+</Note>
+
 ## Conclusion
 
 Congratulations! 🎉
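
The export above is plain JSON, so it can be inspected without any SDK. Below is a minimal sketch that reads such a file, assuming only the fields shown in the sample export; the file name `QA-Pairs.json` is hypothetical.

```python
# Sketch: load a dataset exported from the Judgment Platform UI and list its examples.
import json

with open("QA-Pairs.json") as f:  # hypothetical name of the downloaded export
    dataset = json.load(f)

print(dataset["dataset_alias"], "contains", len(dataset["examples"]), "examples")
for ex in dataset["examples"]:
    # Synthetic examples are flagged in additional_metadata in the sample above.
    meta = ex.get("additional_metadata") or {}
    synthetic = bool(meta.get("synthetic"))
    print(f'synthetic={synthetic} input={ex["input"]!r} expected={ex["expected_output"]!r}')
```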

judgeval-0.0.36/docs/evaluation/experiment_comparisons.mdx
@@ -0,0 +1,143 @@
+---
+title: Experiment Comparisons
+description: "Learn how to A/B test changes in your LLM workflows using experiment comparisons."
+---
+
+# Introduction
+
+Experiment comparisons allow you to systematically A/B test changes in your LLM workflows. Whether you're testing different prompts, models, or architectures, Judgment helps you compare results across experiments to make data-driven decisions about your LLM systems.
+
+# Creating Your First Comparison
+
+Let's walk through how to create and run experiment comparisons:
+
+<CodeGroup>
+```Python Python
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import AnswerCorrectnessScorer
+
+client = JudgmentClient()
+
+# Define your test examples
+examples = [
+    Example(
+        input="What is the capital of France?",
+        actual_output="Paris is the capital of France.",
+        expected_output="Paris"
+    ),
+    Example(
+        input="What is the capital of Japan?",
+        actual_output="Tokyo is the capital of Japan.",
+        expected_output="Tokyo"
+    )
+]
+
+# Define your scorer
+scorer = AnswerCorrectnessScorer(threshold=0.7)
+
+# Run first experiment with GPT-4
+experiment_1 = client.run_evaluation(
+    examples=examples,
+    scorers=[scorer],
+    model="gpt-4",
+    project_name="capital_cities",
+    eval_name="gpt4_experiment"
+)
+
+# Run second experiment with a different model
+experiment_2 = client.run_evaluation(
+    examples=examples,
+    scorers=[scorer],
+    model="gpt-3.5-turbo",
+    project_name="capital_cities",
+    eval_name="gpt35_experiment"
+)
+```
+```Typescript Typescript
+import { JudgmentClient, ExampleBuilder, AnswerCorrectnessScorer } from 'judgeval';
+
+async function runComparativeExperiments() {
+  const client = JudgmentClient.getInstance();
+
+  // Define your test examples
+  const examples = [
+    new ExampleBuilder()
+      .input("What is the capital of France?")
+      .actualOutput("Paris is the capital of France.")
+      .expectedOutput("Paris")
+      .build(),
+    new ExampleBuilder()
+      .input("What is the capital of Japan?")
+      .actualOutput("Tokyo is the capital of Japan.")
+      .expectedOutput("Tokyo")
+      .build()
+  ];
+
+  // Define your scorer
+  const scorer = new AnswerCorrectnessScorer(0.7);
+
+  // Run first experiment with GPT-4
+  const experiment1 = await client.evaluate({
+    examples: examples,
+    scorers: [scorer],
+    model: "gpt-4",
+    projectName: "capital_cities",
+    evalName: "gpt4_experiment"
+  });
+
+  // Run second experiment with a different model
+  const experiment2 = await client.evaluate({
+    examples: examples,
+    scorers: [scorer],
+    model: "gpt-3.5-turbo",
+    projectName: "capital_cities",
+    evalName: "gpt35_experiment"
+  });
+}
+
+runComparativeExperiments();
+```
+</CodeGroup>
+
+After running this code, click the `View Results` link to go to your experiment run on the Judgment Platform.
+
+# Analyzing Results
+
+Once your experiments are complete, you can compare them on the Judgment Platform:
+
+1. You'll be automatically directed to your **Experiment page**. Here you'll see your latest experiment results and a "Compare" button.
+<div style={{display: 'flex', justifyContent: 'center'}}>
+<Frame>
+
+</Frame>
+</div>
+
+2. Click the "Compare" button to navigate to the **Experiments page**. Here you can select a previous experiment to compare against your current results.
+<div style={{display: 'flex', justifyContent: 'center'}}>
+<Frame>
+
+</Frame>
+</div>
+
+3. After selecting an experiment, you'll return to the **Experiment page** with both experiments' results displayed side by side.
+<div style={{display: 'flex', justifyContent: 'center'}}>
+<Frame>
+
+</Frame>
+</div>
+
+4. For detailed insights, click on any row in the comparison table to see specific metrics and analysis.
+<div style={{display: 'flex', justifyContent: 'center'}}>
+<Frame>
+
+</Frame>
+</div>
+
+<Tip>
+Use these detailed comparisons to make data-driven decisions about which model, prompt, or architecture performs best for your specific use case.
+</Tip>
+
+# Next Steps
+
+- To learn more about creating datasets to run on your experiments, check out our [Datasets](/evaluation/datasets) section
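
The two runs above differ only in `model` and `eval_name`, so the same A/B pattern extends to any number of variants by looping over `run_evaluation`. A minimal sketch, using only the arguments shown in the Python example above; the `eval_name` naming scheme is an assumption.

```python
# Sketch: one evaluation run per candidate model, all in the same project,
# so the runs can be compared side by side on the Judgment Platform.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerCorrectnessScorer

client = JudgmentClient()

examples = [
    Example(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        expected_output="Paris"
    ),
]
scorer = AnswerCorrectnessScorer(threshold=0.7)

for model in ["gpt-4", "gpt-3.5-turbo"]:  # candidate models to A/B test
    client.run_evaluation(
        examples=examples,
        scorers=[scorer],
        model=model,
        project_name="capital_cities",
        eval_name=f"{model}_experiment",  # assumed naming convention
    )
```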

{judgeval-0.0.35 → judgeval-0.0.36}/docs/getting_started.mdx
@@ -120,12 +120,14 @@ from judgeval.common.tracer import Tracer, wrap
 from openai import OpenAI
 
 client = wrap(OpenAI())
-judgment = Tracer(project_name="my_project")
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
 
-# With automatic deep tracing, you only need to observe the top-level function
 @judgment.observe(span_type="function")
 def main():
-    # my_tool will be automatically traced without @observe
     task_input = my_tool()
     res = client.chat.completions.create(
         model="gpt-4o",
@@ -133,10 +135,6 @@ def main():
     )
     return res.choices[0].message.content
 
-# No @observe needed - automatically traced when called from main
-def my_tool():
-    return "Hello world!"
-
 # Calling the observed function implicitly starts and saves the trace
 main()
 ```
@@ -183,8 +181,6 @@ runImplicitTrace();
 ```
 </CodeGroup>
 
-With automatic deep tracing, you only need to observe top-level functions, and all nested function calls will be automatically traced. This significantly reduces the amount of instrumentation needed in your code.
-
 Congratulations! You've just created your first trace. It should look like this:
 
 <div style={{display: 'flex', justifyContent: 'center'}}>
@@ -200,6 +196,40 @@ There are many benefits of monitoring your LLM systems with `judgeval` tracing,
 To learn more about `judgeval`'s tracing module, click [here](/tracing/introduction).
 </Tip>
 
+## Automatic Deep Tracing
+
+Judgeval supports automatic deep tracing, which significantly reduces the amount of instrumentation needed in your code. With deep tracing enabled (which is the default), you only need to observe top-level functions, and all nested function calls will be automatically traced.
+
+<CodeGroup>
+```Python Python
+from judgeval.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+# Define a function that will be automatically traced when called from main
+def helper_function():
+    return "This will be traced automatically"
+
+# Only need to observe the top-level function
+@judgment.observe(span_type="function")
+def main():
+    # helper_function will be automatically traced without @observe
+    result = helper_function()
+    res = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[{"role": "user", "content": result}]
+    )
+    return res.choices[0].message.content
+
+main()
+```
+</CodeGroup>
+
+To disable deep tracing, initialize the tracer with `deep_tracing=False`. You can still name and declare span types for each function using `judgment.observe()`.
+
+
 # Create Your First Online Evaluation
 
 In addition to tracing, `judgeval` allows you to run online evaluations on your LLM systems. This enables you to:
@@ -229,11 +259,14 @@ def main():
         messages=[{"role": "user", "content": f"{task_input}"}]
     ).choices[0].message.content
 
+    example = Example(
+        input=task_input,
+        actual_output=res
+    )
     # In Python, this likely operates on the implicit trace context
     judgment.async_evaluate(
         scorers=[AnswerRelevancyScorer(threshold=0.5)],
-
-        actual_output=res,
+        example=example,
         model="gpt-4o"
     )
 
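
The last hunk above moves `judgment.async_evaluate` from a bare `actual_output=` argument to an `Example` object passed via `example=`. A self-contained sketch of the new calling convention, assembled only from pieces shown elsewhere in this diff; the project name and the hard-coded response are placeholders.

```python
# Sketch of the 0.0.36 async_evaluate pattern: build an Example, then pass it
# via example= instead of the removed actual_output= argument.
from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer

judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="function")
def answer(question: str) -> str:
    res = "Paris is the capital of France."  # placeholder for a real LLM call
    example = Example(input=question, actual_output=res)
    judgment.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        example=example,
        model="gpt-4o"
    )
    return res

answer("What is the capital of France?")
```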
|