judgeval 0.0.10__tar.gz → 0.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.10 → judgeval-0.0.11}/PKG-INFO +5 -4
- {judgeval-0.0.10 → judgeval-0.0.11}/Pipfile +4 -3
- {judgeval-0.0.10 → judgeval-0.0.11}/Pipfile.lock +1268 -585
- judgeval-0.0.11/README.md +3 -0
- judgeval-0.0.11/docs/api_reference/judgment_client.mdx +61 -0
- judgeval-0.0.11/docs/api_reference/trace.mdx +74 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/judges.mdx +1 -1
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/custom_scorers.mdx +12 -12
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/introduction.mdx +4 -4
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/getting_started.mdx +6 -2
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/mint.json +7 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/monitoring/tracing.mdx +7 -1
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/notebooks/create_scorer.ipynb +2 -2
- {judgeval-0.0.10 → judgeval-0.0.11}/pyproject.toml +5 -4
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/openai_travel_agent/agent.py +11 -66
- judgeval-0.0.11/src/demo/cookbooks/openai_travel_agent/tools.py +16 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/basic_test.py +37 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/cstone_data.csv +1225 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/data.csv +1227 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/faithfulness_testing.py +169 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/galen_data.csv +0 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/playground.py +152 -0
- judgeval-0.0.11/src/demo/customer_use/cstone/results.csv +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/common/tracer.py +183 -41
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/constants.py +14 -3
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/evaluation_run.py +2 -1
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/utils.py +14 -2
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judgment_client.py +46 -1
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorer.py +8 -8
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +3 -1
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +6 -3
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/prompt_scorer.py +2 -2
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/score.py +11 -11
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/utils.py +3 -3
- judgeval-0.0.11/src/judgeval/tracer/__init__.py +3 -0
- judgeval-0.0.10/README.md +0 -3
- {judgeval-0.0.10 → judgeval-0.0.11}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/.gitignore +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/LICENSE.md +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/README.md +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/development.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/favicon.svg +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/images/trace_screenshot.png +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/introduction.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/logo/light.svg +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/pytest.ini +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/ci_testing/ci_testing.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/ci_testing/travel_response.txt +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/custom_scorers/text2sql.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
- {judgeval-0.0.10 → judgeval-0.0.11}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: judgeval
|
3
|
-
Version: 0.0.10
|
3
|
+
Version: 0.0.11
|
4
4
|
Summary: Judgeval Package
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Requires-Python: >=3.11
|
13
13
|
Requires-Dist: anthropic
|
14
14
|
Requires-Dist: fastapi
|
15
|
-
Requires-Dist: langfuse==2.50.3
|
16
15
|
Requires-Dist: litellm
|
17
16
|
Requires-Dist: nest-asyncio
|
18
17
|
Requires-Dist: openai
|
19
18
|
Requires-Dist: pandas
|
20
|
-
Requires-Dist: patronus
|
21
19
|
Requires-Dist: pika
|
22
20
|
Requires-Dist: python-dotenv==1.0.1
|
23
21
|
Requires-Dist: requests
|
@@ -25,11 +23,14 @@ Requires-Dist: supabase
|
|
25
23
|
Requires-Dist: together
|
26
24
|
Requires-Dist: uvicorn
|
27
25
|
Provides-Extra: dev
|
26
|
+
Requires-Dist: langfuse==2.50.3; extra == 'dev'
|
27
|
+
Requires-Dist: patronus; extra == 'dev'
|
28
28
|
Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
|
29
29
|
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
30
30
|
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
31
|
+
Requires-Dist: tavily-python; extra == 'dev'
|
31
32
|
Description-Content-Type: text/markdown
|
32
33
|
|
33
34
|
# judgeval
|
34
35
|
|
35
|
-
Judgeval is
|
36
|
+
Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
|
@@ -4,7 +4,6 @@ verify_ssl = true
|
|
4
4
|
name = "pypi"
|
5
5
|
|
6
6
|
[packages]
|
7
|
-
langfuse = "==2.50.3"
|
8
7
|
litellm = "*"
|
9
8
|
python-dotenv = "==1.0.1"
|
10
9
|
fastapi = "*"
|
@@ -15,15 +14,17 @@ pandas = "*"
|
|
15
14
|
openai = "*"
|
16
15
|
together = "*"
|
17
16
|
anthropic = "*"
|
18
|
-
patronus = "*"
|
19
17
|
asyncio = "*"
|
20
18
|
nest-asyncio = "*"
|
21
|
-
|
19
|
+
pika = "*"
|
22
20
|
|
23
21
|
[dev-packages]
|
24
22
|
pytest = "*"
|
25
23
|
pytest-asyncio = "*"
|
26
24
|
pytest-mock = "*"
|
25
|
+
tavily-python = "*"
|
26
|
+
patronus = "*"
|
27
|
+
langfuse = "==2.50.3"
|
27
28
|
|
28
29
|
[requires]
|
29
30
|
python_version = "3.11"
|