PyPI - judgeval - Versions diffs - 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl - Mend

judgeval 0.0.37-py3-none-any.whl → 0.0.38-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

judgeval/common/tracer.py +132 -281
judgeval/common/utils.py +1 -1
judgeval/constants.py +1 -3
judgeval/data/__init__.py +0 -2
judgeval/data/datasets/dataset.py +2 -9
judgeval/data/datasets/eval_dataset_client.py +1 -62
judgeval/data/example.py +0 -1
judgeval/data/result.py +3 -3
judgeval/data/trace.py +4 -1
judgeval/data/{sequence_run.py → trace_run.py} +4 -4
judgeval/evaluation_run.py +1 -1
judgeval/integrations/langgraph.py +187 -1768
judgeval/judges/litellm_judge.py +1 -1
judgeval/judges/mixture_of_judges.py +1 -1
judgeval/judges/utils.py +1 -1
judgeval/judgment_client.py +15 -21
judgeval/run_evaluation.py +31 -81
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
judgeval-0.0.38.dist-info/METADATA +247 -0
{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD +22 -23
judgeval/data/sequence.py +0 -50
judgeval-0.0.37.dist-info/METADATA +0 -214
{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0

{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD RENAMED Viewed

@@ -1,35 +1,34 @@
 judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=KDHkZbzk-nr9uP-jsWUwpfaQSf4YkVfbO1o8w93-GME,5707
-judgeval/evaluation_run.py,sha256=MnlDSCXXi1vhTNTYC1XgPAl2BVG_ivNeFzIyfaw4Dho,6761
-judgeval/judgment_client.py,sha256=SiqazgyKkGsCVZ7J6XeL3Fvc51Oz7TM7yBgSfguJ0wQ,23625
+judgeval/constants.py,sha256=qemyUNf5G5-W6YQ9tNkxbFa7L7XR6cDtWCVFKRwT3TM,5519
+judgeval/evaluation_run.py,sha256=V9xMyiJ7e9lqHRblaeeMh6oyx1MEtGwfSxYtbi-EeXY,6746
+judgeval/judgment_client.py,sha256=ozNMDeM3lNnaNq4zY40x3z1TwHYL1e25BlxGnSYO0yw,23275
 judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
-judgeval/run_evaluation.py,sha256=66ppcGpCc08WrK46gWMJjktGzAg5alAWzRb9ncv9DTM,34555
+judgeval/run_evaluation.py,sha256=-7oiebkggP7lf6nVRxqDKE3QkuPSA0sAVkZl_n2nZtI,32437
 judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
 judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
-judgeval/common/tracer.py,sha256=5Pt49vSVK2zxh6p9nMW87Ju1eN3-M_mziSF_mDucXkA,87211
-judgeval/common/utils.py,sha256=oYjUgW8eL1qkzBe_tOz1WbCB6LbHWYvZN38XyXof8Eo,34086
-judgeval/data/__init__.py,sha256=-i7cuSBHrSTMf3UiIbFXwp56y15QJ7pmQeQK1yprhqM,561
+judgeval/common/tracer.py,sha256=EkWkg2AsS5FIj2ffh912qZZ9ew5h3hu2rynPBDsMszw,80463
+judgeval/common/utils.py,sha256=w1SjpDtB1DTJapFSAvLzr_a3gGI45iacEoxIUnQXx4Q,34087
+judgeval/data/__init__.py,sha256=Q4WiIva20U_NgxGr-MU-9FWN_eFzUZBVgCsBmoo7IM8,501
 judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
-judgeval/data/example.py,sha256=EVniiTpaut2wlTS1u3MxB983odCRLSa9RJ74iAsR0wg,6929
-judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
+judgeval/data/example.py,sha256=MD0rA9oNI4cyaRgz7I7EOKv0gD2dp22Q_5z-NWdFHhE,6891
+judgeval/data/result.py,sha256=KfU9lhAKG_Xo2eGDm2uKVVRZpf177IDASg1cIwedJwE,3184
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
-judgeval/data/sequence.py,sha256=vat_N0Acr64yfu97AfsaVPNrv2LqBiWCuYIoQWuxYwo,2074
-judgeval/data/sequence_run.py,sha256=XPa-MvwRK6ABKQtpMdmGHnoyL1KrgzQUjqItpLDc8U0,2213
-judgeval/data/trace.py,sha256=aRNwtJGebsm5MerVlZ3HKzviNAMpzydyUs88rs-BZ5Q,4899
+judgeval/data/trace.py,sha256=IjL06YNElxTuJC0HrPUh69rtXkfkSpzDoZdNiXFUvwY,5043
+judgeval/data/trace_run.py,sha256=G_OsHNK_nZzJKhtdiyWp7GFyyns5AOJZ956GM_4jXM0,2192
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=1o-SMG96yYkSCxqUTItKKfeFFLhQXrE2cKPYEvlNeGw,13044
-judgeval/data/datasets/eval_dataset_client.py,sha256=uirHpkpLOfygXIz0xKAGTPx1qjbBTzdLFQK6yyoZduU,17544
-judgeval/integrations/langgraph.py,sha256=nQ6KAi9giirnWmD35i4CfoFYKzajh5ElvKh4t6Yasgs,122617
+judgeval/data/datasets/dataset.py,sha256=oU9hvZTifK2x8em3FhL3oIqgHOByfJWH6C_9rIKnL5g,12773
+judgeval/data/datasets/eval_dataset_client.py,sha256=3RBfkaMrkudjnmY_qFwY4I-2mOPE3XK4WxkfSweLB-Q,15016
+judgeval/integrations/langgraph.py,sha256=L9zPPWVLGL2HWuwHPqM5Kic4S7EfQ_Y1Y3YKBJNfGCA,23004
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
-judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
-judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
+judgeval/judges/litellm_judge.py,sha256=DhB6px9ELZL3gbMb2w4FkBliuTlaCVIcjE8v149G6NM,2425
+judgeval/judges/mixture_of_judges.py,sha256=D97h8L-6saPwwppVwitrIdlMAjizzxGWeVOfNyVnXZA,15550
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
-judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
+judgeval/judges/utils.py,sha256=vL-15_udU94JHUAiyrAvHAKMj6Fqypg01ek4YH5zVCM,2687
 judgeval/scorers/__init__.py,sha256=-4GLkYiLKI_BxpoIfgadCFEUfqJcBWZLAtfrInjZT0Q,1282
 judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
@@ -53,14 +52,14 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8K
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
-judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=yBd5KU4wBTHFMi1B6D8hRdPOYYQl4uD7Z-xCW4yk5-E,427
+judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=urm8LgkeZA7e-ePWo6AToKGheQYSp6MOpKon5NF5EJw,570
 judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
 judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
-judgeval-0.0.37.dist-info/METADATA,sha256=0XKc4BJUpG8qnB3a9afhh8l7H_C19ritSJv95ogifXw,7742
-judgeval-0.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.37.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.37.dist-info/RECORD,,
+judgeval-0.0.38.dist-info/METADATA,sha256=jlCQMfdz2Ni9nRi9cOu5svHnLqIinll2odC37dqkE3U,11860
+judgeval-0.0.38.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.38.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.38.dist-info/RECORD,,

judgeval/data/sequence.py DELETED Viewed

@@ -1,50 +0,0 @@
-from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any, Dict
-from judgeval.data.example import Example
-from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
-from uuid import uuid4
-from datetime import datetime, timezone
-class Sequence(BaseModel):
-    """
-    A sequence is a list of either Examples or nested Sequence objects.
-    """
-    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
-    name: Optional[str] = "Sequence"
-    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]] = []
-    scorers: Optional[Any] = None
-    parent_sequence_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
-    root_sequence_id: Optional[str] = None
-    inputs: Optional[Dict[str, Any]] = None
-    output: Optional[Any] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
-    @field_validator("scorers")
-    def validate_scorer(cls, v):
-        for scorer in v or []:
-            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
-                raise ValueError(f"Invalid scorer type: {type(scorer)}")
-        return v
-    @model_validator(mode="after")
-    def populate_sequence_metadata(self) -> "Sequence":
-        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
-        # If root_sequence_id isn't already set, assign it to self
-        if self.root_sequence_id is None:
-            self.root_sequence_id = self.sequence_id
-        for idx, item in enumerate(self.items):
-            item.sequence_order = idx
-            if isinstance(item, Sequence):
-                item.parent_sequence_id = self.sequence_id
-                item.root_sequence_id = self.root_sequence_id
-                item.populate_sequence_metadata()
-        return self
-    class Config:
-        arbitrary_types_allowed = True
-# Update forward references so that "Sequence" inside items is resolved.
-Sequence.model_rebuild()

judgeval-0.0.37.dist-info/METADATA DELETED Viewed

@@ -1,214 +0,0 @@
-Metadata-Version: 2.4
-Name: judgeval
-Version: 0.0.37
-Summary: Judgeval Package
-Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
-Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
-Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
-License-Expression: Apache-2.0
-License-File: LICENSE.md
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.11
-Requires-Dist: anthropic
-Requires-Dist: boto3
-Requires-Dist: google-genai
-Requires-Dist: langchain-anthropic
-Requires-Dist: langchain-core
-Requires-Dist: langchain-huggingface
-Requires-Dist: langchain-openai
-Requires-Dist: litellm==1.38.12
-Requires-Dist: nest-asyncio
-Requires-Dist: openai
-Requires-Dist: pandas
-Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: requests
-Requires-Dist: together
-Description-Content-Type: text/markdown
-<div align="center">
-<img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
-<img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
-**Build monitoring & evaluation pipelines for complex agents**
-[Website](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [Documentation](https://judgment.mintlify.app/getting_started) • [Demos](https://www.youtube.com/@AlexShan-j3o)
-</div>
-## 🚀 What is Judgeval?
-Judgeval is an open-source tool for testing, monitoring, and optimizing AI agents. Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
-**🔍 Tracing**
-* Automatic agent tracing for common agent frameworks and SDKs (LangGraph, OpenAI, Anthropic, etc.)
-* Track input/output, latency, cost, token usage at every step
-* Function tracing with `@judgment.observe` decorator
-**🧪 Evals**
-* Plug-and-measure 15+ metrics, including:
-  * Tool call accuracy
-  * Hallucinations
-  * Instruction adherence
-  * Retrieval context recall
-    Our metric implementations are research-backed by Stanford and Berkeley AI labs. Check out our [research](https://judgmentlabs.ai/research)!
-* Build custom evaluators that seamlessly connect with our infrastructure!
-* Use our evals for:
-    * ⚠️ Unit-testing your agent
-    * 🔬 Experimentally testing new prompts and models
-    * 🛡️ Online evaluations to guardrail your agent's actions and responses
-**📊 Datasets**
-* Export trace data to datasets hosted on Judgment's Platform and export to JSON, Parquet, S3, etc.
-* Run evals on datasets as unit-tests or to A/B test agent configs
-**💡 Insights**
-* Error clustering groups agent failures to uncover failure patterns and speed up root cause analysis
-* Trace agent failures to their exact source. Judgment's Osiris agent localizes errors to specific agent components, enabling precise, targeted fixes.
-## 🛠️ Installation
-Get started with Judgeval by installing our SDK using pip:
-```bash
-pip install judgeval
-```
-Ensure you have your `JUDGMENT_API_KEY` environment variable set to connect to the [Judgment platform](https://app.judgmentlabs.ai/). If you don't have a key, create an account on the platform!
-## 🏁 Get Started
-Here's how you can quickly start using Judgeval:
-### 🛰️ Tracing
-Track your agent execution with full observability with just a few lines of code.
-Create a file named `traces.py` with the following code:
-```python
-from judgeval.common.tracer import Tracer, wrap
-from openai import OpenAI
-client = wrap(OpenAI())
-judgment = Tracer(project_name="my_project")
-@judgment.observe(span_type="tool")
-def my_tool():
-    return "What's the capital of the U.S.?"
-@judgment.observe(span_type="function")
-def main():
-    task_input = my_tool()
-    res = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": f"{task_input}"}]
-    )
-    return res.choices[0].message.content
-main()
-```
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
-### 📝 Offline Evaluations
-You can evaluate your agent's execution to measure quality metrics such as hallucination.
-Create a file named `evaluate.py` with the following code:
-```python evaluate.py
-from judgeval import JudgmentClient
-from judgeval.data import Example
-from judgeval.scorers import FaithfulnessScorer
-client = JudgmentClient()
-example = Example(
-    input="What if these shoes don't fit?",
-    actual_output="We offer a 30-day full refund at no extra cost.",
-    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-)
-scorer = FaithfulnessScorer(threshold=0.5)
-results = client.run_evaluation(
-    examples=[example],
-    scorers=[scorer],
-    model="gpt-4.1",
-)
-print(results)
-```
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
-### 📡 Online Evaluations
-Apply performance monitoring to measure the quality of your systems in production, not just on traces.
-Using the same `traces.py` file we created earlier, modify `main` function:
-```python
-from judgeval.common.tracer import Tracer, wrap
-from judgeval.scorers import AnswerRelevancyScorer
-from openai import OpenAI
-client = wrap(OpenAI())
-judgment = Tracer(project_name="my_project")
-@judgment.observe(span_type="tool")
-def my_tool():
-    return "Hello world!"
-@judgment.observe(span_type="function")
-def main():
-    task_input = my_tool()
-    res = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": f"{task_input}"}]
-    ).choices[0].message.content
-    judgment.get_current_trace().async_evaluate(
-        scorers=[AnswerRelevancyScorer(threshold=0.5)],
-        input=task_input,
-        actual_output=res,
-        model="gpt-4.1"
-    )
-    print("Online evaluation submitted.")
-    return res
-main()
-```
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
-## 🏢 Self-Hosting
-Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
-### Key Features
-* Deploy Judgment on your own AWS account
-* Store data in your own Supabase instance
-* Access Judgment through your own custom domain
-### Getting Started
-1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
-2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
-3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
-## ⭐ Star Us on GitHub
-If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
-## 🤝 Contributing
-There are many ways to contribute to Judgeval:
-- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
-- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
-- Speaking or writing about Judgment and letting us know!
-## Documentation and Demos
-For more detailed documentation, please check out our [developer docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!

{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL RENAMED Viewed

File without changes

{judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl

judgeval 0.0.37-py3-none-any.whl → 0.0.38-py3-none-any.whl