judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/scorers/score.py CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
243
243
  async def a_execute_scoring(
244
244
  examples: Union[List[Example], List[CustomExample]],
245
245
  scorers: List[JudgevalScorer],
246
- model: Optional[Union[str, List[str], JudgevalJudge]] = None,
246
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
247
247
  ignore_errors: bool = True,
248
248
  skip_on_missing_params: bool = True,
249
249
  show_indicator: bool = True,
@@ -271,6 +271,7 @@ async def a_execute_scoring(
271
271
  Returns:
272
272
  List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
273
273
  """
274
+
274
275
  semaphore = asyncio.Semaphore(max_concurrent)
275
276
 
276
277
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
@@ -0,0 +1,57 @@
1
+ import yaml
2
+ from judgeval.common.logger import (
3
+ debug,
4
+ info,
5
+ error,
6
+ example_logging_context
7
+ )
8
+
9
+ from judgeval.data import Example
10
+
11
+
12
+ def add_from_yaml(file_path: str) -> None:
13
+ debug(f"Loading dataset from YAML file: {file_path}")
14
+ """
15
+ Adds examples from a YAML file.
16
+
17
+ The format of the YAML file is expected to be a dictionary with one key: "examples".
18
+ The value of the key is a list of dictionaries, where each dictionary represents an example.
19
+
20
+ The YAML file is expected to have the following format:
21
+ examples:
22
+ - input: "test input"
23
+ actual_output: "test output"
24
+ expected_output: "expected output"
25
+ context:
26
+ - "context1"
27
+ - "context2"
28
+ retrieval_context:
29
+ - "retrieval1"
30
+ additional_metadata:
31
+ key: "value"
32
+ tools_called:
33
+ - "tool1"
34
+ expected_tools:
35
+ - {tool_name: "tool1", parameters: {"query": "test query 1"}}
36
+ - {tool_name: "tool2", parameters: {"query": "test query 2"}}
37
+ name: "test example"
38
+ example_id: null
39
+ timestamp: "20241230_160117"
40
+ trace_id: "123"
41
+ """
42
+ try:
43
+ with open(file_path, "r") as file:
44
+ payload = yaml.safe_load(file)
45
+ if payload is None:
46
+ raise ValueError("The YAML file is empty.")
47
+ examples = payload.get("examples", [])
48
+ except FileNotFoundError:
49
+ error(f"YAML file not found: {file_path}")
50
+ raise FileNotFoundError(f"The file {file_path} was not found.")
51
+ except yaml.YAMLError:
52
+ error(f"Invalid YAML file: {file_path}")
53
+ raise ValueError(f"The file {file_path} is not a valid YAML file.")
54
+
55
+ info(f"Added {len(examples)} examples from YAML")
56
+ new_examples = [Example(**e) for e in examples]
57
+ return new_examples
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: judgeval
3
+ Version: 0.0.37
4
+ Summary: Judgeval Package
5
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
+ License-Expression: Apache-2.0
9
+ License-File: LICENSE.md
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.11
13
+ Requires-Dist: anthropic
14
+ Requires-Dist: boto3
15
+ Requires-Dist: google-genai
16
+ Requires-Dist: langchain-anthropic
17
+ Requires-Dist: langchain-core
18
+ Requires-Dist: langchain-huggingface
19
+ Requires-Dist: langchain-openai
20
+ Requires-Dist: litellm==1.38.12
21
+ Requires-Dist: nest-asyncio
22
+ Requires-Dist: openai
23
+ Requires-Dist: pandas
24
+ Requires-Dist: python-dotenv==1.0.1
25
+ Requires-Dist: requests
26
+ Requires-Dist: together
27
+ Description-Content-Type: text/markdown
28
+
29
+ <div align="center">
30
+
31
+ <img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
32
+ <img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
33
+
34
+ **Build monitoring & evaluation pipelines for complex agents**
35
+
36
+ [Website](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [Documentation](https://judgment.mintlify.app/getting_started) • [Demos](https://www.youtube.com/@AlexShan-j3o)
37
+
38
+ </div>
39
+
40
+ ## 🚀 What is Judgeval?
41
+
42
+ Judgeval is an open-source tool for testing, monitoring, and optimizing AI agents. Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
43
+
44
+
45
+ **🔍 Tracing**
46
+ * Automatic agent tracing for common agent frameworks and SDKs (LangGraph, OpenAI, Anthropic, etc.)
47
+ * Track input/output, latency, cost, token usage at every step
48
+ * Function tracing with `@judgment.observe` decorator
49
+
50
+ **🧪 Evals**
51
+ * Plug-and-measure 15+ metrics, including:
52
+ * Tool call accuracy
53
+ * Hallucinations
54
+ * Instruction adherence
55
+ * Retrieval context recall
56
+
57
+ Our metric implementations are research-backed by Stanford and Berkeley AI labs. Check out our [research](https://judgmentlabs.ai/research)!
58
+ * Build custom evaluators that seamlessly connect with our infrastructure!
59
+ * Use our evals for:
60
+ * ⚠️ Unit-testing your agent
61
+ * 🔬 Experimentally testing new prompts and models
62
+ * 🛡️ Online evaluations to guardrail your agent's actions and responses
63
+
64
+ **📊 Datasets**
65
+ * Export trace data to datasets hosted on Judgment's Platform and export to JSON, Parquet, S3, etc.
66
+ * Run evals on datasets as unit-tests or to A/B test agent configs
67
+
68
+ **💡 Insights**
69
+ * Error clustering groups agent failures to uncover failure patterns and speed up root cause analysis
70
+ * Trace agent failures to their exact source. Judgment's Osiris agent localizes errors to specific agent components, enabling precise, targeted fixes.
71
+
72
+
73
+ ## 🛠️ Installation
74
+
75
+ Get started with Judgeval by installing our SDK using pip:
76
+
77
+ ```bash
78
+ pip install judgeval
79
+ ```
80
+
81
+ Ensure you have your `JUDGMENT_API_KEY` environment variable set to connect to the [Judgment platform](https://app.judgmentlabs.ai/). If you don't have a key, create an account on the platform!
82
+
83
+ ## 🏁 Get Started
84
+
85
+ Here's how you can quickly start using Judgeval:
86
+
87
+ ### 🛰️ Tracing
88
+
89
+ Track your agent execution with full observability with just a few lines of code.
90
+ Create a file named `traces.py` with the following code:
91
+
92
+ ```python
93
+ from judgeval.common.tracer import Tracer, wrap
94
+ from openai import OpenAI
95
+
96
+ client = wrap(OpenAI())
97
+ judgment = Tracer(project_name="my_project")
98
+
99
+ @judgment.observe(span_type="tool")
100
+ def my_tool():
101
+ return "What's the capital of the U.S.?"
102
+
103
+ @judgment.observe(span_type="function")
104
+ def main():
105
+ task_input = my_tool()
106
+ res = client.chat.completions.create(
107
+ model="gpt-4.1",
108
+ messages=[{"role": "user", "content": f"{task_input}"}]
109
+ )
110
+ return res.choices[0].message.content
111
+
112
+ main()
113
+ ```
114
+
115
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
116
+
117
+ ### 📝 Offline Evaluations
118
+
119
+ You can evaluate your agent's execution to measure quality metrics such as hallucination.
120
+ Create a file named `evaluate.py` with the following code:
121
+
122
+ ```python evaluate.py
123
+ from judgeval import JudgmentClient
124
+ from judgeval.data import Example
125
+ from judgeval.scorers import FaithfulnessScorer
126
+
127
+ client = JudgmentClient()
128
+
129
+ example = Example(
130
+ input="What if these shoes don't fit?",
131
+ actual_output="We offer a 30-day full refund at no extra cost.",
132
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
133
+ )
134
+
135
+ scorer = FaithfulnessScorer(threshold=0.5)
136
+ results = client.run_evaluation(
137
+ examples=[example],
138
+ scorers=[scorer],
139
+ model="gpt-4.1",
140
+ )
141
+ print(results)
142
+ ```
143
+
144
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
145
+
146
+ ### 📡 Online Evaluations
147
+
148
+ Apply performance monitoring to measure the quality of your systems in production, not just on traces.
149
+
150
+ Using the same `traces.py` file we created earlier, modify `main` function:
151
+
152
+ ```python
153
+ from judgeval.common.tracer import Tracer, wrap
154
+ from judgeval.scorers import AnswerRelevancyScorer
155
+ from openai import OpenAI
156
+
157
+ client = wrap(OpenAI())
158
+ judgment = Tracer(project_name="my_project")
159
+
160
+ @judgment.observe(span_type="tool")
161
+ def my_tool():
162
+ return "Hello world!"
163
+
164
+ @judgment.observe(span_type="function")
165
+ def main():
166
+ task_input = my_tool()
167
+ res = client.chat.completions.create(
168
+ model="gpt-4.1",
169
+ messages=[{"role": "user", "content": f"{task_input}"}]
170
+ ).choices[0].message.content
171
+
172
+ judgment.get_current_trace().async_evaluate(
173
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
174
+ input=task_input,
175
+ actual_output=res,
176
+ model="gpt-4.1"
177
+ )
178
+ print("Online evaluation submitted.")
179
+ return res
180
+
181
+ main()
182
+ ```
183
+
184
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
185
+
186
+ ## 🏢 Self-Hosting
187
+
188
+ Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
189
+
190
+ ### Key Features
191
+ * Deploy Judgment on your own AWS account
192
+ * Store data in your own Supabase instance
193
+ * Access Judgment through your own custom domain
194
+
195
+ ### Getting Started
196
+ 1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
197
+ 2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
198
+ 3. After your self-hosted instance is set up, make sure the `JUDGMENT_API_URL` environment variable is set to your self-hosted backend endpoint
199
+
200
+ ## ⭐ Star Us on GitHub
201
+
202
+ If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
203
+
204
+ ## 🤝 Contributing
205
+
206
+ There are many ways to contribute to Judgeval:
207
+
208
+ - Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
209
+ - Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
210
+ - Speak or write about Judgment and let us know!
211
+
212
+ ## Documentation and Demos
213
+
214
+ For more detailed documentation, please check out our [developer docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
@@ -1,43 +1,44 @@
1
1
  judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
4
- judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
5
- judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
3
+ judgeval/constants.py,sha256=KDHkZbzk-nr9uP-jsWUwpfaQSf4YkVfbO1o8w93-GME,5707
4
+ judgeval/evaluation_run.py,sha256=MnlDSCXXi1vhTNTYC1XgPAl2BVG_ivNeFzIyfaw4Dho,6761
5
+ judgeval/judgment_client.py,sha256=SiqazgyKkGsCVZ7J6XeL3Fvc51Oz7TM7yBgSfguJ0wQ,23625
6
6
  judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
7
- judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
7
+ judgeval/run_evaluation.py,sha256=66ppcGpCc08WrK46gWMJjktGzAg5alAWzRb9ncv9DTM,34555
8
8
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
9
9
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
10
10
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
11
11
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
12
12
  judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
13
- judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
14
- judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
15
- judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
13
+ judgeval/common/tracer.py,sha256=5Pt49vSVK2zxh6p9nMW87Ju1eN3-M_mziSF_mDucXkA,87211
14
+ judgeval/common/utils.py,sha256=oYjUgW8eL1qkzBe_tOz1WbCB6LbHWYvZN38XyXof8Eo,34086
15
+ judgeval/data/__init__.py,sha256=-i7cuSBHrSTMf3UiIbFXwp56y15QJ7pmQeQK1yprhqM,561
16
16
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
17
- judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
17
+ judgeval/data/example.py,sha256=EVniiTpaut2wlTS1u3MxB983odCRLSa9RJ74iAsR0wg,6929
18
18
  judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
19
19
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
20
- judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
21
- judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
20
+ judgeval/data/sequence.py,sha256=vat_N0Acr64yfu97AfsaVPNrv2LqBiWCuYIoQWuxYwo,2074
21
+ judgeval/data/sequence_run.py,sha256=XPa-MvwRK6ABKQtpMdmGHnoyL1KrgzQUjqItpLDc8U0,2213
22
+ judgeval/data/trace.py,sha256=aRNwtJGebsm5MerVlZ3HKzviNAMpzydyUs88rs-BZ5Q,4899
22
23
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
23
- judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
24
- judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
25
- judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
24
+ judgeval/data/datasets/dataset.py,sha256=1o-SMG96yYkSCxqUTItKKfeFFLhQXrE2cKPYEvlNeGw,13044
25
+ judgeval/data/datasets/eval_dataset_client.py,sha256=uirHpkpLOfygXIz0xKAGTPx1qjbBTzdLFQK6yyoZduU,17544
26
+ judgeval/integrations/langgraph.py,sha256=nQ6KAi9giirnWmD35i4CfoFYKzajh5ElvKh4t6Yasgs,122617
26
27
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
27
28
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
28
29
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
29
30
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
30
31
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
31
32
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
32
- judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
33
+ judgeval/scorers/__init__.py,sha256=-4GLkYiLKI_BxpoIfgadCFEUfqJcBWZLAtfrInjZT0Q,1282
33
34
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
34
35
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
35
36
  judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
36
37
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
37
- judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
38
+ judgeval/scorers/score.py,sha256=m9luk5ZLeUCual5CpI-9ZR9nqR3eC9wJLVT87SFPN6g,18747
38
39
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
39
40
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
41
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=QhHKpl6kNEXxuwriSEwQ5gIIxb7NeHZ1H_7SAZhQiQk,1872
41
42
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
42
43
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
43
44
  judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
@@ -52,12 +53,14 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8K
52
53
  judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
53
54
  judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
54
55
  judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
56
+ judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=yBd5KU4wBTHFMi1B6D8hRdPOYYQl4uD7Z-xCW4yk5-E,427
55
57
  judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
56
58
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
57
59
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
58
60
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
59
61
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
60
- judgeval-0.0.35.dist-info/METADATA,sha256=oAaDqpJCCZxUBOoVPTFbSjZgZ5xJMpGTxjngoJqmTO8,6126
61
- judgeval-0.0.35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
62
- judgeval-0.0.35.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
63
- judgeval-0.0.35.dist-info/RECORD,,
62
+ judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
63
+ judgeval-0.0.37.dist-info/METADATA,sha256=0XKc4BJUpG8qnB3a9afhh8l7H_C19ritSJv95ogifXw,7742
64
+ judgeval-0.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
65
+ judgeval-0.0.37.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
66
+ judgeval-0.0.37.dist-info/RECORD,,
@@ -1,170 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: judgeval
3
- Version: 0.0.35
4
- Summary: Judgeval Package
5
- Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
- Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
- Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
- License-Expression: Apache-2.0
9
- License-File: LICENSE.md
10
- Classifier: Operating System :: OS Independent
11
- Classifier: Programming Language :: Python :: 3
12
- Requires-Python: >=3.11
13
- Requires-Dist: anthropic
14
- Requires-Dist: boto3==1.38.3
15
- Requires-Dist: fastapi
16
- Requires-Dist: google-genai
17
- Requires-Dist: langchain
18
- Requires-Dist: langchain-anthropic
19
- Requires-Dist: langchain-core
20
- Requires-Dist: langchain-huggingface
21
- Requires-Dist: langchain-openai
22
- Requires-Dist: litellm==1.38.12
23
- Requires-Dist: nest-asyncio
24
- Requires-Dist: openai
25
- Requires-Dist: openpyxl
26
- Requires-Dist: pandas
27
- Requires-Dist: pika
28
- Requires-Dist: python-dotenv==1.0.1
29
- Requires-Dist: requests
30
- Requires-Dist: supabase
31
- Requires-Dist: together
32
- Requires-Dist: uvicorn
33
- Provides-Extra: dev
34
- Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
35
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
36
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
37
- Requires-Dist: tavily-python; extra == 'dev'
38
- Description-Content-Type: text/markdown
39
-
40
- # Judgeval SDK
41
-
42
- Judgeval is an open-source framework for building evaluation pipelines for multi-step agent workflows, supporting both real-time and experimental evaluation setups. To learn more about Judgment or sign up for free, visit our [website](https://www.judgmentlabs.ai/) or check out our [developer docs](https://judgment.mintlify.app/getting_started).
43
-
44
- ## Features
45
-
46
- - **Development and Production Evaluation Layer**: Offers a robust evaluation layer for multi-step agent applications, including unit-testing and performance monitoring.
47
- - **Plug-and-Evaluate**: Integrate LLM systems with 10+ research-backed metrics, including:
48
- - Hallucination detection
49
- - RAG retriever quality
50
- - And more
51
- - **Custom Evaluation Pipelines**: Construct powerful custom evaluation pipelines tailored for your LLM systems.
52
- - **Monitoring in Production**: Utilize state-of-the-art real-time evaluation foundation models to monitor LLM systems effectively.
53
-
54
- ## Installation
55
-
56
- ```bash
57
- pip install judgeval
58
- ```
59
-
60
- ## Quickstart: Evaluations
61
-
62
- You can evaluate your workflow execution data to measure quality metrics such as hallucination.
63
-
64
- Create a file named `evaluate.py` with the following code:
65
-
66
- ```python
67
- from judgeval import JudgmentClient
68
- from judgeval.data import Example
69
- from judgeval.scorers import FaithfulnessScorer
70
-
71
- client = JudgmentClient()
72
-
73
- example = Example(
74
- input="What if these shoes don't fit?",
75
- actual_output="We offer a 30-day full refund at no extra cost.",
76
- retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
77
- )
78
-
79
- scorer = FaithfulnessScorer(threshold=0.5)
80
- results = client.run_evaluation(
81
- examples=[example],
82
- scorers=[scorer],
83
- model="gpt-4o",
84
- )
85
- print(results)
86
- ```
87
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation
88
-
89
- ## Quickstart: Traces
90
-
91
- Track your workflow execution for full observability with just a few lines of code.
92
-
93
- Create a file named `traces.py` with the following code:
94
-
95
- ```python
96
- from judgeval.common.tracer import Tracer, wrap
97
- from openai import OpenAI
98
-
99
- # Basic initialization
100
- client = wrap(OpenAI())
101
- judgment = Tracer(project_name="my_project")
102
-
103
- # Or with S3 storage enabled
104
- # NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
105
- judgment = Tracer(
106
- project_name="my_project",
107
- use_s3=True,
108
- s3_bucket_name="my-traces-bucket", # Bucket created automatically if it doesn't exist
109
- s3_aws_access_key_id="your-access-key", # Optional: defaults to AWS_ACCESS_KEY_ID env var
110
- s3_aws_secret_access_key="your-secret-key", # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
111
- s3_region_name="us-west-1" # Optional: defaults to AWS_REGION env var or "us-west-1"
112
- )
113
-
114
- @judgment.observe(span_type="tool")
115
- def my_tool():
116
- return "Hello world!"
117
-
118
- @judgment.observe(span_type="function")
119
- def main():
120
- task_input = my_tool()
121
- res = client.chat.completions.create(
122
- model="gpt-4o",
123
- messages=[{"role": "user", "content": f"{task_input}"}]
124
- )
125
- return res.choices[0].message.content
126
- ```
127
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation
128
-
129
- ## Quickstart: Online Evaluations
130
-
131
- Apply performance monitoring to measure the quality of your systems in production, not just on historical data.
132
-
133
- Using the same traces.py file we created earlier:
134
-
135
- ```python
136
- from judgeval.common.tracer import Tracer, wrap
137
- from judgeval.scorers import AnswerRelevancyScorer
138
- from openai import OpenAI
139
-
140
- client = wrap(OpenAI())
141
- judgment = Tracer(project_name="my_project")
142
-
143
- @judgment.observe(span_type="tool")
144
- def my_tool():
145
- return "Hello world!"
146
-
147
- @judgment.observe(span_type="function")
148
- def main():
149
- task_input = my_tool()
150
- res = client.chat.completions.create(
151
- model="gpt-4o",
152
- messages=[{"role": "user", "content": f"{task_input}"}]
153
- ).choices[0].message.content
154
-
155
- judgment.get_current_trace().async_evaluate(
156
- scorers=[AnswerRelevancyScorer(threshold=0.5)],
157
- input=task_input,
158
- actual_output=res,
159
- model="gpt-4o"
160
- )
161
-
162
- return res
163
- ```
164
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation
165
-
166
- ## Documentation and Demos
167
-
168
- For more detailed documentation, please check out our [docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
169
-
170
- ##