judgeval 0.0.12__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. {judgeval-0.0.12 → judgeval-0.0.13}/PKG-INFO +1 -1
  2. {judgeval-0.0.12 → judgeval-0.0.13}/docs/api_reference/trace.mdx +11 -4
  3. {judgeval-0.0.12 → judgeval-0.0.13}/docs/getting_started.mdx +43 -20
  4. {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/tracing.mdx +87 -62
  5. {judgeval-0.0.12 → judgeval-0.0.13}/pyproject.toml +1 -1
  6. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/tracer.py +25 -2
  7. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/constants.py +2 -0
  8. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/dataset.py +2 -1
  9. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/eval_dataset_client.py +106 -9
  10. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/example.py +13 -5
  11. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judgment_client.py +29 -6
  12. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/run_evaluation.py +16 -5
  13. {judgeval-0.0.12 → judgeval-0.0.13}/.github/workflows/ci.yaml +0 -0
  14. {judgeval-0.0.12 → judgeval-0.0.13}/.gitignore +0 -0
  15. {judgeval-0.0.12 → judgeval-0.0.13}/LICENSE.md +0 -0
  16. {judgeval-0.0.12 → judgeval-0.0.13}/Pipfile +0 -0
  17. {judgeval-0.0.12 → judgeval-0.0.13}/Pipfile.lock +0 -0
  18. {judgeval-0.0.12 → judgeval-0.0.13}/README.md +0 -0
  19. {judgeval-0.0.12 → judgeval-0.0.13}/docs/README.md +0 -0
  20. {judgeval-0.0.12 → judgeval-0.0.13}/docs/api_reference/judgment_client.mdx +0 -0
  21. {judgeval-0.0.12 → judgeval-0.0.13}/docs/development.mdx +0 -0
  22. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/code.mdx +0 -0
  23. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/images.mdx +0 -0
  24. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/markdown.mdx +0 -0
  25. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/navigation.mdx +0 -0
  26. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/reusable-snippets.mdx +0 -0
  27. {judgeval-0.0.12 → judgeval-0.0.13}/docs/essentials/settings.mdx +0 -0
  28. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/data_datasets.mdx +0 -0
  29. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/data_examples.mdx +0 -0
  30. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/introduction.mdx +0 -0
  31. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/judges.mdx +0 -0
  32. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  33. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  34. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  35. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  36. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  37. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  38. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  39. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  40. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/hallucination.mdx +0 -0
  41. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/introduction.mdx +0 -0
  42. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  43. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/summarization.mdx +0 -0
  44. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
  45. {judgeval-0.0.12 → judgeval-0.0.13}/docs/evaluation/unit_testing.mdx +0 -0
  46. {judgeval-0.0.12 → judgeval-0.0.13}/docs/favicon.svg +0 -0
  47. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/basic_trace_example.png +0 -0
  48. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/checks-passed.png +0 -0
  49. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/create_aggressive_scorer.png +0 -0
  50. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/create_scorer.png +0 -0
  51. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/evaluation_diagram.png +0 -0
  52. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/hero-dark.svg +0 -0
  53. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/hero-light.svg +0 -0
  54. {judgeval-0.0.12 → judgeval-0.0.13}/docs/images/trace_screenshot.png +0 -0
  55. {judgeval-0.0.12 → judgeval-0.0.13}/docs/introduction.mdx +0 -0
  56. {judgeval-0.0.12 → judgeval-0.0.13}/docs/judgment/introduction.mdx +0 -0
  57. {judgeval-0.0.12 → judgeval-0.0.13}/docs/logo/dark.svg +0 -0
  58. {judgeval-0.0.12 → judgeval-0.0.13}/docs/logo/light.svg +0 -0
  59. {judgeval-0.0.12 → judgeval-0.0.13}/docs/mint.json +0 -0
  60. {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/introduction.mdx +0 -0
  61. {judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/production_insights.mdx +0 -0
  62. {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/create_dataset.ipynb +0 -0
  63. {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/create_scorer.ipynb +0 -0
  64. {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/demo.ipynb +0 -0
  65. {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/prompt_scorer.ipynb +0 -0
  66. {judgeval-0.0.12 → judgeval-0.0.13}/docs/notebooks/quickstart.ipynb +0 -0
  67. {judgeval-0.0.12 → judgeval-0.0.13}/docs/quickstart.mdx +0 -0
  68. {judgeval-0.0.12 → judgeval-0.0.13}/docs/snippets/snippet-intro.mdx +0 -0
  69. {judgeval-0.0.12 → judgeval-0.0.13}/pytest.ini +0 -0
  70. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/ci_testing/ci_testing.py +0 -0
  71. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/ci_testing/travel_response.txt +0 -0
  72. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -0
  73. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/custom_scorers/text2sql.py +0 -0
  74. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
  75. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  76. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
  77. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
  78. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
  79. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/new_bot/basic_bot.py +0 -0
  80. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -0
  81. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
  82. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/cookbooks/openai_travel_agent/tools.py +0 -0
  83. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/basic_test.py +0 -0
  84. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/cstone_data.csv +0 -0
  85. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/data.csv +0 -0
  86. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/faithfulness_testing.py +0 -0
  87. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/galen_data.csv +0 -0
  88. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/playground.py +0 -0
  89. {judgeval-0.0.12 → judgeval-0.0.13}/src/demo/customer_use/cstone/results.csv +0 -0
  90. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/__init__.py +0 -0
  91. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/clients.py +0 -0
  92. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/__init__.py +0 -0
  93. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/exceptions.py +0 -0
  94. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/logger.py +0 -0
  95. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/utils.py +0 -0
  96. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/__init__.py +0 -0
  97. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/api_example.py +0 -0
  98. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/__init__.py +0 -0
  99. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/ground_truth.py +0 -0
  100. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/utils.py +0 -0
  101. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/result.py +0 -0
  102. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/scorer_data.py +0 -0
  103. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/evaluation_run.py +0 -0
  104. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/__init__.py +0 -0
  105. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/base_judge.py +0 -0
  106. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/litellm_judge.py +0 -0
  107. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/mixture_of_judges.py +0 -0
  108. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/together_judge.py +0 -0
  109. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/judges/utils.py +0 -0
  110. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/__init__.py +0 -0
  111. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/api_scorer.py +0 -0
  112. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/base_scorer.py +0 -0
  113. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/exceptions.py +0 -0
  114. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  115. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  116. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  117. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  118. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  119. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  120. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  121. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  122. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  123. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  124. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  125. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  126. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
  127. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  128. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  129. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  130. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  131. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  132. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  133. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  134. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  135. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  136. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  137. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  138. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  139. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  140. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  141. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  142. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  143. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  144. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  145. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  146. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  147. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  148. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  149. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  150. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  151. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  152. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  153. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  154. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  155. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  156. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  157. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
  158. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
  159. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/prompt_scorer.py +0 -0
  160. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/score.py +0 -0
  161. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/scorers/utils.py +0 -0
  162. {judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/tracer/__init__.py +0 -0

{judgeval-0.0.12 → judgeval-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.12
+ Version: 0.0.13
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues

{judgeval-0.0.12 → judgeval-0.0.13}/docs/api_reference/trace.mdx
@@ -12,27 +12,34 @@ The `Tracer` class is used to trace the execution of your LLM system.
  ```python
  from judgeval.common.tracer import Tracer

- tracer = Tracer()
+ tracer = Tracer(project_name="my_project")
  ```

  <Note>
- The `Tracer` class is a singleton, so you only need to initialize it once in your application.
+ The `Tracer` class is a singleton, so you only need to initialize it once in your application.
+ The `project_name` enables you to group traces by workflow, keeping all your evaluations and
+ observability tooling in one place.
  </Note>

- ## Exporting traces
+ ## Explicitly exporting traces

  When using the `.trace()` context manager, you can control how your traces are exported to the Judgment platform by
  providing the `project_name` argument. This allows you to group traces by workflow, keeping all your evaluations and
  observability tooling in one place.

  ```python
- with tracer.trace("my_workflow", project_name="my_project"):
+ with tracer.trace(
+     name="my_workflow",
+     project_name="my_project",
+     overwrite=True
+ ) as trace:
      ...
  ```

  `.trace()` has the following args:
  - `name`: The name of the trace. Can be make unique to each workflow run by using a timestamp or other unique identifier.
  - `project_name`: The name of the project to use for the trace. Used to group traces by workflow.
+ - `overwrite`: Whether to overwrite the trace with the same `name` if it already exists.

  The `trace()` context manager yields a `TraceClient` object.

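As an editorial aside (not part of the diff), a minimal sketch of the documented call with the new `overwrite` argument; reusing a trace `name` without `overwrite=True` is the conflict that the save-path error added in `tracer.py` further down reports:

```python
from judgeval.common.tracer import Tracer

tracer = Tracer(project_name="my_project")

# Re-running a workflow under the same trace name conflicts unless overwrite=True.
with tracer.trace(name="my_workflow", project_name="my_project", overwrite=True) as trace:
    ...  # workflow logic; `trace` is the TraceClient yielded by the context manager
```
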
{judgeval-0.0.12 → judgeval-0.0.13}/docs/getting_started.mdx
@@ -32,7 +32,7 @@ large-scale evaluations. [Contact us](mailto:contact@judgmentlabs.ai) if you're
  sensitive data that has to reside in your private VPCs.
  </Note>

- # Create your first evaluation
+ # Create Your First Evaluation

  ```python sample_eval.py
  from judgeval import JudgmentClient
@@ -68,6 +68,48 @@ is a scorer that checks if the output is hallucinated relative to the retrieved
  To learn more about using the Judgment Client to run evaluations, click [here](/api_reference/judgment_client).
  </Tip>

+ # Create Your First Trace
+
+ Beyond experimentation, `judgeval` supports monitoring your LLM systems in **production**.
+ Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
+ Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.
+
+ ```python trace_example.py
+ from judgeval.common.tracer import Tracer, wrap
+ from openai import OpenAI
+
+ client = wrap(OpenAI())
+ judgment = Tracer(project_name="my_project")
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+     return "Hello world!"
+
+ @judgment.observe(span_type="function")
+ def main():
+     res = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{"role": "user", "content": f"{my_tool()}"}]
+     )
+     return res.choices[0].message.content
+ ```
+
+
+ <div style={{display: 'flex', justifyContent: 'center'}}>
+ ![Alt text](/images/trace_screenshot.png "Image of a RAG pipeline trace")
+ </div>
+
+ There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
+ - Detecting hallucinations and other quality issues **before they reach your customers**
+ - Automatically creating experimental datasets from your **real-world production cases** for future improvement/optimization
+ - Track and create alerts on **any metric** (e.g. latency, cost, hallucination, etc.)
+
+ <Tip>
+ To learn more about `judgeval`'s tracing module, click [here](/tracing/introduction).
+ </Tip>
+
+
+
  # Create Your First Scorer
  `judgeval` offers three kinds of LLM scorers for your evaluation needs: ready-made, classifier scorers, and custom scorers.

@@ -264,22 +306,3 @@ A `Project` keeps track of `Evaluation Run`s in your project. Each `Evaluation R
  You can try different models (e.g. `gpt-4o`, `claude-3-5-sonnet`, etc.) and prompt templates in each `Evaluation Run` to find the
  optimal setup for your LLM system.
  </Tip>
-
- ## Monitoring LLM Systems in Production
-
- Beyond experimenting and measuring historical performance, `judgeval` supports monitoring your LLM systems in **production**.
- Using our `tracing` module, you can **track your LLM system outputs from end to end**, allowing you to visualize the flow of your LLM system.
- Additionally, you can **enable evaluations to run in real-time** using Judgment's state-of-the-art judge models.
-
- <div style={{display: 'flex', justifyContent: 'center'}}>
- ![Alt text](/images/trace_screenshot.png "Image of a RAG pipeline trace")
- </div>
-
- There are many benefits of monitoring your LLM systems in production with `judgeval`, including:
- - Detecting hallucinations and other quality issues **before they reach your customers**
- - Automatically creating experimental datasets from your **real-world production cases** for future improvement/optimization
- - Track and create alerts on **any metric** (e.g. latency, cost, hallucination, etc.)
-
- <Tip>
- To learn more about `judgeval`'s tracing module, click [here](/tracing/introduction).
- </Tip>
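
As an aside (not part of the diff): the `trace_example.py` snippet added above ends at `main()`. A minimal hypothetical driver for trying it locally, assuming `JUDGMENT_API_KEY` and `OPENAI_API_KEY` are set in the environment, would be:

```python
# Hypothetical driver appended to the trace_example.py snippet from the new
# "Create Your First Trace" section; the my_tool and main spans plus the wrapped
# OpenAI call are expected to appear under the "my_project" project.
if __name__ == "__main__":
    print(main())
```
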
{judgeval-0.0.12 → judgeval-0.0.13}/docs/monitoring/tracing.mdx
@@ -18,24 +18,25 @@ Using tracing, you can:

  ## Tracing Your Workflow ##

- Setting up tracing with `judgeval` takes three simple steps:
+ Setting up tracing with `judgeval` takes two simple steps:

- ### 1. Initialize a tracer with your API key
+ ### 1. Initialize a tracer with your API key and project name

  ```python
  from judgeval.common.tracer import Tracer

- judgment = Tracer() # loads from JUDGMENT_API_KEY env var
+ judgment = Tracer(project_name="my_project") # loads from JUDGMENT_API_KEY env var
  ```

  <Note>
- The [Judgment tracer](/api_reference/trace) is a singleton object that should be shared across your application.
+ The [Judgment tracer](/api_reference/trace) is a singleton object that should be shared across your application.
+ Your project name will be used to organize your traces in one place on the Judgment platform.
  </Note>


  ### 2. Wrap your workflow components

- `judgeval` provides three wrapping mechanisms for your workflow components:
+ `judgeval` provides wrapping mechanisms for your workflow components:

  #### `wrap()` ####
  The `wrap()` function goes over your LLM client (e.g. OpenAI, Anthropic, etc.) and captures metadata surrounding your LLM calls, such as:
@@ -44,6 +45,14 @@ The `wrap()` function goes over your LLM client (e.g. OpenAI, Anthropic, etc.) a
  - Prompt/Completion
  - Model name

+ Here's an example of using `wrap()` on an OpenAI client:
+ ```python
+ from openai import OpenAI
+ from judgeval.common.tracer import wrap
+
+ client = wrap(OpenAI())
+ ```
+
  #### `@observe` ####
  The `@observe` decorator wraps your functions/tools and captures metadata surrounding your function calls, such as:
  - Latency
@@ -63,30 +72,20 @@ def my_tool():
  ```

  <Note>
- The `@observe` decorator is used on top of helper functions that you write, but is not designed to be used
- on your "main" function. For more information, see the `context manager` section below.
+ `span_type` is a string that you can use to categorize and organize your trace spans.
+ Span types are displayed on the trace UI to easily nagivate a visualization of your workflow.
+ Common span types include `tool`, `function`, `retriever`, `database`, `web search`, etc.
  </Note>

- #### `context manager` ####
-
- In your main function (e.g. the one that executes the primary workflow logic), you can use the `with judgment.trace()` context manager to trace the entire workflow.
-
- The context manager can **save/print the state of the trace at any point in the workflow**.
- This is useful for debugging or exporting any state of your workflow to run an evaluation from!
-
- <Tip>
- The `with judgment.trace()` context manager detects any `@observe` decorated functions or wrapped LLM calls within the context and automatically captures their metadata.
- </Tip>
-

  #### Putting it all Together
- Here's a complete example of using the `with judgment.trace()` context manager with the other tracing mechanisms:
+ Here's a complete example of using judgeval's tracing mechanisms:
  ```python
  from judgeval.common.tracer import Tracer, wrap
  from openai import OpenAI

  openai_client = wrap(OpenAI())
- judgment = Tracer() # loads from JUDGMENT_API_KEY env var
+ judgment = Tracer(project_name="my_project") # loads from JUDGMENT_API_KEY env var

  @judgment.observe(span_type="tool")
  def my_tool():
@@ -101,28 +100,10 @@ def my_llm_call():
      )
      return res.choices[0].message.content

+ @judgment.observe(span_type="function")
  def main():
-     with judgment.trace(
-         "main_workflow",
-         project_name="my_project"
-     ) as trace:
-         res = my_llm_call()
-         trace.save()
-         trace.print()
-         return res
- ```
-
- The printed trace appears as follows on the terminal:
- ```
- → main_workflow (trace: main_workflow)
-   → my_llm_call (trace: my_llm_call)
-     Input: {'args': [], 'kwargs': {}}
-     → my_tool (trace: my_tool)
-       Input: {'args': [], 'kwargs': {}}
-       Output: Hello world!
-     ← my_tool (0.000s)
-     Output: Hello! How can I assist you today?
-   ← my_llm_call (0.789s)
+     res = my_llm_call()
+     return res
  ```

  And the trace will appear on the Judgment platform as follows:
@@ -142,32 +123,27 @@ To execute an asynchronous evaluation, you can use the `trace.async_evaluate()`

  ```python
  from judgeval.common.tracer import Tracer
- from judgeval.scorers import FaithfulnessScorer
+ from judgeval.scorers import AnswerRelevancyScorer

- judgment = Tracer()
+ judgment = Tracer(project_name="my_project")

+ @judgment.observe(span_type="function")
  def main():
-     with judgment.trace(
-         "main_workflow",
-         project_name="my_project"
-     ) as trace:
-         retrieved_info = ... # from knowledge base
-         res = ... # your main workflow logic
-
-         judgment.get_current_trace().async_evaluate(
-             scorers=[FaithfulnesssScorer(threshold=0.5)],
-             input="",
-             actual_output=res,
-             retrieval_context=[retrieved_info],
-             model="gpt-4o-mini",
-         )
-         return res
+     query = "What is the capital of France?"
+     res = ... # Your workflow logic
+
+     judgment.get_current_trace().async_evaluate(
+         scorers=[AnswerRelevancyScorer(threshold=0.5)],
+         input="",
+         actual_output=res,
+         model="gpt-4o",
+     )
+     return res
  ```

  <Tip>
- You can organize how your async evaluation runs are logged to the Judgment platform by using the
- `project_name` argument in the `trace` context manager. See our [API documentation](/api_reference/trace)
- for more information.
+ Your async evaluations will be logged to the Judgment platform as part of the original trace and
+ a new evaluation will be created on the Judgment platform.
  </Tip>

  ## Example: OpenAI Travel Agent
@@ -183,4 +159,53 @@ In this video, we'll walk through all of the topics covered in this guide by tra
  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
  referrerpolicy="strict-origin-when-cross-origin"
  allowfullscreen
- ></iframe>
+ ></iframe>
+
+
+ ## Advanced: Customizing Traces Using the Context Manager ##
+
+ If you need to customize your tracing context, you can use the `with judgment.trace()` context manager.
+
+ The context manager can **save/print the state of the trace at any point in the workflow**.
+ This is useful for debugging or exporting any state of your workflow to run an evaluation from!
+
+ <Tip>
+ The `with judgment.trace()` context manager detects any `@observe` decorated functions or wrapped LLM calls within the context and automatically captures their metadata.
+ </Tip>
+
+ Here's an example of using the context manager to trace a workflow:
+ ```python
+ from judgeval.common.tracer import Tracer, wrap
+ from openai import OpenAI
+
+ judgment = Tracer(project_name="my_project")
+ client = wrap(OpenAI())
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+     return "Hello world!"
+
+ def main():
+     with judgment.trace(name="my_workflow") as trace:
+         res = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[{"role": "user", "content": f"{my_tool()}"}]
+         )
+
+         trace.print() # prints the state of the trace to console
+         trace.save() # saves the current state of the trace to the Judgment platform
+
+         return res.choices[0].message.content
+ ```
+
+ <Warning>
+ The `with judgment.trace()` context manager should only be used if you need to customize the context
+ over which you're tracing. In most cases, you should trace using the `@observe` decorator.
+ </Warning>
+
+
+
+
+
+
+
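
Taken together, these tracing.mdx changes replace the context-manager-first workflow with an `@observe`-first one. A minimal end-to-end sketch of the new pattern (not part of the diff; the retrieval helper, prompt wiring, and scorer threshold are illustrative assumptions):

```python
from openai import OpenAI

from judgeval.common.tracer import Tracer, wrap
from judgeval.scorers import AnswerRelevancyScorer

client = wrap(OpenAI())                       # captures latency, token counts, prompts/completions
judgment = Tracer(project_name="my_project")  # singleton; reads JUDGMENT_API_KEY from the environment

@judgment.observe(span_type="retriever")
def retrieve(query: str) -> str:
    # stand-in for a real knowledge-base lookup
    return "Paris is the capital of France."

@judgment.observe(span_type="function")
def main() -> str:
    query = "What is the capital of France?"
    context = retrieve(query)
    res = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": f"{context}\n\n{query}"}],
    ).choices[0].message.content

    # attach a real-time evaluation to the trace currently being recorded
    judgment.get_current_trace().async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        input=query,
        actual_output=res,
        model="gpt-4o",
    )
    return res
```
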
{judgeval-0.0.12 → judgeval-0.0.13}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "judgeval"
- version = "0.0.12"
+ version = "0.0.13"
  authors = [
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
  { name="Alex Shan", email="alex@judgmentlabs.ai" },

{judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/common/tracer.py
@@ -199,10 +199,11 @@ class TraceManagerClient:
  JUDGMENT_TRACES_FETCH_API_URL,
  json={
  "trace_id": trace_id,
- "judgment_api_key": self.judgment_api_key,
+ # "judgment_api_key": self.judgment_api_key,
  },
  headers={
  "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
  }
  )

@@ -225,6 +226,7 @@ class TraceManagerClient:
  json=trace_data,
  headers={
  "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
  }
  )

@@ -248,6 +250,7 @@ class TraceManagerClient:
  },
  headers={
  "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
  }
  )

@@ -263,11 +266,12 @@ class TraceManagerClient:
  response = requests.delete(
  JUDGMENT_TRACES_DELETE_API_URL,
  json={
- "judgment_api_key": self.judgment_api_key,
+ # "judgment_api_key": self.judgment_api_key,
  "trace_ids": trace_ids,
  },
  headers={
  "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
  }
  )

@@ -576,6 +580,25 @@ class TraceClient:

  self.trace_manager_client.save_trace(trace_data, empty_save)

+
+ # Save trace data by making POST request to API
+ response = requests.post(
+ JUDGMENT_TRACES_SAVE_API_URL,
+ json=trace_data,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.tracer.api_key}" # Bearer token format
+ }
+ )
+
+ if response.status_code == HTTPStatus.BAD_REQUEST:
+ raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+ elif response.status_code != HTTPStatus.OK:
+ raise ValueError(f"Failed to save trace data: {response.text}")
+
+ if not empty_save and "ui_results_url" in response.json():
+ rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
  return self.trace_id, trace_data

  def delete(self):
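
The net effect of the tracer.py changes: the API key moves from the JSON body (now commented out) to an `Authorization: Bearer` header, and `TraceClient.save()` now POSTs the trace itself and surfaces a results URL. A hedged sketch of the resulting request shape — the helper name and trimmed error handling are illustrative, not the library's API:

```python
import requests
from http import HTTPStatus

def save_trace_sketch(trace_data: dict, api_key: str, save_url: str) -> dict:
    """Hypothetical helper mirroring the new TraceClient.save() request shape."""
    response = requests.post(
        save_url,  # e.g. JUDGMENT_TRACES_SAVE_API_URL
        json=trace_data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",  # key moved out of the JSON body
        },
    )
    if response.status_code == HTTPStatus.BAD_REQUEST:
        # typically a trace-name conflict; re-save with overwrite=True on the trace
        raise ValueError(f"Failed to save trace data: {response.text}")
    if response.status_code != HTTPStatus.OK:
        raise ValueError(f"Failed to save trace data: {response.text}")
    return response.json()  # may contain "ui_results_url" linking to the trace view
```
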

{judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/constants.py
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
  JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
+ JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"

{judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/dataset.py
@@ -162,7 +162,8 @@ class EvalDataset:
  "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
  "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
  "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
- "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
+ "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
+ "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
  }
  if row["example"]:
  data["name"] = row["name"] if pd.notna(row["name"]) else None

{judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/datasets/eval_dataset_client.py
@@ -1,5 +1,5 @@

- from typing import Optional
+ from typing import Optional, List
  import requests
  from rich.progress import Progress, SpinnerColumn, TextColumn

@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
  JUDGMENT_DATASETS_PULL_API_URL,
- JUDGMENT_DATASETS_PULL_ALL_API_URL
+ JUDGMENT_DATASETS_PULL_ALL_API_URL,
+ JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
  from judgeval.data import Example
  from judgeval.data.datasets import EvalDataset
@@ -23,7 +25,7 @@ class EvalDatasetClient:
  def create_dataset(self) -> EvalDataset:
  return EvalDataset(judgment_api_key=self.judgment_api_key)

- def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+ def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
  debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
  if overwrite:
  warning(f"Overwrite enabled for alias '{alias}'")
@@ -56,12 +58,16 @@ class EvalDatasetClient:
  "ground_truths": [g.to_dict() for g in dataset.ground_truths],
  "examples": [e.to_dict() for e in dataset.examples],
  "overwrite": overwrite,
- "judgment_api_key": dataset.judgment_api_key
+ # "judgment_api_key": dataset.judgment_api_key
  }
  try:
  response = requests.post(
  JUDGMENT_DATASETS_PUSH_API_URL,
- json=content
+ json=content,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
+ }
  )
  if response.status_code == 500:
  error(f"Server error during push: {content.get('message')}")
@@ -115,13 +121,17 @@ class EvalDatasetClient:
  )
  request_body = {
  "alias": alias,
- "judgment_api_key": self.judgment_api_key
+ # "judgment_api_key": self.judgment_api_key
  }

  try:
  response = requests.post(
  JUDGMENT_DATASETS_PULL_API_URL,
- json=request_body
+ json=request_body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
+ }
  )
  response.raise_for_status()
  except requests.exceptions.RequestException as e:
@@ -169,13 +179,17 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
- "judgment_api_key": self.judgment_api_key
+ # "judgment_api_key": self.judgment_api_key
  }

  try:
  response = requests.post(
  JUDGMENT_DATASETS_PULL_ALL_API_URL,
- json=request_body
+ json=request_body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
+ }
  )
  response.raise_for_status()
  except requests.exceptions.RequestException as e:
@@ -191,3 +205,86 @@ class EvalDatasetClient:
  )

  return payload
+
+ def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+ """
+ Edits the dataset on Judgment platform by adding new examples and ground truths
+
+ Mock request:
+ {
+ "alias": alias,
+ "examples": [...],
+ "ground_truths": [...],
+ "judgment_api_key": self.judgment_api_key
+ }
+ """
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
+ total=100,
+ )
+
+ content = {
+ "alias": alias,
+ "examples": [e.to_dict() for e in examples],
+ "ground_truths": [g.to_dict() for g in ground_truths],
+ "judgment_api_key": self.judgment_api_key
+ }
+
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_EDIT_API_URL,
+ json=content
+ )
+ response.raise_for_status()
+ except requests.exceptions.RequestException as e:
+ error(f"Error editing dataset: {str(e)}")
+ return False
+
+ info(f"Successfully edited dataset '{alias}'")
+ return True
+
+ def export_jsonl(self, alias: str) -> requests.Response:
+ """Export dataset in JSONL format from Judgment platform"""
+ debug(f"Exporting dataset with alias '{alias}' as JSONL")
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
+ total=100,
+ )
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
+ json={"alias": alias},
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}"
+ },
+ stream=True
+ )
+ response.raise_for_status()
+ except requests.exceptions.HTTPError as err:
+ if err.response.status_code == 404:
+ error(f"Dataset not found: {alias}")
+ else:
+ error(f"HTTP error during export: {err}")
+ raise
+ except Exception as e:
+ error(f"Error during export: {str(e)}")
+ raise
+
+ info(f"Successfully exported dataset with alias '{alias}'")
+ progress.update(
+ task_id,
+ description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+ )
+
+ return response
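
For orientation, a hedged usage sketch of the two methods added here; `edit_dataset()` and `export_jsonl()` come from the diff, while the client construction, alias, example fields, and output path are illustrative assumptions:

```python
from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="...")  # illustrative construction

# Append a new example to an existing dataset alias (ground truths omitted here).
client.edit_dataset(
    alias="my_dataset",
    examples=[Example(input="What is the capital of France?", actual_output="Paris")],
    ground_truths=[],
)

# Stream the dataset back as JSONL and write it to disk.
response = client.export_jsonl("my_dataset")  # streaming requests.Response
with open("my_dataset.jsonl", "wb") as f:
    for line in response.iter_lines():
        if line:
            f.write(line + b"\n")
```
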
{judgeval-0.0.12 → judgeval-0.0.13}/src/judgeval/data/example.py
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.


  from typing import TypeVar, Optional, Any, Dict, List
- from pydantic import BaseModel
+ from uuid import uuid4
+ from pydantic import BaseModel, Field
  from enum import Enum
  from datetime import datetime
+ import time


  Input = TypeVar('Input')
@@ -33,15 +35,19 @@ class Example(BaseModel):
  tools_called: Optional[List[str]] = None
  expected_tools: Optional[List[str]] = None
  name: Optional[str] = None
- example_id: Optional[str] = None
+ example_id: str = Field(default_factory=lambda: str(uuid4()))
+ example_index: Optional[int] = None
  timestamp: Optional[str] = None
  trace_id: Optional[str] = None

  def __init__(self, **data):
- super().__init__(**data)
+ if 'example_id' not in data:
+ data['example_id'] = str(uuid4())
  # Set timestamp if not provided
- if self.timestamp is None:
- self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ if 'timestamp' not in data:
+ data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+ super().__init__(**data)
+

  def to_dict(self):
  return {
@@ -55,6 +61,7 @@ class Example(BaseModel):
  "expected_tools": self.expected_tools,
  "name": self.name,
  "example_id": self.example_id,
+ "example_index": self.example_index,
  "timestamp": self.timestamp,
  "trace_id": self.trace_id
  }
@@ -71,6 +78,7 @@ class Example(BaseModel):
  f"expected_tools={self.expected_tools}, "
  f"name={self.name}, "
  f"example_id={self.example_id}, "
+ f"example_index={self.example_index}, "
  f"timestamp={self.timestamp}, "
  f"trace_id={self.trace_id})"
  )
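
With these changes every `Example` carries a stable identity from the moment it is built: `example_id` defaults to a fresh UUID (both via the `Field` default and in `__init__`), `timestamp` is stamped at construction, and the new `example_index` flows through `to_dict()` and the string representation. A small sketch of the resulting behavior — the `input`/`actual_output` fields are assumed from typical judgeval usage, and printed values are illustrative:

```python
from judgeval.data import Example

e = Example(input="What is 2+2?", actual_output="4")  # no example_id/timestamp passed
print(e.example_id)     # auto-generated UUID string, e.g. "4c7b0f5e-..."
print(e.timestamp)      # e.g. "20250101_120000"  (%Y%m%d_%H%M%S)
print(e.example_index)  # None until a dataset assigns an ordering

d = e.to_dict()
assert d["example_id"] == e.example_id and "example_index" in d
```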