judgeval 0.9.0__tar.gz → 0.22.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287) hide show
  1. judgeval-0.22.3/.github/workflows/ci.yaml +148 -0
  2. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/release.yaml +1 -1
  3. judgeval-0.22.3/.pre-commit-config.yaml +23 -0
  4. judgeval-0.22.3/CONTRIBUTING.md +10 -0
  5. judgeval-0.22.3/PKG-INFO +266 -0
  6. judgeval-0.22.3/README.md +238 -0
  7. judgeval-0.22.3/assets/brand/company.jpg +0 -0
  8. judgeval-0.22.3/assets/brand/company_banner.jpg +0 -0
  9. judgeval-0.22.3/assets/brand/darkmode.svg +7 -0
  10. judgeval-0.22.3/assets/brand/full_logo.png +0 -0
  11. judgeval-0.22.3/assets/brand/icon.png +0 -0
  12. judgeval-0.22.3/assets/brand/lightmode.svg +7 -0
  13. judgeval-0.22.3/assets/brand/white_background.png +0 -0
  14. judgeval-0.22.3/assets/custom_scorer_online_abm.png +0 -0
  15. judgeval-0.22.3/assets/logo_darkmode.svg +7 -0
  16. judgeval-0.22.3/assets/logo_lightmode.svg +7 -0
  17. judgeval-0.22.3/assets/quickstart_trajectory_ss.png +0 -0
  18. {judgeval-0.9.0 → judgeval-0.22.3}/pyproject.toml +21 -14
  19. {judgeval-0.9.0 → judgeval-0.22.3}/scripts/api_generator.py +11 -6
  20. judgeval-0.22.3/scripts/api_generator_v1.py +468 -0
  21. {judgeval-0.9.0 → judgeval-0.22.3}/scripts/openapi_transform.py +9 -5
  22. {judgeval-0.9.0 → judgeval-0.22.3}/scripts/update_types.sh +4 -1
  23. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/__init__.py +49 -13
  24. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/api/__init__.py +143 -121
  25. judgeval-0.22.3/src/judgeval/api/api_types.py +416 -0
  26. judgeval-0.22.3/src/judgeval/cli.py +112 -0
  27. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/constants.py +1 -11
  28. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/__init__.py +1 -3
  29. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/evaluation_run.py +15 -17
  30. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/example.py +4 -2
  31. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/judgment_types.py +230 -169
  32. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/result.py +2 -2
  33. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/scorer_data.py +1 -26
  34. judgeval-0.22.3/src/judgeval/data/trace.py +121 -0
  35. judgeval-0.22.3/src/judgeval/dataset/__init__.py +264 -0
  36. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/env.py +6 -20
  37. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/evaluation/__init__.py +47 -106
  38. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/exceptions.py +3 -2
  39. judgeval-0.22.3/src/judgeval/integrations/langgraph/__init__.py +13 -0
  40. judgeval-0.22.3/src/judgeval/integrations/openlit/__init__.py +51 -0
  41. judgeval-0.22.3/src/judgeval/judgment_attribute_keys.py +55 -0
  42. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/logger.py +18 -4
  43. judgeval-0.22.3/src/judgeval/prompt/__init__.py +330 -0
  44. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/__init__.py +11 -7
  45. judgeval-0.22.3/src/judgeval/scorers/agent_scorer.py +17 -0
  46. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/api_scorer.py +15 -12
  47. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/base_scorer.py +2 -3
  48. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  49. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  50. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  51. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  52. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  53. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +111 -37
  54. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/utils.py +1 -4
  55. judgeval-0.22.3/src/judgeval/tracer/__init__.py +1123 -0
  56. judgeval-0.22.3/src/judgeval/tracer/constants.py +1 -0
  57. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/exporters/__init__.py +4 -1
  58. judgeval-0.22.3/src/judgeval/tracer/exporters/store.py +59 -0
  59. judgeval-0.22.3/src/judgeval/tracer/keys.py +63 -0
  60. judgeval-0.22.3/src/judgeval/tracer/llm/__init__.py +7 -0
  61. judgeval-0.22.3/src/judgeval/tracer/llm/config.py +78 -0
  62. judgeval-0.22.3/src/judgeval/tracer/llm/constants.py +9 -0
  63. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  64. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  65. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  66. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  67. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  68. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/__init__.py +3 -0
  69. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/config.py +6 -0
  70. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  71. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  72. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  73. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  74. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  75. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/config.py +6 -0
  76. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/responses.py +506 -0
  77. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/utils.py +42 -0
  78. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  79. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/__init__.py +3 -0
  80. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  81. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/config.py +6 -0
  82. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  83. judgeval-0.22.3/src/judgeval/tracer/llm/providers.py +19 -0
  84. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/managers.py +27 -48
  85. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/processors/__init__.py +57 -18
  86. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/utils.py +3 -4
  87. judgeval-0.22.3/src/judgeval/trainer/__init__.py +14 -0
  88. judgeval-0.22.3/src/judgeval/trainer/base_trainer.py +122 -0
  89. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/trainer/config.py +2 -7
  90. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/trainer/console.py +1 -1
  91. judgeval-0.9.0/src/judgeval/trainer/trainer.py → judgeval-0.22.3/src/judgeval/trainer/fireworks_trainer.py +43 -54
  92. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/trainer/trainable_model.py +25 -16
  93. judgeval-0.22.3/src/judgeval/trainer/trainer.py +70 -0
  94. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/async_utils.py +7 -3
  95. judgeval-0.22.3/src/judgeval/utils/decorators/__init__.py +0 -0
  96. judgeval-0.22.3/src/judgeval/utils/decorators/dont_throw.py +37 -0
  97. judgeval-0.9.0/src/judgeval/utils/decorators.py → judgeval-0.22.3/src/judgeval/utils/decorators/use_once.py +0 -11
  98. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/file_utils.py +0 -2
  99. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/guards.py +9 -5
  100. judgeval-0.22.3/src/judgeval/utils/meta.py +27 -0
  101. judgeval-0.22.3/src/judgeval/utils/project.py +15 -0
  102. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/serialize.py +7 -1
  103. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/testing.py +0 -18
  104. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/version_check.py +2 -0
  105. judgeval-0.22.3/src/judgeval/utils/wrappers/README.md +3 -0
  106. judgeval-0.22.3/src/judgeval/utils/wrappers/__init__.py +15 -0
  107. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  108. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  109. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  110. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  111. judgeval-0.22.3/src/judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  112. judgeval-0.22.3/src/judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  113. judgeval-0.22.3/src/judgeval/utils/wrappers/py.typed +0 -0
  114. judgeval-0.22.3/src/judgeval/utils/wrappers/utils.py +35 -0
  115. judgeval-0.22.3/src/judgeval/v1/__init__.py +88 -0
  116. judgeval-0.22.3/src/judgeval/v1/data/__init__.py +7 -0
  117. judgeval-0.22.3/src/judgeval/v1/data/example.py +44 -0
  118. judgeval-0.22.3/src/judgeval/v1/data/scorer_data.py +42 -0
  119. judgeval-0.22.3/src/judgeval/v1/data/scoring_result.py +44 -0
  120. judgeval-0.22.3/src/judgeval/v1/datasets/__init__.py +6 -0
  121. judgeval-0.22.3/src/judgeval/v1/datasets/dataset.py +152 -0
  122. judgeval-0.22.3/src/judgeval/v1/datasets/dataset_factory.py +88 -0
  123. judgeval-0.22.3/src/judgeval/v1/evaluation/__init__.py +6 -0
  124. judgeval-0.22.3/src/judgeval/v1/evaluation/evaluation.py +184 -0
  125. judgeval-0.22.3/src/judgeval/v1/evaluation/evaluation_factory.py +17 -0
  126. judgeval-0.22.3/src/judgeval/v1/instrumentation/__init__.py +6 -0
  127. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/__init__.py +7 -0
  128. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/config.py +78 -0
  129. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/constants.py +11 -0
  130. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  131. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  132. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  133. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  134. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  135. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  136. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  137. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  138. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  139. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  140. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  141. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  142. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  143. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  144. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  145. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  146. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  147. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  148. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  149. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  150. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/providers.py +19 -0
  151. judgeval-0.22.3/src/judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  152. judgeval-0.22.3/src/judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  153. judgeval-0.22.3/src/judgeval/v1/integrations/langgraph/__init__.py +13 -0
  154. judgeval-0.22.3/src/judgeval/v1/integrations/openlit/__init__.py +47 -0
  155. judgeval-0.22.3/src/judgeval/v1/internal/api/__init__.py +525 -0
  156. judgeval-0.22.3/src/judgeval/v1/internal/api/api_types.py +416 -0
  157. judgeval-0.22.3/src/judgeval/v1/prompts/__init__.py +6 -0
  158. judgeval-0.22.3/src/judgeval/v1/prompts/prompt.py +29 -0
  159. judgeval-0.22.3/src/judgeval/v1/prompts/prompt_factory.py +189 -0
  160. judgeval-0.22.3/src/judgeval/v1/py.typed +0 -0
  161. judgeval-0.22.3/src/judgeval/v1/scorers/__init__.py +6 -0
  162. judgeval-0.22.3/src/judgeval/v1/scorers/api_scorer.py +82 -0
  163. judgeval-0.22.3/src/judgeval/v1/scorers/base_scorer.py +17 -0
  164. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/__init__.py +17 -0
  165. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  166. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  167. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  168. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  169. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  170. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  171. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  172. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  173. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  174. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  175. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  176. judgeval-0.22.3/src/judgeval/v1/scorers/scorers_factory.py +49 -0
  177. judgeval-0.22.3/src/judgeval/v1/tracer/__init__.py +7 -0
  178. judgeval-0.22.3/src/judgeval/v1/tracer/base_tracer.py +520 -0
  179. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/__init__.py +14 -0
  180. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  181. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  182. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  183. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/span_store.py +50 -0
  184. judgeval-0.22.3/src/judgeval/v1/tracer/processors/__init__.py +6 -0
  185. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  186. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  187. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  188. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  189. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  190. judgeval-0.22.3/src/judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  191. judgeval-0.22.3/src/judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  192. judgeval-0.22.3/src/judgeval/v1/tracer/tracer.py +61 -0
  193. judgeval-0.22.3/src/judgeval/v1/tracer/tracer_factory.py +36 -0
  194. judgeval-0.22.3/src/judgeval/v1/trainers/__init__.py +5 -0
  195. judgeval-0.22.3/src/judgeval/v1/trainers/base_trainer.py +62 -0
  196. judgeval-0.22.3/src/judgeval/v1/trainers/config.py +123 -0
  197. judgeval-0.22.3/src/judgeval/v1/trainers/console.py +144 -0
  198. judgeval-0.22.3/src/judgeval/v1/trainers/fireworks_trainer.py +392 -0
  199. judgeval-0.22.3/src/judgeval/v1/trainers/trainable_model.py +252 -0
  200. judgeval-0.22.3/src/judgeval/v1/trainers/trainers_factory.py +37 -0
  201. judgeval-0.22.3/src/judgeval/v1/utils.py +18 -0
  202. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/version.py +1 -1
  203. judgeval-0.22.3/update_version.py +35 -0
  204. judgeval-0.22.3/uv.lock +5786 -0
  205. judgeval-0.9.0/.github/workflows/ci.yaml +0 -163
  206. judgeval-0.9.0/.pre-commit-config.yaml +0 -23
  207. judgeval-0.9.0/PKG-INFO +0 -164
  208. judgeval-0.9.0/README.md +0 -131
  209. judgeval-0.9.0/assets/logo-dark.svg +0 -23
  210. judgeval-0.9.0/assets/logo-light.svg +0 -18
  211. judgeval-0.9.0/assets/new_darkmode.svg +0 -29
  212. judgeval-0.9.0/assets/new_lightmode.svg +0 -34
  213. judgeval-0.9.0/src/judgeval/api/api_types.py +0 -344
  214. judgeval-0.9.0/src/judgeval/cli.py +0 -63
  215. judgeval-0.9.0/src/judgeval/data/tool.py +0 -5
  216. judgeval-0.9.0/src/judgeval/data/trace.py +0 -40
  217. judgeval-0.9.0/src/judgeval/data/trace_run.py +0 -39
  218. judgeval-0.9.0/src/judgeval/dataset/__init__.py +0 -209
  219. judgeval-0.9.0/src/judgeval/integrations/langgraph/__init__.py +0 -788
  220. judgeval-0.9.0/src/judgeval/scorers/agent_scorer.py +0 -17
  221. judgeval-0.9.0/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  222. judgeval-0.9.0/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  223. judgeval-0.9.0/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  224. judgeval-0.9.0/src/judgeval/scorers/trace_api_scorer.py +0 -5
  225. judgeval-0.9.0/src/judgeval/tracer/__init__.py +0 -1076
  226. judgeval-0.9.0/src/judgeval/tracer/constants.py +0 -1
  227. judgeval-0.9.0/src/judgeval/tracer/exporters/store.py +0 -43
  228. judgeval-0.9.0/src/judgeval/tracer/keys.py +0 -67
  229. judgeval-0.9.0/src/judgeval/tracer/llm/__init__.py +0 -1233
  230. judgeval-0.9.0/src/judgeval/tracer/llm/providers.py +0 -114
  231. judgeval-0.9.0/src/judgeval/tracer/local_eval_queue.py +0 -195
  232. judgeval-0.9.0/src/judgeval/trainer/__init__.py +0 -5
  233. judgeval-0.9.0/src/judgeval/utils/meta.py +0 -14
  234. judgeval-0.9.0/update_version.py +0 -32
  235. judgeval-0.9.0/uv.lock +0 -3941
  236. {judgeval-0.9.0 → judgeval-0.22.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  237. {judgeval-0.9.0 → judgeval-0.22.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  238. {judgeval-0.9.0 → judgeval-0.22.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  239. {judgeval-0.9.0 → judgeval-0.22.3}/.github/pull_request_template.md +0 -0
  240. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/blocked-pr.yaml +0 -0
  241. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/claude-code-review.yml +0 -0
  242. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/claude.yml +0 -0
  243. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/lint.yaml +0 -0
  244. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/merge-branch-check.yaml +0 -0
  245. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/mypy.yaml +0 -0
  246. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  247. {judgeval-0.9.0 → judgeval-0.22.3}/.github/workflows/validate-branch.yaml +0 -0
  248. {judgeval-0.9.0 → judgeval-0.22.3}/.gitignore +0 -0
  249. {judgeval-0.9.0 → judgeval-0.22.3}/LICENSE.md +0 -0
  250. {judgeval-0.9.0 → judgeval-0.22.3}/assets/Screenshot 2025-05-17 at 8.14.27/342/200/257PM.png" +0 -0
  251. {judgeval-0.9.0 → judgeval-0.22.3}/assets/agent.gif +0 -0
  252. {judgeval-0.9.0 → judgeval-0.22.3}/assets/agent_trace_example.png +0 -0
  253. {judgeval-0.9.0 → judgeval-0.22.3}/assets/data.gif +0 -0
  254. {judgeval-0.9.0 → judgeval-0.22.3}/assets/dataset_clustering_screenshot.png +0 -0
  255. {judgeval-0.9.0 → judgeval-0.22.3}/assets/dataset_clustering_screenshot_dm.png +0 -0
  256. {judgeval-0.9.0 → judgeval-0.22.3}/assets/datasets_preview_screenshot.png +0 -0
  257. {judgeval-0.9.0 → judgeval-0.22.3}/assets/document.gif +0 -0
  258. {judgeval-0.9.0 → judgeval-0.22.3}/assets/error_analysis_dashboard.png +0 -0
  259. {judgeval-0.9.0 → judgeval-0.22.3}/assets/errors.png +0 -0
  260. {judgeval-0.9.0 → judgeval-0.22.3}/assets/experiments_dashboard_screenshot.png +0 -0
  261. {judgeval-0.9.0 → judgeval-0.22.3}/assets/experiments_page.png +0 -0
  262. {judgeval-0.9.0 → judgeval-0.22.3}/assets/experiments_pagev2.png +0 -0
  263. {judgeval-0.9.0 → judgeval-0.22.3}/assets/monitoring_screenshot.png +0 -0
  264. {judgeval-0.9.0 → judgeval-0.22.3}/assets/online_eval.png +0 -0
  265. {judgeval-0.9.0 → judgeval-0.22.3}/assets/product_shot.png +0 -0
  266. {judgeval-0.9.0 → judgeval-0.22.3}/assets/test.png +0 -0
  267. {judgeval-0.9.0 → judgeval-0.22.3}/assets/tests.png +0 -0
  268. {judgeval-0.9.0 → judgeval-0.22.3}/assets/trace.gif +0 -0
  269. {judgeval-0.9.0 → judgeval-0.22.3}/assets/trace_demo.png +0 -0
  270. {judgeval-0.9.0 → judgeval-0.22.3}/assets/trace_screenshot.png +0 -0
  271. {judgeval-0.9.0 → judgeval-0.22.3}/assets/trace_screenshot_old.png +0 -0
  272. {judgeval-0.9.0 → judgeval-0.22.3}/pytest.ini +0 -0
  273. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  274. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  275. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/judges/__init__.py +0 -0
  276. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/judges/base_judge.py +0 -0
  277. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/judges/litellm_judge.py +0 -0
  278. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/judges/together_judge.py +0 -0
  279. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/judges/utils.py +0 -0
  280. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/example_scorer.py +0 -0
  281. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/exceptions.py +0 -0
  282. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  283. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/scorers/score.py +0 -0
  284. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/exporters/s3.py +0 -0
  285. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/tracer/exporters/utils.py +0 -0
  286. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/utils/url.py +0 -0
  287. {judgeval-0.9.0 → judgeval-0.22.3}/src/judgeval/warnings.py +0 -0
@@ -0,0 +1,148 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ types: [opened, synchronize, reopened]
6
+
7
+ permissions: read-all
8
+
9
+ jobs:
10
+ validate-branch:
11
+ uses: ./.github/workflows/merge-branch-check.yaml
12
+
13
+ run-tests:
14
+ needs: [validate-branch]
15
+ if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ os: [ubuntu-latest, macos-latest]
20
+ python-version:
21
+ - "3.10"
22
+ - "3.11"
23
+ - "3.12"
24
+ - "3.13"
25
+ name: Unit Tests
26
+ runs-on: ${{ matrix.os }}
27
+ env:
28
+ PYTHONPATH: "."
29
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
30
+ TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
31
+ GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
32
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
33
+ OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
34
+ JUDGMENT_DEV: true
35
+
36
+ steps:
37
+ - name: Checkout code
38
+ uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v4
42
+ with:
43
+ python-version: ${{ matrix.python-version }}
44
+
45
+ - name: Install dependencies
46
+ run: |
47
+ pip install uv
48
+ uv sync --dev
49
+
50
+ - name: Install Claude Code CLI
51
+ run: |
52
+ npm install -g @anthropic-ai/claude-code
53
+
54
+ - name: Run tests
55
+ run: |
56
+ cd src
57
+ export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
58
+ export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
59
+ uv run pytest tests -n auto
60
+
61
+ run-e2e-tests:
62
+ needs: [validate-branch]
63
+ if: "(github.base_ref == 'staging' || github.base_ref == 'main') && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
64
+ strategy:
65
+ fail-fast: false
66
+ matrix:
67
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
68
+ name: E2E Tests
69
+ runs-on: ubuntu-latest
70
+ env:
71
+ TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
72
+ steps:
73
+ - name: Configure AWS Credentials
74
+ uses: aws-actions/configure-aws-credentials@v4
75
+ with:
76
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
77
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
78
+ aws-region: us-west-1
79
+
80
+ - name: Checkout code
81
+ uses: actions/checkout@v4
82
+
83
+ - name: Set env based on branch
84
+ run: |
85
+ if [ "${{ github.base_ref }}" = "main" ]; then
86
+ echo "TARGET_ENV=main" >> "$GITHUB_ENV"
87
+ echo "BASE_URL=https://api.judgmentlabs.ai" >> "$GITHUB_ENV"
88
+ echo "SECRETS_PATH=prod/api-keys/e2e-tests" >> "$GITHUB_ENV"
89
+ echo "COVERAGE_ARTIFACT=coverage-html-production-${{ matrix.python-version }}" >> "$GITHUB_ENV"
90
+ else
91
+ echo "TARGET_ENV=staging" >> "$GITHUB_ENV"
92
+ echo "BASE_URL=https://staging.api.judgmentlabs.ai" >> "$GITHUB_ENV"
93
+ echo "SECRETS_PATH=stg/api-keys/e2e-tests" >> "$GITHUB_ENV"
94
+ echo "COVERAGE_ARTIFACT=coverage-html-staging-${{ matrix.python-version }}" >> "$GITHUB_ENV"
95
+ fi
96
+
97
+ - name: Restore uv cache
98
+ uses: actions/cache/restore@v4
99
+ id: restore-uv-cache
100
+ with:
101
+ path: ~/.cache/uv/
102
+ key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}
103
+ restore-keys: |
104
+ ${{ runner.os }}-uv-judgment-
105
+ ${{ runner.os }}-uv-
106
+
107
+ - name: Set up Python
108
+ uses: actions/setup-python@v4
109
+ with:
110
+ python-version: ${{ matrix.python-version }}
111
+
112
+ - name: Install judgeval dependencies
113
+ run: |
114
+ pip install uv
115
+ uv sync --dev
116
+
117
+ - name: Check if server is running
118
+ run: |
119
+ if ! curl -s "$BASE_URL/health" > /dev/null; then
120
+ echo "Judgment server ($BASE_URL) is not running properly. Check CloudWatch logs."
121
+ exit 1
122
+ else
123
+ echo "Server is running."
124
+ fi
125
+
126
+ - name: Run E2E tests
127
+ working-directory: src
128
+ run: |
129
+ SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id "$SECRETS_PATH" --query SecretString --output text)
130
+ export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
131
+ export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
132
+ export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
133
+ export JUDGMENT_API_URL="$BASE_URL"
134
+ timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
135
+
136
+ - name: Upload coverage HTML report
137
+ if: always()
138
+ uses: actions/upload-artifact@v4
139
+ with:
140
+ name: ${{ env.COVERAGE_ARTIFACT }}
141
+ path: src/htmlcov
142
+
143
+ - name: Save uv cache
144
+ uses: actions/cache/save@v4
145
+ if: always() && steps.restore-uv-cache.outputs.cache-hit != 'true'
146
+ with:
147
+ path: ~/.cache/uv/
148
+ key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}
@@ -91,7 +91,7 @@ jobs:
91
91
  env:
92
92
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
93
93
 
94
- - name: Bump pyproject.toml version
94
+ - name: Bump pyproject.toml and version.py version
95
95
  run: |
96
96
  python update_version.py ${{ steps.bump_tag.outputs.new_version }}
97
97
 
@@ -0,0 +1,23 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/uv-pre-commit
3
+ rev: 0.9.7
4
+ hooks:
5
+ - id: uv-lock
6
+
7
+ - repo: https://github.com/astral-sh/ruff-pre-commit
8
+ rev: v0.14.3
9
+ hooks:
10
+ - id: ruff
11
+ name: ruff (linter)
12
+ args: [--fix]
13
+ - id: ruff-format
14
+ name: ruff (formatter)
15
+
16
+ - repo: https://github.com/pre-commit/mirrors-mypy
17
+ rev: v1.18.2
18
+ hooks:
19
+ - id: mypy
20
+ language: system
21
+ # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
22
+ verbose: true
23
+ entry: bash -c 'mypy src/judgeval/ || true'
@@ -0,0 +1,10 @@
1
+ # Contribute to Judgeval
2
+
3
+ There are many ways to contribute to Judgeval:
4
+
5
+ - Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
6
+ - Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
7
+ - Speaking or writing about Judgment and letting us know!
8
+
9
+ <!-- Contributors collage -->
10
+ [![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: judgeval
3
+ Version: 0.22.3
4
+ Summary: The open source post-building layer for Agent Behavior Monitoring.
5
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
+ Maintainer-email: Judgment Labs <contact@judgmentlabs.ai>
9
+ License-Expression: Apache-2.0
10
+ License-File: LICENSE.md
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: boto3>=1.40.11
15
+ Requires-Dist: click<8.2.0
16
+ Requires-Dist: dotenv>=0.9.9
17
+ Requires-Dist: httpx>=0.28.1
18
+ Requires-Dist: litellm>=1.75.0
19
+ Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
20
+ Requires-Dist: opentelemetry-sdk>=1.36.0
21
+ Requires-Dist: orjson>=3.9.0
22
+ Requires-Dist: typer>=0.9.0
23
+ Provides-Extra: s3
24
+ Requires-Dist: boto3>=1.40.11; extra == 's3'
25
+ Provides-Extra: trainer
26
+ Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
27
+ Description-Content-Type: text/markdown
28
+
29
+ <div align="center">
30
+
31
+ <a href="https://judgmentlabs.ai/">
32
+ <picture>
33
+ <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
34
+ <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
35
+ </picture>
36
+ </a>
37
+
38
+ <br>
39
+
40
+ ## Agent Behavior Monitoring (ABM)
41
+
42
+ Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
43
+
44
+ [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
45
+ [![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
46
+ [![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
47
+
48
+
49
+ [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
50
+ [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
51
+
52
+ </div>
53
+
54
+
55
+
56
+
57
+ ## [NEW] 🎆 Agent Reinforcement Learning
58
+
59
+ Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
60
+
61
+ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
62
+
63
+ ```python
64
+ await trainer.train(
65
+ agent_function=your_agent_function, # entry point to your agent
66
+ scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
67
+ prompts=training_prompts # Tasks
68
+ )
69
+ ```
70
+
71
+ **That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
72
+
73
+ 👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
74
+
75
+
76
+ You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
77
+
78
+
79
+ ## Judgeval Overview
80
+
81
+ Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
82
+
83
+ Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
84
+
85
+ ## 📚 Cookbooks
86
+
87
+ | Try Out | Notebook | Description |
88
+ |:---------|:-----|:------------|
89
+ | RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
90
+ | Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
91
+ | Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
92
+ | Offline Testing | [Get Started For Free](https://app.judgmentlabs.ai/register) | Compare how different prompts, models, or agent configs affect performance across ANY metric |
93
+
94
+ You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
95
+
96
+ You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
97
+
98
+ ## Why Judgeval?
99
+
100
+ 🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
101
+
102
+ ⚙️ **Custom Evaluators**: No restriction to only monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate to our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
103
+
104
+ 🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
105
+
106
+ 📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
107
+ <!-- Add link to Bucketing docs once we have it -->
108
+ <!--
109
+ TODO: Once we have trainer code docs, plug in here
110
+ -->
111
+
112
+ 🧪 **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
113
+
114
+ <!--
115
+ Use this once we have AI PM features:
116
+
117
+ **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
118
+
119
+ -->
120
+
121
+ ## 🛠️ Quickstart
122
+
123
+ Get started with Judgeval by installing our SDK using pip:
124
+
125
+ ```bash
126
+ pip install judgeval
127
+ ```
128
+
129
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
130
+
131
+ ```bash
132
+ export JUDGMENT_API_KEY=...
133
+ export JUDGMENT_ORG_ID=...
134
+ ```
135
+
136
+ **If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
137
+
138
+ ### Start monitoring with Judgeval
139
+
140
+ ```python
141
+ from judgeval.tracer import Tracer, wrap
142
+ from judgeval.data import Example
143
+ from judgeval.scorers import AnswerRelevancyScorer
144
+ from openai import OpenAI
145
+
146
+
147
+ judgment = Tracer(project_name="default_project")
148
+ client = wrap(OpenAI()) # tracks all LLM calls
149
+
150
+ @judgment.observe(span_type="tool")
151
+ def format_question(question: str) -> str:
152
+ # dummy tool
153
+ return f"Question : {question}"
154
+
155
+ @judgment.observe(span_type="function")
156
+ def run_agent(prompt: str) -> str:
157
+ task = format_question(prompt)
158
+ response = client.chat.completions.create(
159
+ model="gpt-5-mini",
160
+ messages=[{"role": "user", "content": task}]
161
+ )
162
+
163
+ judgment.async_evaluate( # trigger online monitoring
164
+ scorer=AnswerRelevancyScorer(threshold=0.5), # swap with any scorer
165
+ example=Example(input=task, actual_output=response.choices[0].message.content), # customize to your data
166
+ model="gpt-5",
167
+ )
168
+ return response.choices[0].message.content
169
+
170
+ run_agent("What is the capital of the United States?")
171
+ ```
172
+
173
+ Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
174
+
175
+ ![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
176
+
177
+
178
+ ### Customizable Scorers Over Agent Behavior
179
+
180
+ Judgeval's strongest suit is the full customization over the types of scorers you can run online monitoring with. No restrictions to only single-prompt LLM judges or prefab scorers - if you can express your scorer
181
+ in Python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
182
+
183
+
184
+ First, create a behavior scorer in a file called `helpfulness_scorer.py`:
185
+
186
+ ```python
187
+ from judgeval.data import Example
188
+ from judgeval.scorers.example_scorer import ExampleScorer
189
+
190
+ # Define custom example class
191
+ class QuestionAnswer(Example):
192
+ question: str
193
+ answer: str
194
+
195
+ # Define a server-hosted custom scorer
196
+ class HelpfulnessScorer(ExampleScorer):
197
+ name: str = "Helpfulness Scorer"
198
+ server_hosted: bool = True # Enable server hosting
199
+ async def a_score_example(self, example: QuestionAnswer):
200
+ # Custom scoring logic for agent behavior
201
+ # Can be an arbitrary combination of code and LLM calls
202
+ if len(example.answer) > 10 and "?" not in example.answer:
203
+ self.reason = "Answer is detailed and provides helpful information"
204
+ return 1.0
205
+ else:
206
+ self.reason = "Answer is too brief or unclear"
207
+ return 0.0
208
+ ```
209
+
210
+ Then deploy your scorer to Judgment's infrastructure:
211
+
212
+ ```bash
213
+ echo "pydantic" > requirements.txt
214
+ uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
215
+ ```
216
+
217
+ Now you can instrument your agent with monitoring and online evaluation:
218
+
219
+ ```python
220
+ from judgeval.tracer import Tracer, wrap
221
+ from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
222
+ from openai import OpenAI
223
+
224
+ judgment = Tracer(project_name="default_project")
225
+ client = wrap(OpenAI()) # tracks all LLM calls
226
+
227
+ @judgment.observe(span_type="tool")
228
+ def format_task(question: str) -> str: # replace with your prompt engineering
229
+ return f"Please answer the following question: {question}"
230
+
231
+ @judgment.observe(span_type="tool")
232
+ def answer_question(prompt: str) -> str: # replace with your LLM system calls
233
+ response = client.chat.completions.create(
234
+ model="gpt-5-mini",
235
+ messages=[{"role": "user", "content": prompt}]
236
+ )
237
+ return response.choices[0].message.content
238
+
239
+ @judgment.observe(span_type="function")
240
+ def run_agent(question: str) -> str:
241
+ task = format_task(question)
242
+ answer = answer_question(task)
243
+
244
+ # Add online evaluation with server-hosted scorer
245
+ judgment.async_evaluate(
246
+ scorer=HelpfulnessScorer(),
247
+ example=QuestionAnswer(question=question, answer=answer),
248
+ sampling_rate=0.9 # Evaluate 90% of agent runs
249
+ )
250
+
251
+ return answer
252
+
253
+ if __name__ == "__main__":
254
+ result = run_agent("What is the capital of the United States?")
255
+ print(result)
256
+ ```
257
+
258
+ Congratulations! Your online eval result should look like this:
259
+
260
+ ![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
261
+
262
+ You can now run any online scorer in secure Firecracker microVMs with no latency impact on your applications.
263
+
264
+ ---
265
+
266
+ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).