judgeval 0.0.55__tar.gz → 0.22.3__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (310) hide show
  1. judgeval-0.22.3/.github/ISSUE_TEMPLATE/config.yml +5 -0
  2. {judgeval-0.0.55 → judgeval-0.22.3}/.github/pull_request_template.md +1 -8
  3. judgeval-0.22.3/.github/workflows/ci.yaml +148 -0
  4. judgeval-0.22.3/.github/workflows/claude-code-review.yml +35 -0
  5. judgeval-0.22.3/.github/workflows/claude.yml +40 -0
  6. {judgeval-0.0.55 → judgeval-0.22.3}/.github/workflows/lint.yaml +0 -13
  7. judgeval-0.22.3/.github/workflows/mypy.yaml +25 -0
  8. judgeval-0.22.3/.github/workflows/pre-commit-autoupdate.yaml +38 -0
  9. {judgeval-0.0.55 → judgeval-0.22.3}/.github/workflows/release.yaml +36 -4
  10. {judgeval-0.0.55 → judgeval-0.22.3}/.pre-commit-config.yaml +7 -5
  11. judgeval-0.22.3/CONTRIBUTING.md +10 -0
  12. judgeval-0.22.3/PKG-INFO +266 -0
  13. judgeval-0.22.3/README.md +238 -0
  14. judgeval-0.22.3/assets/agent_trace_example.png +0 -0
  15. judgeval-0.22.3/assets/brand/company.jpg +0 -0
  16. judgeval-0.22.3/assets/brand/company_banner.jpg +0 -0
  17. judgeval-0.22.3/assets/brand/darkmode.svg +7 -0
  18. judgeval-0.22.3/assets/brand/full_logo.png +0 -0
  19. judgeval-0.22.3/assets/brand/icon.png +0 -0
  20. judgeval-0.22.3/assets/brand/lightmode.svg +7 -0
  21. judgeval-0.22.3/assets/brand/white_background.png +0 -0
  22. judgeval-0.22.3/assets/custom_scorer_online_abm.png +0 -0
  23. judgeval-0.22.3/assets/errors.png +0 -0
  24. judgeval-0.22.3/assets/logo_darkmode.svg +7 -0
  25. judgeval-0.22.3/assets/logo_lightmode.svg +7 -0
  26. judgeval-0.22.3/assets/online_eval.png +0 -0
  27. judgeval-0.22.3/assets/product_shot.png +0 -0
  28. judgeval-0.22.3/assets/quickstart_trajectory_ss.png +0 -0
  29. judgeval-0.22.3/assets/test.png +0 -0
  30. judgeval-0.22.3/assets/tests.png +0 -0
  31. judgeval-0.22.3/pyproject.toml +101 -0
  32. judgeval-0.22.3/scripts/api_generator.py +365 -0
  33. judgeval-0.22.3/scripts/api_generator_v1.py +468 -0
  34. judgeval-0.22.3/scripts/openapi_transform.py +126 -0
  35. judgeval-0.22.3/scripts/update_types.sh +38 -0
  36. judgeval-0.22.3/src/judgeval/__init__.py +178 -0
  37. judgeval-0.22.3/src/judgeval/api/__init__.py +523 -0
  38. judgeval-0.22.3/src/judgeval/api/api_types.py +416 -0
  39. judgeval-0.22.3/src/judgeval/cli.py +112 -0
  40. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/constants.py +7 -57
  41. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/data/__init__.py +1 -3
  42. judgeval-0.22.3/src/judgeval/data/evaluation_run.py +125 -0
  43. judgeval-0.22.3/src/judgeval/data/example.py +35 -0
  44. judgeval-0.22.3/src/judgeval/data/judgment_types.py +459 -0
  45. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/data/result.py +12 -19
  46. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/data/scorer_data.py +5 -28
  47. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/data/scripts/openapi_transform.py +4 -4
  48. judgeval-0.22.3/src/judgeval/data/trace.py +121 -0
  49. judgeval-0.22.3/src/judgeval/dataset/__init__.py +264 -0
  50. judgeval-0.22.3/src/judgeval/env.py +53 -0
  51. judgeval-0.22.3/src/judgeval/evaluation/__init__.py +347 -0
  52. judgeval-0.22.3/src/judgeval/exceptions.py +28 -0
  53. judgeval-0.22.3/src/judgeval/integrations/langgraph/__init__.py +13 -0
  54. judgeval-0.22.3/src/judgeval/integrations/openlit/__init__.py +51 -0
  55. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/judges/__init__.py +2 -2
  56. judgeval-0.22.3/src/judgeval/judges/litellm_judge.py +129 -0
  57. judgeval-0.22.3/src/judgeval/judges/together_judge.py +136 -0
  58. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/judges/utils.py +7 -20
  59. judgeval-0.22.3/src/judgeval/judgment_attribute_keys.py +55 -0
  60. {judgeval-0.0.55/src/judgeval/common → judgeval-0.22.3/src/judgeval}/logger.py +24 -8
  61. judgeval-0.22.3/src/judgeval/prompt/__init__.py +330 -0
  62. judgeval-0.22.3/src/judgeval/scorers/__init__.py +29 -0
  63. judgeval-0.22.3/src/judgeval/scorers/agent_scorer.py +17 -0
  64. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/api_scorer.py +21 -23
  65. judgeval-0.22.3/src/judgeval/scorers/base_scorer.py +97 -0
  66. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/example_scorer.py +1 -3
  67. judgeval-0.22.3/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +25 -0
  68. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  69. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  70. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  71. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  72. judgeval-0.22.3/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  73. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/score.py +65 -47
  74. judgeval-0.22.3/src/judgeval/scorers/utils.py +14 -0
  75. judgeval-0.22.3/src/judgeval/tracer/__init__.py +1123 -0
  76. judgeval-0.22.3/src/judgeval/tracer/constants.py +1 -0
  77. judgeval-0.22.3/src/judgeval/tracer/exporters/__init__.py +40 -0
  78. judgeval-0.22.3/src/judgeval/tracer/exporters/s3.py +119 -0
  79. judgeval-0.22.3/src/judgeval/tracer/exporters/store.py +59 -0
  80. judgeval-0.22.3/src/judgeval/tracer/exporters/utils.py +32 -0
  81. judgeval-0.22.3/src/judgeval/tracer/keys.py +63 -0
  82. judgeval-0.22.3/src/judgeval/tracer/llm/__init__.py +7 -0
  83. judgeval-0.22.3/src/judgeval/tracer/llm/config.py +78 -0
  84. judgeval-0.22.3/src/judgeval/tracer/llm/constants.py +9 -0
  85. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  86. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  87. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  88. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  89. judgeval-0.22.3/src/judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  90. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/__init__.py +3 -0
  91. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/config.py +6 -0
  92. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  93. judgeval-0.22.3/src/judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  94. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  95. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  96. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  97. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/config.py +6 -0
  98. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/responses.py +506 -0
  99. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/utils.py +42 -0
  100. judgeval-0.22.3/src/judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  101. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/__init__.py +3 -0
  102. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  103. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/config.py +6 -0
  104. judgeval-0.22.3/src/judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  105. judgeval-0.22.3/src/judgeval/tracer/llm/providers.py +19 -0
  106. judgeval-0.22.3/src/judgeval/tracer/managers.py +167 -0
  107. judgeval-0.22.3/src/judgeval/tracer/processors/__init__.py +220 -0
  108. judgeval-0.22.3/src/judgeval/tracer/utils.py +19 -0
  109. judgeval-0.22.3/src/judgeval/trainer/__init__.py +14 -0
  110. judgeval-0.22.3/src/judgeval/trainer/base_trainer.py +122 -0
  111. judgeval-0.22.3/src/judgeval/trainer/config.py +123 -0
  112. judgeval-0.22.3/src/judgeval/trainer/console.py +144 -0
  113. judgeval-0.22.3/src/judgeval/trainer/fireworks_trainer.py +392 -0
  114. judgeval-0.22.3/src/judgeval/trainer/trainable_model.py +252 -0
  115. judgeval-0.22.3/src/judgeval/trainer/trainer.py +70 -0
  116. judgeval-0.22.3/src/judgeval/utils/async_utils.py +39 -0
  117. judgeval-0.22.3/src/judgeval/utils/decorators/__init__.py +0 -0
  118. judgeval-0.22.3/src/judgeval/utils/decorators/dont_throw.py +37 -0
  119. judgeval-0.22.3/src/judgeval/utils/decorators/use_once.py +13 -0
  120. judgeval-0.22.3/src/judgeval/utils/file_utils.py +97 -0
  121. judgeval-0.22.3/src/judgeval/utils/guards.py +36 -0
  122. judgeval-0.22.3/src/judgeval/utils/meta.py +27 -0
  123. judgeval-0.22.3/src/judgeval/utils/project.py +15 -0
  124. judgeval-0.22.3/src/judgeval/utils/serialize.py +253 -0
  125. judgeval-0.22.3/src/judgeval/utils/testing.py +70 -0
  126. judgeval-0.22.3/src/judgeval/utils/url.py +10 -0
  127. {judgeval-0.0.55/src/judgeval → judgeval-0.22.3/src/judgeval/utils}/version_check.py +5 -3
  128. judgeval-0.22.3/src/judgeval/utils/wrappers/README.md +3 -0
  129. judgeval-0.22.3/src/judgeval/utils/wrappers/__init__.py +15 -0
  130. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  131. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  132. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  133. judgeval-0.22.3/src/judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  134. judgeval-0.22.3/src/judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  135. judgeval-0.22.3/src/judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  136. judgeval-0.22.3/src/judgeval/utils/wrappers/py.typed +0 -0
  137. judgeval-0.22.3/src/judgeval/utils/wrappers/utils.py +35 -0
  138. judgeval-0.22.3/src/judgeval/v1/__init__.py +88 -0
  139. judgeval-0.22.3/src/judgeval/v1/data/__init__.py +7 -0
  140. judgeval-0.22.3/src/judgeval/v1/data/example.py +44 -0
  141. judgeval-0.22.3/src/judgeval/v1/data/scorer_data.py +42 -0
  142. judgeval-0.22.3/src/judgeval/v1/data/scoring_result.py +44 -0
  143. judgeval-0.22.3/src/judgeval/v1/datasets/__init__.py +6 -0
  144. judgeval-0.22.3/src/judgeval/v1/datasets/dataset.py +152 -0
  145. judgeval-0.22.3/src/judgeval/v1/datasets/dataset_factory.py +88 -0
  146. judgeval-0.22.3/src/judgeval/v1/evaluation/__init__.py +6 -0
  147. judgeval-0.22.3/src/judgeval/v1/evaluation/evaluation.py +184 -0
  148. judgeval-0.22.3/src/judgeval/v1/evaluation/evaluation_factory.py +17 -0
  149. judgeval-0.22.3/src/judgeval/v1/instrumentation/__init__.py +6 -0
  150. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/__init__.py +7 -0
  151. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/config.py +78 -0
  152. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/constants.py +11 -0
  153. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  154. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  155. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  156. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  157. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  158. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  159. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  160. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  161. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  162. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  163. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  164. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  165. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  166. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  167. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  168. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  169. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  170. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  171. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  172. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  173. judgeval-0.22.3/src/judgeval/v1/instrumentation/llm/providers.py +19 -0
  174. judgeval-0.22.3/src/judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  175. judgeval-0.22.3/src/judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  176. judgeval-0.22.3/src/judgeval/v1/integrations/langgraph/__init__.py +13 -0
  177. judgeval-0.22.3/src/judgeval/v1/integrations/openlit/__init__.py +47 -0
  178. judgeval-0.22.3/src/judgeval/v1/internal/api/__init__.py +525 -0
  179. judgeval-0.22.3/src/judgeval/v1/internal/api/api_types.py +416 -0
  180. judgeval-0.22.3/src/judgeval/v1/prompts/__init__.py +6 -0
  181. judgeval-0.22.3/src/judgeval/v1/prompts/prompt.py +29 -0
  182. judgeval-0.22.3/src/judgeval/v1/prompts/prompt_factory.py +189 -0
  183. judgeval-0.22.3/src/judgeval/v1/py.typed +0 -0
  184. judgeval-0.22.3/src/judgeval/v1/scorers/__init__.py +6 -0
  185. judgeval-0.22.3/src/judgeval/v1/scorers/api_scorer.py +82 -0
  186. judgeval-0.22.3/src/judgeval/v1/scorers/base_scorer.py +17 -0
  187. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/__init__.py +17 -0
  188. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  189. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  190. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  191. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  192. judgeval-0.22.3/src/judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  193. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  194. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  195. judgeval-0.22.3/src/judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  196. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  197. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  198. judgeval-0.22.3/src/judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  199. judgeval-0.22.3/src/judgeval/v1/scorers/scorers_factory.py +49 -0
  200. judgeval-0.22.3/src/judgeval/v1/tracer/__init__.py +7 -0
  201. judgeval-0.22.3/src/judgeval/v1/tracer/base_tracer.py +520 -0
  202. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/__init__.py +14 -0
  203. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  204. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  205. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  206. judgeval-0.22.3/src/judgeval/v1/tracer/exporters/span_store.py +50 -0
  207. judgeval-0.22.3/src/judgeval/v1/tracer/processors/__init__.py +6 -0
  208. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  209. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  210. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  211. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  212. judgeval-0.22.3/src/judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  213. judgeval-0.22.3/src/judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  214. judgeval-0.22.3/src/judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  215. judgeval-0.22.3/src/judgeval/v1/tracer/tracer.py +61 -0
  216. judgeval-0.22.3/src/judgeval/v1/tracer/tracer_factory.py +36 -0
  217. judgeval-0.22.3/src/judgeval/v1/trainers/__init__.py +5 -0
  218. judgeval-0.22.3/src/judgeval/v1/trainers/base_trainer.py +62 -0
  219. judgeval-0.22.3/src/judgeval/v1/trainers/config.py +123 -0
  220. judgeval-0.22.3/src/judgeval/v1/trainers/console.py +144 -0
  221. judgeval-0.22.3/src/judgeval/v1/trainers/fireworks_trainer.py +392 -0
  222. judgeval-0.22.3/src/judgeval/v1/trainers/trainable_model.py +252 -0
  223. judgeval-0.22.3/src/judgeval/v1/trainers/trainers_factory.py +37 -0
  224. judgeval-0.22.3/src/judgeval/v1/utils.py +18 -0
  225. judgeval-0.22.3/src/judgeval/version.py +5 -0
  226. judgeval-0.22.3/src/judgeval/warnings.py +4 -0
  227. judgeval-0.22.3/update_version.py +35 -0
  228. judgeval-0.22.3/uv.lock +5786 -0
  229. judgeval-0.0.55/.github/workflows/ci.yaml +0 -163
  230. judgeval-0.0.55/PKG-INFO +0 -1384
  231. judgeval-0.0.55/README.md +0 -1354
  232. judgeval-0.0.55/assets/logo-dark.svg +0 -23
  233. judgeval-0.0.55/assets/logo-light.svg +0 -18
  234. judgeval-0.0.55/assets/new_darkmode.svg +0 -29
  235. judgeval-0.0.55/assets/new_lightmode.svg +0 -34
  236. judgeval-0.0.55/assets/product_shot.png +0 -0
  237. judgeval-0.0.55/pyproject.toml +0 -77
  238. judgeval-0.0.55/src/.coveragerc +0 -4
  239. judgeval-0.0.55/src/judgeval/__init__.py +0 -13
  240. judgeval-0.0.55/src/judgeval/clients.py +0 -34
  241. judgeval-0.0.55/src/judgeval/common/__init__.py +0 -13
  242. judgeval-0.0.55/src/judgeval/common/exceptions.py +0 -27
  243. judgeval-0.0.55/src/judgeval/common/s3_storage.py +0 -98
  244. judgeval-0.0.55/src/judgeval/common/tracer.py +0 -3215
  245. judgeval-0.0.55/src/judgeval/common/utils.py +0 -940
  246. judgeval-0.0.55/src/judgeval/data/datasets/__init__.py +0 -4
  247. judgeval-0.0.55/src/judgeval/data/datasets/dataset.py +0 -341
  248. judgeval-0.0.55/src/judgeval/data/datasets/eval_dataset_client.py +0 -341
  249. judgeval-0.0.55/src/judgeval/data/example.py +0 -61
  250. judgeval-0.0.55/src/judgeval/data/judgment_types.py +0 -214
  251. judgeval-0.0.55/src/judgeval/data/tool.py +0 -5
  252. judgeval-0.0.55/src/judgeval/data/trace.py +0 -135
  253. judgeval-0.0.55/src/judgeval/data/trace_run.py +0 -40
  254. judgeval-0.0.55/src/judgeval/evaluation_run.py +0 -77
  255. judgeval-0.0.55/src/judgeval/integrations/langgraph.py +0 -964
  256. judgeval-0.0.55/src/judgeval/judges/litellm_judge.py +0 -68
  257. judgeval-0.0.55/src/judgeval/judges/mixture_of_judges.py +0 -286
  258. judgeval-0.0.55/src/judgeval/judges/together_judge.py +0 -65
  259. judgeval-0.0.55/src/judgeval/judgment_client.py +0 -563
  260. judgeval-0.0.55/src/judgeval/rules.py +0 -521
  261. judgeval-0.0.55/src/judgeval/run_evaluation.py +0 -1086
  262. judgeval-0.0.55/src/judgeval/scorers/__init__.py +0 -33
  263. judgeval-0.0.55/src/judgeval/scorers/agent_scorer.py +0 -21
  264. judgeval-0.0.55/src/judgeval/scorers/base_scorer.py +0 -98
  265. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -47
  266. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  267. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  268. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  269. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  270. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  271. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  272. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  273. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  274. judgeval-0.0.55/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  275. judgeval-0.0.55/src/judgeval/scorers/utils.py +0 -131
  276. judgeval-0.0.55/src/judgeval/tracer/__init__.py +0 -3
  277. judgeval-0.0.55/src/judgeval/utils/alerts.py +0 -93
  278. judgeval-0.0.55/src/judgeval/utils/file_utils.py +0 -51
  279. judgeval-0.0.55/src/judgeval/utils/requests.py +0 -29
  280. judgeval-0.0.55/src/update_types.sh +0 -14
  281. judgeval-0.0.55/update_version.py +0 -32
  282. judgeval-0.0.55/uv.lock +0 -3789
  283. {judgeval-0.0.55 → judgeval-0.22.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  284. {judgeval-0.0.55 → judgeval-0.22.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  285. {judgeval-0.0.55 → judgeval-0.22.3}/.github/workflows/blocked-pr.yaml +0 -0
  286. {judgeval-0.0.55 → judgeval-0.22.3}/.github/workflows/merge-branch-check.yaml +0 -0
  287. {judgeval-0.0.55 → judgeval-0.22.3}/.github/workflows/validate-branch.yaml +0 -0
  288. {judgeval-0.0.55 → judgeval-0.22.3}/.gitignore +0 -0
  289. {judgeval-0.0.55 → judgeval-0.22.3}/LICENSE.md +0 -0
  290. {judgeval-0.0.55 → judgeval-0.22.3}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  291. {judgeval-0.0.55 → judgeval-0.22.3}/assets/agent.gif +0 -0
  292. {judgeval-0.0.55 → judgeval-0.22.3}/assets/data.gif +0 -0
  293. {judgeval-0.0.55 → judgeval-0.22.3}/assets/dataset_clustering_screenshot.png +0 -0
  294. {judgeval-0.0.55 → judgeval-0.22.3}/assets/dataset_clustering_screenshot_dm.png +0 -0
  295. {judgeval-0.0.55 → judgeval-0.22.3}/assets/datasets_preview_screenshot.png +0 -0
  296. {judgeval-0.0.55 → judgeval-0.22.3}/assets/document.gif +0 -0
  297. {judgeval-0.0.55 → judgeval-0.22.3}/assets/error_analysis_dashboard.png +0 -0
  298. {judgeval-0.0.55 → judgeval-0.22.3}/assets/experiments_dashboard_screenshot.png +0 -0
  299. {judgeval-0.0.55 → judgeval-0.22.3}/assets/experiments_page.png +0 -0
  300. {judgeval-0.0.55 → judgeval-0.22.3}/assets/experiments_pagev2.png +0 -0
  301. {judgeval-0.0.55 → judgeval-0.22.3}/assets/monitoring_screenshot.png +0 -0
  302. {judgeval-0.0.55 → judgeval-0.22.3}/assets/trace.gif +0 -0
  303. {judgeval-0.0.55 → judgeval-0.22.3}/assets/trace_demo.png +0 -0
  304. {judgeval-0.0.55 → judgeval-0.22.3}/assets/trace_screenshot.png +0 -0
  305. {judgeval-0.0.55 → judgeval-0.22.3}/assets/trace_screenshot_old.png +0 -0
  306. {judgeval-0.0.55 → judgeval-0.22.3}/pytest.ini +0 -0
  307. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  308. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/judges/base_judge.py +0 -0
  309. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/exceptions.py +0 -0
  310. {judgeval-0.0.55 → judgeval-0.22.3}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: 🤔 Questions or discussions
4
+ url: https://github.com/orgs/JudgmentLabs/discussions
5
+ about: Please use discussions for questions
@@ -10,14 +10,7 @@
10
10
  -->
11
11
  - [ ] 1. ...
12
12
 
13
- ## 🎥 Demo of Changes
14
-
15
- <!-- Add a short 1-3 minute video describing/demoing the changes -->
16
-
17
13
  ## ✅ Checklist
18
14
 
19
- - [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
20
- - [ ] Video demo of changes
21
- - [ ] Reviewers assigned
22
15
  - [ ] Docs updated ([if necessary](https://github.com/JudgmentLabs/docs))
23
- - [ ] Cookbooks updated ([if necessary](https://github.com/JudgmentLabs/judgment-cookbook))
16
+ - [ ] Changelogs are updated ([if necessary](https://github.com/JudgmentLabs/docs/tree/main/content/docs/changelog/%28weekly%29))
@@ -0,0 +1,148 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ types: [opened, synchronize, reopened]
6
+
7
+ permissions: read-all
8
+
9
+ jobs:
10
+ validate-branch:
11
+ uses: ./.github/workflows/merge-branch-check.yaml
12
+
13
+ run-tests:
14
+ needs: [validate-branch]
15
+ if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ os: [ubuntu-latest, macos-latest]
20
+ python-version:
21
+ - "3.10"
22
+ - "3.11"
23
+ - "3.12"
24
+ - "3.13"
25
+ name: Unit Tests
26
+ runs-on: ${{ matrix.os }}
27
+ env:
28
+ PYTHONPATH: "."
29
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
30
+ TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
31
+ GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
32
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
33
+ OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
34
+ JUDGMENT_DEV: true
35
+
36
+ steps:
37
+ - name: Checkout code
38
+ uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v4
42
+ with:
43
+ python-version: ${{ matrix.python-version }}
44
+
45
+ - name: Install dependencies
46
+ run: |
47
+ pip install uv
48
+ uv sync --dev
49
+
50
+ - name: Install Claude Code CLI
51
+ run: |
52
+ npm install -g @anthropic-ai/claude-code
53
+
54
+ - name: Run tests
55
+ run: |
56
+ cd src
57
+ export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
58
+ export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
59
+ uv run pytest tests -n auto
60
+
61
+ run-e2e-tests:
62
+ needs: [validate-branch]
63
+ if: "(github.base_ref == 'staging' || github.base_ref == 'main') && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
64
+ strategy:
65
+ fail-fast: false
66
+ matrix:
67
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
68
+ name: E2E Tests
69
+ runs-on: ubuntu-latest
70
+ env:
71
+ TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
72
+ steps:
73
+ - name: Configure AWS Credentials
74
+ uses: aws-actions/configure-aws-credentials@v4
75
+ with:
76
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
77
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
78
+ aws-region: us-west-1
79
+
80
+ - name: Checkout code
81
+ uses: actions/checkout@v4
82
+
83
+ - name: Set env based on branch
84
+ run: |
85
+ if [ "${{ github.base_ref }}" = "main" ]; then
86
+ echo "TARGET_ENV=main" >> "$GITHUB_ENV"
87
+ echo "BASE_URL=https://api.judgmentlabs.ai" >> "$GITHUB_ENV"
88
+ echo "SECRETS_PATH=prod/api-keys/e2e-tests" >> "$GITHUB_ENV"
89
+ echo "COVERAGE_ARTIFACT=coverage-html-production-${{ matrix.python-version }}" >> "$GITHUB_ENV"
90
+ else
91
+ echo "TARGET_ENV=staging" >> "$GITHUB_ENV"
92
+ echo "BASE_URL=https://staging.api.judgmentlabs.ai" >> "$GITHUB_ENV"
93
+ echo "SECRETS_PATH=stg/api-keys/e2e-tests" >> "$GITHUB_ENV"
94
+ echo "COVERAGE_ARTIFACT=coverage-html-staging-${{ matrix.python-version }}" >> "$GITHUB_ENV"
95
+ fi
96
+
97
+ - name: Restore uv cache
98
+ uses: actions/cache/restore@v4
99
+ id: restore-uv-cache
100
+ with:
101
+ path: ~/.cache/uv/
102
+ key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}
103
+ restore-keys: |
104
+ ${{ runner.os }}-uv-judgment-
105
+ ${{ runner.os }}-uv-
106
+
107
+ - name: Set up Python
108
+ uses: actions/setup-python@v4
109
+ with:
110
+ python-version: ${{ matrix.python-version }}
111
+
112
+ - name: Install judgeval dependencies
113
+ run: |
114
+ pip install uv
115
+ uv sync --dev
116
+
117
+ - name: Check if server is running
118
+ run: |
119
+ if ! curl -s "$BASE_URL/health" > /dev/null; then
120
+ echo "Judgment server ($BASE_URL) is not running properly. Check CloudWatch logs."
121
+ exit 1
122
+ else
123
+ echo "Server is running."
124
+ fi
125
+
126
+ - name: Run E2E tests
127
+ working-directory: src
128
+ run: |
129
+ SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id "$SECRETS_PATH" --query SecretString --output text)
130
+ export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
131
+ export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
132
+ export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
133
+ export JUDGMENT_API_URL="$BASE_URL"
134
+ timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
135
+
136
+ - name: Upload coverage HTML report
137
+ if: always()
138
+ uses: actions/upload-artifact@v4
139
+ with:
140
+ name: ${{ env.COVERAGE_ARTIFACT }}
141
+ path: src/htmlcov
142
+
143
+ - name: Save uv cache
144
+ uses: actions/cache/save@v4
145
+ if: always() && steps.restore-uv-cache.outputs.cache-hit != 'true'
146
+ with:
147
+ path: ~/.cache/uv/
148
+ key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}
@@ -0,0 +1,35 @@
1
+ name: Claude Code Review
2
+
3
+ on:
4
+ issue_comment:
5
+ types: [created]
6
+ jobs:
7
+ claude-review:
8
+ if: github.event.issue.pull_request && contains(github.event.comment.body, '/claude review')
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ contents: read
12
+ pull-requests: read
13
+ issues: read
14
+ id-token: write
15
+
16
+ steps:
17
+ - name: Checkout repository
18
+ uses: actions/checkout@v4
19
+ with:
20
+ fetch-depth: 1
21
+
22
+ - name: Run Claude Code Review
23
+ id: claude-review
24
+ uses: anthropics/claude-code-action@beta
25
+ with:
26
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
27
+ direct_prompt: |
28
+ Please review this pull request and provide feedback on:
29
+ - Code quality and best practices
30
+ - Potential bugs or issues
31
+ - Performance considerations
32
+ - Security concerns
33
+ - Test coverage
34
+
35
+ Be constructive and helpful in your feedback.
@@ -0,0 +1,40 @@
1
+ name: Claude Code
2
+
3
+ on:
4
+ issue_comment:
5
+ types: [created]
6
+ pull_request_review_comment:
7
+ types: [created]
8
+ issues:
9
+ types: [opened, assigned]
10
+ pull_request_review:
11
+ types: [submitted]
12
+
13
+ jobs:
14
+ claude:
15
+ if: |
16
+ (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
17
+ (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
18
+ (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
19
+ (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
20
+ runs-on: ubuntu-latest
21
+ permissions:
22
+ contents: read
23
+ pull-requests: read
24
+ issues: read
25
+ id-token: write
26
+ actions: read
27
+ steps:
28
+ - name: Checkout repository
29
+ uses: actions/checkout@v4
30
+ with:
31
+ fetch-depth: 1
32
+
33
+ - name: Run Claude Code
34
+ id: claude
35
+ uses: anthropics/claude-code-action@beta
36
+ with:
37
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
38
+
39
+ additional_permissions: |
40
+ actions: read
@@ -10,20 +10,11 @@ jobs:
10
10
  steps:
11
11
  - uses: actions/checkout@v4
12
12
 
13
- - name: Set up Python
14
- uses: actions/setup-python@v5
15
- with:
16
- python-version: '3.11'
17
-
18
13
  - name: Install ruff
19
14
  uses: astral-sh/ruff-action@v3
20
15
  with:
21
16
  args: "--version"
22
17
 
23
- - name: Install mypy and dependencies
24
- run: |
25
- pip install mypy types-requests types-PyYAML
26
-
27
18
  - name: Run ruff formatter
28
19
  if: always()
29
20
  run: ruff format --check .
@@ -31,7 +22,3 @@ jobs:
31
22
  - name: Run ruff linter
32
23
  if: always()
33
24
  run: ruff check .
34
-
35
- - name: Run mypy
36
- if: always()
37
- run: mypy --explicit-package-bases --ignore-missing-imports .
@@ -0,0 +1,25 @@
1
+ name: MyPy Check
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [ main, staging ]
6
+
7
+ jobs:
8
+ mypy:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: '3.11'
17
+
18
+ - name: Install dependencies
19
+ run: |
20
+ pip install uv
21
+ uv sync --dev
22
+
23
+ - name: Run mypy
24
+ if: always()
25
+ run: uv run mypy ./src/judgeval/
@@ -0,0 +1,38 @@
1
+ name: Pre-commit auto-update
2
+ on:
3
+ schedule:
4
+ - cron: '0 0 * * 1' # Weekly on Monday at midnight UTC
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ auto-update:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Checkout repository
12
+ uses: actions/checkout@v4
13
+ with:
14
+ ref: staging
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.11'
20
+
21
+ - name: Install and update pre-commit
22
+ run: |
23
+ pip install pre-commit
24
+ pre-commit autoupdate
25
+
26
+ - name: Create Pull Request
27
+ uses: peter-evans/create-pull-request@v7
28
+ with:
29
+ commit-message: 'chore: update pre-commit hooks'
30
+ title: 'chore: update pre-commit hooks'
31
+ body: |
32
+ Auto-generated PR to update pre-commit hook versions.
33
+
34
+ Please review the changes and merge if everything looks good.
35
+
36
+ Updated by GitHub Actions on {{ date }}.
37
+ branch: update-pre-commit-hooks
38
+ base: staging
@@ -28,6 +28,18 @@ jobs:
28
28
  version=$(curl -s https://pypi.org/pypi/judgeval/json | jq -r .info.version)
29
29
  echo "latest_version=$version" >> $GITHUB_OUTPUT
30
30
 
31
+ - name: Determine bump type (minor if commit message starts with [Bump Minor Version], else patch)
32
+ id: bump_type
33
+ run: |
34
+ # Get the latest commit message
35
+ commit_message=$(git log -1 --pretty=%B)
36
+ # Default bump type
37
+ bump_type=patch
38
+ if [[ "$commit_message" == "[Bump Minor Version]"* ]]; then
39
+ bump_type=minor
40
+ fi
41
+ echo "bump_type=$bump_type" >> $GITHUB_OUTPUT
42
+
31
43
  - name: Bump version and create new tag
32
44
  id: bump_tag
33
45
  run: |
@@ -37,9 +49,29 @@ jobs:
37
49
  # Extract version numbers
38
50
  IFS='.' read -r major minor patch <<< "$latest_version"
39
51
 
40
- # Bump patch version
41
- patch=$((patch + 1))
42
- new_version="$major.$minor.$patch"
52
+ # Set major version (manually modify this if you want to change the major version)
53
+ new_major_version=0
54
+
55
+ # Validate that new major version is greater than or equal to current major version
56
+ if [ "$new_major_version" -lt "$major" ]; then
57
+ echo "Error: New major version ($new_major_version) must be greater than or equal to current major version ($major)"
58
+ exit 1
59
+ fi
60
+
61
+ if [ "$new_major_version" -ne "$major" ]; then # If major version changed, set minor and patch to 0
62
+ echo "Major version bumped, setting minor and patch to 0"
63
+ minor=0
64
+ patch=0
65
+ elif [ "${{ steps.bump_type.outputs.bump_type }}" = "minor" ]; then
66
+ echo "Minor version bumped, setting patch to 0"
67
+ minor=$((minor + 1))
68
+ patch=0
69
+ else
70
+ echo "Patch version bumped"
71
+ patch=$((patch + 1))
72
+ fi
73
+
74
+ new_version="$new_major_version.$minor.$patch"
43
75
 
44
76
  echo "New version: $new_version"
45
77
  echo "new_version=$new_version" >> $GITHUB_OUTPUT
@@ -59,7 +91,7 @@ jobs:
59
91
  env:
60
92
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
61
93
 
62
- - name: Bump pyproject.toml version
94
+ - name: Bump pyproject.toml and version.py version
63
95
  run: |
64
96
  python update_version.py ${{ steps.bump_tag.outputs.new_version }}
65
97
 
@@ -1,11 +1,11 @@
1
1
  repos:
2
2
  - repo: https://github.com/astral-sh/uv-pre-commit
3
- rev: 0.7.14
3
+ rev: 0.9.7
4
4
  hooks:
5
5
  - id: uv-lock
6
6
 
7
7
  - repo: https://github.com/astral-sh/ruff-pre-commit
8
- rev: v0.12.0
8
+ rev: v0.14.3
9
9
  hooks:
10
10
  - id: ruff
11
11
  name: ruff (linter)
@@ -14,8 +14,10 @@ repos:
14
14
  name: ruff (formatter)
15
15
 
16
16
  - repo: https://github.com/pre-commit/mirrors-mypy
17
- rev: v1.16.1
17
+ rev: v1.18.2
18
18
  hooks:
19
19
  - id: mypy
20
- args: [--explicit-package-bases, --ignore-missing-imports]
21
- additional_dependencies: [types-requests, types-PyYAML]
20
+ language: system
21
+ # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
22
+ verbose: true
23
+ entry: bash -c 'mypy src/judgeval/ || true'
@@ -0,0 +1,10 @@
1
+ # Contribute to Judgeval
2
+
3
+ There are many ways to contribute to Judgeval:
4
+
5
+ - Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
6
+ - Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
7
+ - Speak or write about Judgment and let us know!
8
+
9
+ <!-- Contributors collage -->
10
+ [![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)