judgeval 0.0.39__tar.gz → 0.0.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. judgeval-0.0.41/.github/workflows/blocked-pr.yaml +19 -0
  2. judgeval-0.0.41/.github/workflows/ci-staging.yaml +103 -0
  3. {judgeval-0.0.39 → judgeval-0.0.41}/.github/workflows/ci.yaml +13 -3
  4. judgeval-0.0.41/.github/workflows/merge-to-main.yaml +32 -0
  5. judgeval-0.0.41/.github/workflows/release.yaml +92 -0
  6. judgeval-0.0.41/PKG-INFO +1450 -0
  7. {judgeval-0.0.39 → judgeval-0.0.41}/Pipfile +1 -0
  8. {judgeval-0.0.39 → judgeval-0.0.41}/Pipfile.lock +85 -4
  9. judgeval-0.0.41/README.md +1422 -0
  10. judgeval-0.0.41/assets/experiments_pagev2.png +0 -0
  11. judgeval-0.0.41/assets/new_darkmode.svg +29 -0
  12. judgeval-0.0.41/assets/new_lightmode.svg +34 -0
  13. {judgeval-0.0.39 → judgeval-0.0.41}/docs/monitoring/tracing.mdx +1 -1
  14. {judgeval-0.0.39 → judgeval-0.0.41}/pyproject.toml +2 -2
  15. judgeval-0.0.41/src/.coveragerc +4 -0
  16. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/clients.py +6 -4
  17. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/tracer.py +504 -257
  18. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/utils.py +5 -1
  19. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/constants.py +2 -0
  20. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/__init__.py +2 -1
  21. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/datasets/dataset.py +12 -6
  22. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/datasets/eval_dataset_client.py +3 -1
  23. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/example.py +7 -7
  24. judgeval-0.0.41/src/judgeval/data/tool.py +47 -0
  25. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/trace.py +31 -39
  26. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/trace_run.py +2 -1
  27. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/evaluation_run.py +4 -7
  28. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judgment_client.py +34 -7
  29. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/run_evaluation.py +67 -19
  30. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/__init__.py +4 -1
  31. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorer.py +12 -1
  32. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
  33. judgeval-0.0.41/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
  34. judgeval-0.0.41/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
  35. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
  36. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/prompt_scorer.py +8 -164
  37. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/score.py +15 -15
  38. judgeval-0.0.41/update_version.py +32 -0
  39. judgeval-0.0.39/PKG-INFO +0 -247
  40. judgeval-0.0.39/README.md +0 -219
  41. judgeval-0.0.39/src/judgeval/data/tool.py +0 -19
  42. {judgeval-0.0.39 → judgeval-0.0.41}/.github/pull_request_template.md +0 -0
  43. {judgeval-0.0.39 → judgeval-0.0.41}/.gitignore +0 -0
  44. {judgeval-0.0.39 → judgeval-0.0.41}/LICENSE.md +0 -0
  45. {judgeval-0.0.39 → judgeval-0.0.41}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  46. {judgeval-0.0.39 → judgeval-0.0.41}/assets/dataset_clustering_screenshot.png +0 -0
  47. {judgeval-0.0.39 → judgeval-0.0.41}/assets/dataset_clustering_screenshot_dm.png +0 -0
  48. {judgeval-0.0.39 → judgeval-0.0.41}/assets/datasets_preview_screenshot.png +0 -0
  49. {judgeval-0.0.39 → judgeval-0.0.41}/assets/experiments_dashboard_screenshot.png +0 -0
  50. {judgeval-0.0.39 → judgeval-0.0.41}/assets/experiments_page.png +0 -0
  51. {judgeval-0.0.39 → judgeval-0.0.41}/assets/logo-dark.svg +0 -0
  52. {judgeval-0.0.39 → judgeval-0.0.41}/assets/logo-light.svg +0 -0
  53. {judgeval-0.0.39 → judgeval-0.0.41}/assets/monitoring_screenshot.png +0 -0
  54. {judgeval-0.0.39 → judgeval-0.0.41}/assets/trace_screenshot.png +0 -0
  55. {judgeval-0.0.39 → judgeval-0.0.41}/docs/README.md +0 -0
  56. {judgeval-0.0.39 → judgeval-0.0.41}/docs/alerts/notifications.mdx +0 -0
  57. {judgeval-0.0.39 → judgeval-0.0.41}/docs/alerts/platform_notifications.mdx +0 -0
  58. {judgeval-0.0.39 → judgeval-0.0.41}/docs/alerts/rules.mdx +0 -0
  59. {judgeval-0.0.39 → judgeval-0.0.41}/docs/api_reference/judgment_client.mdx +0 -0
  60. {judgeval-0.0.39 → judgeval-0.0.41}/docs/api_reference/trace.mdx +0 -0
  61. {judgeval-0.0.39 → judgeval-0.0.41}/docs/changelog/2025-04-21.mdx +0 -0
  62. {judgeval-0.0.39 → judgeval-0.0.41}/docs/clustering/clustering.mdx +0 -0
  63. {judgeval-0.0.39 → judgeval-0.0.41}/docs/compliance/certifications.mdx +0 -0
  64. {judgeval-0.0.39 → judgeval-0.0.41}/docs/development.mdx +0 -0
  65. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/code.mdx +0 -0
  66. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/images.mdx +0 -0
  67. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/markdown.mdx +0 -0
  68. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/navigation.mdx +0 -0
  69. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/reusable-snippets.mdx +0 -0
  70. {judgeval-0.0.39 → judgeval-0.0.41}/docs/essentials/settings.mdx +0 -0
  71. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/data_datasets.mdx +0 -0
  72. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/data_examples.mdx +0 -0
  73. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/data_sequences.mdx +0 -0
  74. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/experiment_comparisons.mdx +0 -0
  75. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/introduction.mdx +0 -0
  76. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/judges.mdx +0 -0
  77. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
  78. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  79. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  80. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
  81. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
  82. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/comparison.mdx +0 -0
  83. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
  84. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
  85. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
  86. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
  87. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
  88. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
  89. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
  90. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/default/summarization.mdx +0 -0
  91. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/scorers/introduction.mdx +0 -0
  92. {judgeval-0.0.39 → judgeval-0.0.41}/docs/evaluation/unit_testing.mdx +0 -0
  93. {judgeval-0.0.39 → judgeval-0.0.41}/docs/favicon.svg +0 -0
  94. {judgeval-0.0.39 → judgeval-0.0.41}/docs/getting_started.mdx +0 -0
  95. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/annotation_queue_ui.png +0 -0
  96. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/basic_trace_example.png +0 -0
  97. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/checks-passed.png +0 -0
  98. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/cluster.png +0 -0
  99. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/cluster_button.png +0 -0
  100. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/create_aggressive_scorer.png +0 -0
  101. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/create_scorer.png +0 -0
  102. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/dashboard_annotation_queue_button.png +0 -0
  103. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/evaluation_diagram.png +0 -0
  104. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/experiment-comparison-page-2.png +0 -0
  105. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/experiment-page-comparison.png +0 -0
  106. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/experiment-popout-comparison.png +0 -0
  107. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/experiments-page-comparison-2.png +0 -0
  108. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/experiments-page-comparison.png +0 -0
  109. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/export-dataset.png +0 -0
  110. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/hero-dark.svg +0 -0
  111. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/hero-light.svg +0 -0
  112. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/notifications_page.png +0 -0
  113. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/online_eval_fault.png +0 -0
  114. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/reports_modal.png +0 -0
  115. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/synth_data_button.png +0 -0
  116. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/synth_data_window.png +0 -0
  117. {judgeval-0.0.39 → judgeval-0.0.41}/docs/images/trace_ss.png +0 -0
  118. {judgeval-0.0.39 → judgeval-0.0.41}/docs/integration/langgraph.mdx +0 -0
  119. {judgeval-0.0.39 → judgeval-0.0.41}/docs/introduction.mdx +0 -0
  120. {judgeval-0.0.39 → judgeval-0.0.41}/docs/judgment_cli/installation.mdx +0 -0
  121. {judgeval-0.0.39 → judgeval-0.0.41}/docs/judgment_cli/self-hosting.mdx +0 -0
  122. {judgeval-0.0.39 → judgeval-0.0.41}/docs/judgment_cli/supabase-org-id.png +0 -0
  123. {judgeval-0.0.39 → judgeval-0.0.41}/docs/logo/dark.svg +0 -0
  124. {judgeval-0.0.39 → judgeval-0.0.41}/docs/logo/light.svg +0 -0
  125. {judgeval-0.0.39 → judgeval-0.0.41}/docs/mint.json +0 -0
  126. {judgeval-0.0.39 → judgeval-0.0.41}/docs/monitoring/annotations.mdx +0 -0
  127. {judgeval-0.0.39 → judgeval-0.0.41}/docs/monitoring/introduction.mdx +0 -0
  128. {judgeval-0.0.39 → judgeval-0.0.41}/docs/monitoring/production_insights.mdx +0 -0
  129. {judgeval-0.0.39 → judgeval-0.0.41}/docs/monitoring/tracing_s3.mdx +0 -0
  130. {judgeval-0.0.39 → judgeval-0.0.41}/docs/notebooks/create_dataset.ipynb +0 -0
  131. {judgeval-0.0.39 → judgeval-0.0.41}/docs/notebooks/create_scorer.ipynb +0 -0
  132. {judgeval-0.0.39 → judgeval-0.0.41}/docs/notebooks/demo.ipynb +0 -0
  133. {judgeval-0.0.39 → judgeval-0.0.41}/docs/notebooks/prompt_scorer.ipynb +0 -0
  134. {judgeval-0.0.39 → judgeval-0.0.41}/docs/notebooks/quickstart.ipynb +0 -0
  135. {judgeval-0.0.39 → judgeval-0.0.41}/docs/optimization/osiris_agent.mdx +0 -0
  136. {judgeval-0.0.39 → judgeval-0.0.41}/docs/quickstart.mdx +0 -0
  137. {judgeval-0.0.39 → judgeval-0.0.41}/docs/self_hosting/get_started.mdx +0 -0
  138. {judgeval-0.0.39 → judgeval-0.0.41}/docs/snippets/snippet-intro.mdx +0 -0
  139. {judgeval-0.0.39 → judgeval-0.0.41}/docs/synthetic_data/synthetic_data.mdx +0 -0
  140. {judgeval-0.0.39 → judgeval-0.0.41}/pytest.ini +0 -0
  141. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/__init__.py +0 -0
  142. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/__init__.py +0 -0
  143. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/exceptions.py +0 -0
  144. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/logger.py +0 -0
  145. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/common/s3_storage.py +0 -0
  146. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/custom_example.py +0 -0
  147. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/datasets/__init__.py +0 -0
  148. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/result.py +0 -0
  149. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/data/scorer_data.py +0 -0
  150. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/integrations/langgraph.py +0 -0
  151. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/__init__.py +0 -0
  152. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/base_judge.py +0 -0
  153. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/litellm_judge.py +0 -0
  154. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/mixture_of_judges.py +0 -0
  155. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/together_judge.py +0 -0
  156. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/judges/utils.py +0 -0
  157. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/rules.py +0 -0
  158. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/api_scorer.py +0 -0
  159. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/exceptions.py +0 -0
  160. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  161. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  162. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  163. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  164. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  165. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  166. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  167. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  168. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  169. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  170. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  171. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  172. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  173. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  174. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  175. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  176. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  177. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  178. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/scorers/utils.py +0 -0
  179. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/tracer/__init__.py +0 -0
  180. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/utils/alerts.py +0 -0
  181. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/utils/data_utils.py +0 -0
  182. {judgeval-0.0.39 → judgeval-0.0.41}/src/judgeval/version_check.py +0 -0
@@ -0,0 +1,19 @@
1
+ name: Check Blocked PR
2
+
3
+ on:
4
+ pull_request:
5
+ types:
6
+ - opened
7
+ - labeled
8
+ - unlabeled
9
+ - synchronize
10
+
11
+ jobs:
12
+ fail-for-blocked:
13
+ if: contains(github.event.pull_request.labels.*.name, 'Blocked')
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Fail if PR is blocked
17
+ run: |
18
+ echo "This PR is currently blocked. Please unblock it before merging."
19
+ exit 1
@@ -0,0 +1,103 @@
1
+
2
+ name: Staging CI Tests
3
+
4
+ on:
5
+ pull_request:
6
+ types: [opened, synchronize, reopened]
7
+ branches:
8
+ - staging
9
+
10
+ permissions: read-all
11
+
12
+ jobs:
13
+ run-tests:
14
+ strategy:
15
+ fail-fast: false
16
+ matrix:
17
+ os: [ubuntu-latest, macos-latest]
18
+ python-version:
19
+ - "3.11"
20
+ name: Test
21
+ runs-on: ${{ matrix.os }}
22
+ env:
23
+ PYTHONPATH: "."
24
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
25
+ TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
26
+ JUDGMENT_DEV: true
27
+
28
+ steps:
29
+ - name: Checkout code
30
+ uses: actions/checkout@v4
31
+
32
+ - name: Set up Python
33
+ uses: actions/setup-python@v4
34
+ with:
35
+ python-version: ${{ matrix.python-version }}
36
+
37
+ - name: Install dependencies
38
+ run: |
39
+ pip install pipenv
40
+ pipenv install --dev
41
+
42
+
43
+ - name: Run tests
44
+ run: |
45
+ cd src
46
+ pipenv run pytest tests
47
+
48
+ run-e2e-tests-staging:
49
+ if: "!contains(github.actor, '[bot]')" # Exclude if the actor is a bot
50
+ name: Staging E2E Tests
51
+ runs-on: ubuntu-latest
52
+ env:
53
+ TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
54
+ steps:
55
+ - name: Wait for turn
56
+ uses: softprops/turnstyle@v2
57
+ with:
58
+ poll-interval-seconds: 10
59
+ same-branch-only: false
60
+ job-to-wait-for: "Staging E2E Tests"
61
+
62
+ - name: Configure AWS Credentials
63
+ uses: aws-actions/configure-aws-credentials@v4
64
+ with:
65
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
66
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
67
+ aws-region: us-west-1
68
+
69
+ - name: Checkout code
70
+ uses: actions/checkout@v4
71
+
72
+ - name: Set up Python
73
+ uses: actions/setup-python@v4
74
+ with:
75
+ python-version: "3.11"
76
+
77
+ - name: Install judgeval dependencies
78
+ run: |
79
+ pip install pipenv
80
+ pipenv install --dev
81
+
82
+ - name: Check if server is running
83
+ run: |
84
+ if ! curl -s https://staging.api.judgmentlabs.ai/health > /dev/null; then
85
+ echo "Staging Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
86
+ exit 1
87
+ else
88
+ echo "Staging server is running."
89
+ fi
90
+
91
+ - name: Run E2E tests
92
+ working-directory: src
93
+ run: |
94
+ SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-stg-judgeval/api-keys/judgeval --query SecretString --output text)
95
+ export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
96
+ timeout ${TEST_TIMEOUT_SECONDS}s pipenv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
97
+
98
+ - name: Upload coverage HTML report
99
+ if: always()
100
+ uses: actions/upload-artifact@v4
101
+ with:
102
+ name: coverage-html
103
+ path: src/htmlcov
@@ -1,3 +1,4 @@
1
+
1
2
  name: CI Tests
2
3
 
3
4
  on:
@@ -48,6 +49,8 @@ jobs:
48
49
  if: "!contains(github.actor, '[bot]')" # Exclude if the actor is a bot
49
50
  name: E2E Tests
50
51
  runs-on: ubuntu-latest
52
+ env:
53
+ TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
51
54
  steps:
52
55
  - name: Wait for turn
53
56
  uses: softprops/turnstyle@v2
@@ -78,7 +81,7 @@ jobs:
78
81
 
79
82
  - name: Check if server is running
80
83
  run: |
81
- if ! curl -s http://api.judgmentlabs.ai/health > /dev/null; then
84
+ if ! curl -s https://api.judgmentlabs.ai/health > /dev/null; then
82
85
  echo "Production Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
83
86
  exit 1
84
87
  else
@@ -88,6 +91,13 @@ jobs:
88
91
  - name: Run E2E tests
89
92
  working-directory: src
90
93
  run: |
91
- SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions/api-keys/judgeval --query SecretString --output text)
94
+ SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-judgeval/api-keys/judgeval --query SecretString --output text)
92
95
  export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
93
- pipenv run pytest --durations=0 ./e2etests
96
+ timeout ${TEST_TIMEOUT_SECONDS}s pipenv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
97
+
98
+ - name: Upload coverage HTML report
99
+ if: always()
100
+ uses: actions/upload-artifact@v4
101
+ with:
102
+ name: coverage-html
103
+ path: src/htmlcov
@@ -0,0 +1,32 @@
1
+ name: Enforce Main Branch Protection
2
+
3
+ on:
4
+ pull_request:
5
+ types: [opened, synchronize, reopened, edited]
6
+
7
+ jobs:
8
+ validate-branch:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Check branch name
12
+ run: |
13
+ # Get the base and source branch names
14
+ BASE_BRANCH="${{ github.base_ref }}"
15
+ SOURCE_BRANCH="${{ github.head_ref }}"
16
+
17
+ echo "BASE_BRANCH: $BASE_BRANCH"
18
+ echo "SOURCE_BRANCH: $SOURCE_BRANCH"
19
+
20
+ # Only run validation if the base branch is main
21
+ if [[ "$BASE_BRANCH" != "main" ]]; then
22
+ echo "Skipping branch validation - not targeting main branch"
23
+ exit 0
24
+ fi
25
+
26
+ # Check if the source branch is staging or starts with hotfix/
27
+ if [[ "$SOURCE_BRANCH" != "staging" && ! "$SOURCE_BRANCH" =~ ^hotfix/ ]]; then
28
+ echo "::error::Pull requests to main can only be created from 'staging' or 'hotfix/*' branches. Current branch: $SOURCE_BRANCH"
29
+ exit 1
30
+ fi
31
+
32
+ echo "Branch validation passed. Source branch: $SOURCE_BRANCH"
@@ -0,0 +1,92 @@
1
+ name: Release on Main Merge
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ release:
10
+ runs-on: ubuntu-latest
11
+ outputs:
12
+ new_version: ${{ steps.bump_tag.outputs.new_version }}
13
+
14
+ steps:
15
+ - name: Checkout code
16
+ uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 0
19
+
20
+ - name: Install Python
21
+ uses: actions/setup-python@v4
22
+ with:
23
+ python-version: 3.11
24
+
25
+ - name: Get latest version
26
+ id: get_version
27
+ run: |
28
+ version=$(curl -s https://pypi.org/pypi/judgeval/json | jq -r .info.version)
29
+ echo "latest_version=$version" >> $GITHUB_OUTPUT
30
+
31
+ - name: Bump version and create new tag
32
+ id: bump_tag
33
+ run: |
34
+ latest_version=${{ steps.get_version.outputs.latest_version }}
35
+ echo "Latest version: $latest_version"
36
+
37
+ # Extract version numbers
38
+ IFS='.' read -r major minor patch <<< "$latest_version"
39
+
40
+ # Bump patch version
41
+ patch=$((patch + 1))
42
+ new_version="$major.$minor.$patch"
43
+
44
+ echo "New version: $new_version"
45
+ echo "new_version=$new_version" >> $GITHUB_OUTPUT
46
+
47
+ git config user.name "github-actions"
48
+ git config user.email "github-actions@github.com"
49
+ git tag v$new_version
50
+ git push origin v$new_version
51
+
52
+ - name: Create GitHub release
53
+ uses: softprops/action-gh-release@v2
54
+ with:
55
+ tag_name: v${{ steps.bump_tag.outputs.new_version }}
56
+ generate_release_notes: true
57
+ body: |
58
+ You can find this package release on PyPI: https://pypi.org/project/judgeval/${{ steps.bump_tag.outputs.new_version }}/
59
+ env:
60
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
61
+
62
+ - name: Bump pyproject.toml version
63
+ run: |
64
+ python update_version.py ${{ steps.bump_tag.outputs.new_version }}
65
+
66
+ - name: Build PyPI package
67
+ run: |
68
+ python -m pip install --upgrade build
69
+ python -m build
70
+
71
+ - name: Create PyPI release
72
+ run: |
73
+ python -m pip install --upgrade twine
74
+ python -m twine upload --repository pypi -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/*
75
+
76
+ cleanup:
77
+ needs: release
78
+ if: failure()
79
+ runs-on: ubuntu-latest
80
+ steps:
81
+ - name: Checkout code
82
+ uses: actions/checkout@v4
83
+
84
+ - name: Authenticate GitHub CLI
85
+ run: echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token
86
+
87
+ - name: Delete tag and release
88
+ run: |
89
+ gh release delete v${{ needs.release.outputs.new_version }} --yes
90
+ git push --delete origin v${{ needs.release.outputs.new_version }}
91
+ env:
92
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}