eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,36 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, Dict, List
4
+
5
# Module-level logger named after this module; load_jsonl reports its errors through it.
logger = logging.getLogger(__name__)
6
+
7
+
8
def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL file and return the JSON value parsed from each non-blank line.

    Blank (whitespace-only) lines are skipped silently, since they carry no record.
    Lines that are not valid JSON are logged as errors and skipped; every line that
    did parse is still returned.

    Args:
        file_path: Path to the JSONL file (read as UTF-8).

    Returns:
        A list with one parsed object per valid line, in file order.
        Returns an empty list if the file is not found or if an unexpected error
        occurs while reading it (the error is logged in both cases).
    """
    data: List[Dict[str, Any]] = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                stripped = line.strip()
                if not stripped:
                    # json.loads("") raises JSONDecodeError; an empty line is not
                    # a malformed record, so do not report it as one.
                    continue
                try:
                    data.append(json.loads(stripped))
                except json.JSONDecodeError as e:
                    # Deliberately lenient: log the bad line and keep going so the
                    # caller still receives the successfully parsed lines.
                    logger.error(
                        "Error decoding JSON on line %d in %s: %s - Line: '%s'",
                        line_number,
                        file_path,
                        e,
                        stripped,
                    )
    except FileNotFoundError:
        logger.error("File not found: %s", file_path)
        return []
    except Exception as e:
        # Any other read failure (permissions, decoding, ...) discards partial data.
        logger.error("An unexpected error occurred while reading %s: %s", file_path, e)
        return []
    return data
@@ -0,0 +1,180 @@
1
+ import os
2
+ from typing import Dict, Literal, Optional
3
+
4
+ import yaml
5
+ from pydantic import BaseModel, ValidationError
6
+
7
# File name that find_config_file() looks for in each directory it visits.
CONFIG_FILE_NAME = "rewardkit.yaml"
8
+
9
+ # --- Pydantic Models for Configuration Structure ---
10
+
11
+
12
class GCPCloudRunConfig(BaseModel):
    # Settings for the "gcp-cloud-run" deployment target. Every field is optional,
    # so an empty `gcp_cloud_run:` section (or none at all) is valid.
    project_id: Optional[str] = None  # Default will be applied in deploy_command if not set
    region: Optional[str] = None  # Default will be applied in deploy_command if not set
    artifact_registry_repository: Optional[str] = None  # Default will be applied in deploy_command
    # Template containing an `{evaluator_id}` placeholder.
    # NOTE(review): presumably filled in by the deploy code - confirm against caller.
    service_name_template: Optional[str] = "rewardeval-{evaluator_id}"
    default_auth_mode: Optional[Literal["api-key", "iam", "mtls-client-auth"]] = (
        "api-key"  # Default auth mode if using GCP target and not specified
    )
    secrets: Optional[Dict[str, str]] = {}  # Maps ENV_VAR_NAME to GCP Secret Manager ID
21
+
22
+
23
class AWSLambdaConfig(BaseModel):
    # Settings for the "aws-lambda" deployment target. Every field is optional.
    region: Optional[str] = None
    # Template containing an `{evaluator_id}` placeholder.
    # NOTE(review): presumably filled in by the deploy code - confirm against caller.
    function_name_template: Optional[str] = "rewardeval-{evaluator_id}"
    # Same set of allowed auth modes as GCPCloudRunConfig.default_auth_mode.
    default_auth_mode: Optional[Literal["api-key", "iam", "mtls-client-auth"]] = "api-key"
    secrets: Optional[Dict[str, str]] = {}  # Maps ENV_VAR_NAME to AWS Secret ARN
28
+
29
+
30
class RewardKitConfig(BaseModel):
    # Root model for rewardkit.yaml. Constructing it with no arguments yields the
    # all-defaults configuration that load_config() falls back to.
    # Deployment target used when none is specified; defaults to "fireworks".
    default_deployment_target: Optional[Literal["gcp-cloud-run", "aws-lambda", "fireworks", "local"]] = "fireworks"
    # Per-target sections; each defaults to that section's own default values.
    gcp_cloud_run: Optional[GCPCloudRunConfig] = GCPCloudRunConfig()
    aws_lambda: Optional[AWSLambdaConfig] = AWSLambdaConfig()
    evaluator_endpoint_keys: Optional[Dict[str, str]] = (
        {}  # Stores generated API keys for self-hosted evaluator endpoints
    )
37
+
38
+
39
# --- Global variables holding the loaded configuration (module-level cache) ---
# _loaded_config: the configuration most recently produced by load_config(), or None
#   if nothing has been loaded yet.
# _config_file_path: the path load_config() last read (or tried and failed to parse);
#   None when no file was found and defaults were used.
_loaded_config: Optional[RewardKitConfig] = None
_config_file_path: Optional[str] = None
42
+
43
+
44
def find_config_file(start_path: Optional[str] = None) -> Optional[str]:
    """
    Locate rewardkit.yaml by walking up the directory tree.

    The search begins at start_path (the current working directory when omitted)
    and checks each ancestor in turn. Returns the path of the first match, or
    None once the filesystem root has been checked without finding the file.
    """
    directory = os.path.abspath(os.getcwd() if start_path is None else start_path)
    candidate = os.path.join(directory, CONFIG_FILE_NAME)

    while not os.path.isfile(candidate):
        parent = os.path.dirname(directory)
        if parent == directory:
            # dirname() of the root is the root itself: nowhere left to look.
            return None
        directory = parent
        candidate = os.path.join(directory, CONFIG_FILE_NAME)

    return candidate
61
+
62
+
63
def load_config(config_path: Optional[str] = None) -> RewardKitConfig:
    """
    Load the rewardkit.yaml configuration and cache it at module level.

    Resolution order:
      * With an explicit config_path, that file is loaded unless it is the same
        path as the currently cached one, in which case the cache is returned.
      * Without a path, the cached configuration is returned if one exists;
        otherwise rewardkit.yaml is searched for in the CWD and its parents.

    This function does not raise on a bad or missing file. If no file is found,
    the file is missing, or it cannot be parsed or validated, a default
    RewardKitConfig is cached and returned. Parse and validation problems are
    printed to stdout, and the offending path is remembered so the broken file
    is not re-read on every call.

    Args:
        config_path: Optional path to a configuration file.

    Returns:
        The loaded (or default) RewardKitConfig.
    """
    global _loaded_config, _config_file_path

    if not config_path:
        # No explicit path: prefer the cache, otherwise go looking for a file.
        if _loaded_config is not None:
            return _loaded_config
        config_path = find_config_file()
        if not config_path:
            _loaded_config = RewardKitConfig()
            _config_file_path = None
            return _loaded_config

    if _loaded_config is not None and config_path == _config_file_path:
        return _loaded_config

    try:
        # YAML files are UTF-8 by convention; do not depend on the platform's
        # default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            raw_config = yaml.safe_load(f)

        # An empty file parses to None, which means "all defaults".
        loaded = RewardKitConfig() if raw_config is None else RewardKitConfig(**raw_config)
    except FileNotFoundError:
        # A missing file is treated like "no config": defaults, nothing remembered.
        _loaded_config = RewardKitConfig()
        _config_file_path = None
        return _loaded_config
    except yaml.YAMLError as e:
        print(f"Error parsing YAML from {config_path}: {e}")
    except ValidationError as e:
        print(f"Error validating configuration from {config_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while loading configuration from {config_path}: {e}")
    else:
        _loaded_config = loaded
        _config_file_path = config_path
        return _loaded_config

    # Shared fallback for every reported failure above: use defaults, but record
    # the path so it doesn't try to reload this broken file again.
    _loaded_config = RewardKitConfig()
    _config_file_path = config_path
    return _loaded_config
117
+
118
+
119
def get_config() -> RewardKitConfig:
    """
    Return the cached configuration, loading it via load_config() on first use.
    """
    return load_config() if _loaded_config is None else _loaded_config
126
+
127
+
128
+ # Example usage (can be removed or kept for testing module directly)
129
# Example usage (can be removed or kept for testing module directly)
if __name__ == "__main__":
    import tempfile

    # Dummy rewardkit.yaml content used for the demo.
    dummy_config_content = """
default_deployment_target: gcp-cloud-run

gcp_cloud_run:
  project_id: "test-gcp-project"
  region: "europe-west1"
  default_auth_mode: "iam"
  secrets:
    DB_PASSWORD: "projects/test-gcp-project/secrets/db-password/versions/latest"

aws_lambda:
  region: "eu-north-1"

evaluator_endpoint_keys:
  my_test_eval: "dummy_key_for_test_eval"
"""
    # Run the demo inside a scratch directory so that a real rewardkit.yaml in
    # the user's working directory is never overwritten or deleted.
    original_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as scratch_dir:
        os.chdir(scratch_dir)
        try:
            dummy_file_path = os.path.join(os.getcwd(), CONFIG_FILE_NAME)
            with open(dummy_file_path, "w", encoding="utf-8") as f:
                f.write(dummy_config_content)

            print(f"Created dummy {CONFIG_FILE_NAME} for testing.")

            config = get_config()
            print("\nLoaded configuration:")
            print(f" Default Target: {config.default_deployment_target}")
            if config.gcp_cloud_run:
                print(f" GCP Project ID: {config.gcp_cloud_run.project_id}")
                print(f" GCP Region: {config.gcp_cloud_run.region}")
                print(f" GCP Auth Mode: {config.gcp_cloud_run.default_auth_mode}")
                print(f" GCP Secrets: {config.gcp_cloud_run.secrets}")
            if config.aws_lambda:
                print(f" AWS Region: {config.aws_lambda.region}")
            print(f" Endpoint Keys: {config.evaluator_endpoint_keys}")

            # Test finding config
            print("\nTesting find_config_file():")
            found_path = find_config_file()
            print(f" Found at: {found_path}")

            # Clean up dummy file
            os.remove(dummy_file_path)
            print(f"\nRemoved dummy {CONFIG_FILE_NAME}.")

            # Test loading with no file
            _loaded_config = None  # Reset cache
            _config_file_path = None
            print("\nTesting get_config() with no file present:")
            config_no_file = get_config()
            print(f" Default Target (no file): {config_no_file.default_deployment_target}")
            print(f" GCP Config (no file): {config_no_file.gcp_cloud_run}")
        finally:
            # Leave the scratch directory before it is removed, and restore the
            # caller's working directory even if the demo fails part-way.
            os.chdir(original_cwd)
@@ -0,0 +1 @@
1
+ """Dataset loading and processing utilities."""