eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Abstract Base Class for Forkable Resources in the Agent Evaluation Framework V2.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import ( # Callable removed as not directly used in ABC signatures
|
|
7
|
+
Any,
|
|
8
|
+
Dict,
|
|
9
|
+
List,
|
|
10
|
+
Optional,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ForkableResource(ABC):
    """
    Contract for an environment resource used during agent evaluation.

    A concrete resource must be forkable (cheaply duplicated per rollout),
    checkpointable (its state can be captured and restored) and interactive
    (it accepts actions and exposes observations and tool specifications).
    Every method is a coroutine and must be overridden by subclasses.
    """

    @abstractmethod
    async def setup(self, config: Dict[str, Any]) -> None:
        """
        Prepare the resource from ``config`` before its first use or fork.

        Typical work done here: creating a database schema, laying out a base
        file system, or starting a base Docker container.
        """

    @abstractmethod
    async def fork(self) -> "ForkableResource":
        """
        Return a new, independent resource holding a copy of the current state.

        The returned instance is typically used as the episode resource for a
        single agent rollout.
        """

    @abstractmethod
    async def checkpoint(self) -> Any:
        """
        Return a serializable snapshot of the current state.

        The snapshot format (bytes, dict, path to a file, ...) is chosen by the
        implementation; the only requirement is that ``restore()`` accepts it.
        """

    @abstractmethod
    async def restore(self, state_data: Any) -> None:
        """
        Load a snapshot previously produced by ``checkpoint()``.

        Afterwards the resource should be in the state it had when the
        snapshot was taken.
        """

    @abstractmethod
    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
        """
        Run the action ``action_name`` with ``action_params`` on the resource.

        Executing an action typically changes the resource's state. The return
        value is the observation or result of the action; its shape depends on
        the resource and on the action.
        """

    @abstractmethod
    async def get_observation(self) -> Any:
        """
        Return what the agent can currently observe of the resource.

        The observation format is resource-specific.
        """

    @abstractmethod
    async def get_tools_spec(self) -> List[Dict[str, Any]]:
        """
        Return the tool specifications available in the current state.

        Specifications are dictionaries (e.g. in OpenAI function-calling
        format). The list may be dynamic and change as the state changes.
        """

    @abstractmethod
    async def close(self) -> None:
        """
        Release everything the resource holds.

        Examples: closing database connections, stopping containers, deleting
        temporary files or directories.
        """
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resource Pool for the Agent Evaluation Framework V2.
|
|
3
|
+
Manages and allocates resources to specific tasks.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any, Dict, List, Optional, Set, Type
|
|
9
|
+
|
|
10
|
+
from .resource_abc import ForkableResource
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ResourcePool:
    """
    Manages a pool of ForkableResources that can be shared and reused across tasks.

    The pool keeps a two-way index between resources and the tasks using them:
    ``resource_tasks`` maps a resource ID to its task IDs and ``task_resources``
    maps a task ID to its resource IDs. A resource is closed and evicted when
    its last task is cleaned up, or when it is closed explicitly.
    """

    def __init__(self):
        """Initialize an empty resource pool."""
        self.resources: Dict[str, ForkableResource] = {}  # resource_id -> resource instance
        self.resource_tasks: Dict[str, Set[str]] = {}  # resource_id -> set of task_ids using it
        self.task_resources: Dict[str, Set[str]] = {}  # task_id -> set of resource_ids used by it
        self.logger = logging.getLogger("ResourcePool")

    async def create_resource(
        self,
        resource_type: Type[ForkableResource],
        resource_id: str,
        config: Dict[str, Any],
        task_id: Optional[str] = None,
    ) -> Optional[ForkableResource]:
        """
        Create a new resource of the specified type and add it to the pool.

        If ``resource_id`` is already registered, the existing instance is
        returned (and associated with ``task_id`` when given); ``resource_type``
        and ``config`` are ignored in that case.

        Args:
            resource_type: The ForkableResource class to instantiate
            resource_id: Unique identifier for the resource
            config: Configuration dictionary for the resource setup
            task_id: Optional task ID to associate with this resource

        Returns:
            The created (or already existing) resource, or None if
            instantiation or setup fails
        """
        if resource_id in self.resources:
            self.logger.warning(f"Resource '{resource_id}' already exists in the pool. Returning existing instance.")
            if task_id:
                self._associate_task_with_resource(task_id, resource_id)
            return self.resources[resource_id]

        resource: Optional[ForkableResource] = None
        try:
            resource = resource_type()
            await resource.setup(config)
        except Exception as e:
            self.logger.error(f"Failed to create resource '{resource_id}': {e}")
            # setup() may have acquired something (container, connection, temp
            # dir) before failing; give the instance a chance to release it.
            if resource is not None:
                try:
                    await resource.close()
                except Exception as close_error:
                    self.logger.debug(
                        f"Cleanup after failed setup of resource '{resource_id}' also failed: {close_error}"
                    )
            return None

        # Registration happens only after a successful setup, so a failure can
        # never leave a half-initialized resource in the pool.
        self.resources[resource_id] = resource
        self.resource_tasks[resource_id] = set()

        if task_id:
            self._associate_task_with_resource(task_id, resource_id)

        # resource_type may be any zero-argument callable; not all have __name__.
        type_name = getattr(resource_type, "__name__", repr(resource_type))
        self.logger.info(f"Created resource '{resource_id}' of type {type_name}")
        return resource

    def get_resource(self, resource_id: str) -> Optional[ForkableResource]:
        """
        Get a resource from the pool by its ID.

        Args:
            resource_id: The identifier of the resource to retrieve

        Returns:
            The resource instance or None if not found
        """
        return self.resources.get(resource_id)

    def _associate_task_with_resource(self, task_id: str, resource_id: str) -> None:
        """
        Associate a task with a resource for tracking purposes.

        Logs a warning and does nothing when the resource is not in the pool.

        Args:
            task_id: The task identifier
            resource_id: The resource identifier
        """
        if resource_id not in self.resources:
            self.logger.warning(f"Cannot associate task '{task_id}' with non-existent resource '{resource_id}'.")
            return

        # Keep both directions of the index in sync.
        self.resource_tasks.setdefault(resource_id, set()).add(task_id)
        self.task_resources.setdefault(task_id, set()).add(resource_id)

        self.logger.debug(f"Associated task '{task_id}' with resource '{resource_id}'.")

    async def fork_resource_for_task(self, resource_id: str, task_id: str) -> Optional[ForkableResource]:
        """
        Fork a resource for a specific task.

        Args:
            resource_id: The identifier of the resource to fork
            task_id: The task that will use the forked resource (used for logging only)

        Returns:
            The forked resource instance or None if forking fails
        """
        base_resource = self.get_resource(resource_id)
        # Identity check: a resource that happens to be falsy (e.g. defines
        # __len__) must not be mistaken for a missing one.
        if base_resource is None:
            self.logger.error(f"Cannot fork non-existent resource '{resource_id}'.")
            return None

        try:
            forked_resource = await base_resource.fork()
        except Exception as e:
            self.logger.error(f"Failed to fork resource '{resource_id}' for task '{task_id}': {e}")
            return None

        # We don't track forked resources in the pool, as they are typically
        # short-lived and managed by the Orchestrator
        self.logger.debug(f"Forked resource '{resource_id}' for task '{task_id}'.")
        return forked_resource

    async def cleanup_task_resources(self, task_id: str) -> None:
        """
        Clean up all resources associated with a task.

        The task is detached from each of its resources; a resource left with
        no tasks is closed and removed from the pool.

        Args:
            task_id: The task identifier
        """
        if task_id not in self.task_resources:
            self.logger.debug(f"No resources to clean up for task '{task_id}'.")
            return

        # Iterate over a snapshot: close_resource() mutates the tracking dicts.
        for resource_id in list(self.task_resources[task_id]):
            users = self.resource_tasks.get(resource_id)
            if users is None:
                continue  # resource was already closed elsewhere
            users.discard(task_id)

            # If resource has no more tasks, close and remove it
            if not users:
                await self.close_resource(resource_id)

        # Clear task's resource tracking
        self.task_resources.pop(task_id, None)
        self.logger.info(f"Cleaned up resources for task '{task_id}'.")

    async def close_resource(self, resource_id: str) -> None:
        """
        Close a resource and remove it from the pool.

        The resource is removed from the pool even when its ``close()`` raises,
        so a broken instance is never handed out again by ``create_resource``.
        Links from tasks to the resource are dropped as well, so a later
        ``cleanup_task_resources`` cannot close a newer resource that reuses
        the same ID.

        Args:
            resource_id: The identifier of the resource to close
        """
        if resource_id not in self.resources:
            self.logger.debug(f"Cannot close non-existent resource '{resource_id}'.")
            return

        resource = self.resources[resource_id]
        closed_cleanly = True
        try:
            await resource.close()
        except Exception as e:
            closed_cleanly = False
            self.logger.error(f"Error closing resource '{resource_id}': {e}")

        # Reached on success and on ordinary errors (not on cancellation).
        self.resources.pop(resource_id, None)
        for using_task in self.resource_tasks.pop(resource_id, ()):
            linked = self.task_resources.get(using_task)
            if linked is None:
                continue
            linked.discard(resource_id)
            if not linked:
                del self.task_resources[using_task]

        if closed_cleanly:
            self.logger.info(f"Closed and removed resource '{resource_id}' from pool.")

    async def close_all_resources(self) -> None:
        """Close all resources in the pool and clear it."""
        for resource_id in list(self.resources):
            await self.close_resource(resource_id)

        # Clear all tracking dictionaries
        self.resources.clear()
        self.resource_tasks.clear()
        self.task_resources.clear()
        self.logger.info("Closed all resources in the pool.")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resources for the Reward Kit Agent V2 Framework.
|
|
3
|
+
|
|
4
|
+
This package contains concrete implementations of the ForkableResource ABC.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .bfcl_sim_api_resource import BFCLSimAPIResource
|
|
8
|
+
from .docker_resource import DockerResource
|
|
9
|
+
from .filesystem_resource import FileSystemResource
|
|
10
|
+
|
|
11
|
+
# HTTP Rollout Protocol types for server implementations
|
|
12
|
+
from .http_rollout_protocol import (
|
|
13
|
+
EndEpisodeRequest,
|
|
14
|
+
EndEpisodeResponse,
|
|
15
|
+
GameObservation,
|
|
16
|
+
HealthResponse,
|
|
17
|
+
HttpRolloutConfig,
|
|
18
|
+
StartEpisodeRequest,
|
|
19
|
+
StartEpisodeResponse,
|
|
20
|
+
StepRequest,
|
|
21
|
+
StepResponse,
|
|
22
|
+
)
|
|
23
|
+
from .http_rollout_resource import HttpRolloutResource
|
|
24
|
+
from .python_state_resource import PythonStateResource
|
|
25
|
+
from .sql_resource import SQLResource
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"PythonStateResource",
|
|
29
|
+
"SQLResource",
|
|
30
|
+
"FileSystemResource",
|
|
31
|
+
"DockerResource",
|
|
32
|
+
"BFCLSimAPIResource",
|
|
33
|
+
"HttpRolloutResource",
|
|
34
|
+
# HTTP Rollout Protocol
|
|
35
|
+
"HttpRolloutConfig",
|
|
36
|
+
"StartEpisodeRequest",
|
|
37
|
+
"StartEpisodeResponse",
|
|
38
|
+
"StepRequest",
|
|
39
|
+
"StepResponse",
|
|
40
|
+
"EndEpisodeRequest",
|
|
41
|
+
"EndEpisodeResponse",
|
|
42
|
+
"HealthResponse",
|
|
43
|
+
"GameObservation",
|
|
44
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# BFCL environment implementations
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Implementation of GorillaFileSystem."""
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Optional, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class File:
    """A file in the Gorilla File System.

    Equality is structural: two files are equal when their name and content
    match. The parent link is ignored, so comparing never walks up the tree.
    """

    def __init__(self, name: str = "", content: str = "", parent: Optional["Directory"] = None):
        self.name: str = name
        self.content: str = content
        self.parent: Optional["Directory"] = parent

    def __repr__(self):
        return f"<File: {self.name}, Content: '{self.content[:20]}{'...' if len(self.content) > 20 else ''}'>"

    def __eq__(self, other):
        if not isinstance(other, File):
            # Let Python try the reflected comparison; the final result for
            # unrelated types is still False.
            return NotImplemented
        return self.name == other.name and self.content == other.content


class Directory:
    """A directory in the Gorilla File System.

    ``contents`` maps entry names to File or Directory objects. Equality is
    structural (name plus recursively compared contents); the parent link is
    ignored.
    """

    def __init__(
        self,
        name: str = "",
        parent: Optional["Directory"] = None,
        contents: Optional[Dict[str, Union[File, "Directory"]]] = None,
    ):
        self.name: str = name
        self.parent: Optional["Directory"] = parent
        self.contents: Dict[str, Union[File, Directory]] = contents or {}

    def __repr__(self):
        parent_name = self.parent.name if self.parent else None
        return f"<Directory: {self.name}, Parent: {parent_name}, Keys: {list(self.contents.keys())}>"

    def __eq__(self, other):
        if not isinstance(other, Directory):
            return NotImplemented
        return self.name == other.name and self.contents == other.contents


class GorillaFileSystem:
    """A file system for BFCL evaluation.

    The tree is rooted at a directory named ``workspace``. Commands (``ls``,
    ``cd``, ``mkdir``, ``cat``, ``mv``, ``grep``, ``sort``, ``diff``) operate
    relative to ``current_dir`` and report their outcome as dictionaries
    instead of raising.
    """

    def __init__(self):
        self.root: Directory = Directory(name="workspace", parent=None)
        self.current_dir: Directory = self.root
        self.long_context: bool = False

    def _load_scenario(self, config: Dict):
        """Load the file system from configuration.

        ``config["root"]`` may use either the typed format (each node has a
        ``"type"`` key) or the YAML format (nodes recognised by their
        ``"contents"`` / ``"content"`` keys). On any loading error the file
        system is reset to an empty workspace. ``config["long_context"]``,
        when present, is copied to ``self.long_context``.
        """
        # self.root and self.current_dir are already initialized.
        # We will only overwrite them if loading is successful.
        if "root" in config:
            try:
                loaded_dir: Optional[Directory] = None
                root_config = config["root"]
                if isinstance(root_config, dict) and "type" in root_config:
                    loaded_dir = self._load_directory_from_config("workspace", None, root_config)
                elif isinstance(root_config, dict):  # Assuming if not 'type', it's the other YAML format
                    loaded_dir = self._load_directory_from_yaml_config("workspace", None, root_config)

                # If loaded_dir is None, root and current_dir keep their current values.
                if loaded_dir is not None:
                    self.root = loaded_dir
                    self.current_dir = self.root
            except Exception as e:
                print(f"Error loading GorillaFileSystem scenario: {e}")
                # If an exception occurred during loading, reset to a fresh default.
                self.root = Directory(name="workspace", parent=None)
                self.current_dir = self.root

        if "long_context" in config:
            self.long_context = config.get("long_context", False)

    def _load_directory_from_config(self, name: str, parent: Optional[Directory], config: Dict) -> Optional[Directory]:
        """Create a directory structure from the typed configuration format.

        Returns None when ``config`` does not describe a directory. Child
        entries whose type is neither ``"directory"`` nor ``"file"`` are skipped.
        """
        if config.get("type") != "directory":
            return None

        directory = Directory(name=name, parent=parent)
        contents: Dict[str, Union[File, Directory]] = {}
        for item_name, item_config in config.get("contents", {}).items():
            item_type = item_config.get("type")
            if item_type == "directory":
                loaded_item = self._load_directory_from_config(item_name, directory, item_config)
                if loaded_item is not None:
                    contents[item_name] = loaded_item
            elif item_type == "file":
                contents[item_name] = File(
                    name=item_name,
                    content=item_config.get("content", ""),
                    parent=directory,
                )
        directory.contents = contents
        return directory

    def _load_directory_from_yaml_config(self, name: str, parent: Optional[Directory], config: Dict) -> Directory:
        """Create a directory structure from the YAML configuration format.

        A child is a directory when it has a ``"contents"`` key, a file when it
        has a ``"content"`` key; otherwise its ``"type"`` decides. Children that
        are not dictionaries or match none of these are skipped.
        """
        directory = Directory(name=name, parent=parent)
        contents: Dict[str, Union[File, Directory]] = {}

        # A missing or non-dict "contents" is treated as an empty directory.
        config_contents = config.get("contents", {})
        if not isinstance(config_contents, dict):
            config_contents = {}

        for item_name, item_config in config_contents.items():
            if not isinstance(item_config, dict):
                continue

            # Key-based heuristics take precedence over the explicit type.
            if "contents" in item_config:
                is_directory = True
            elif "content" in item_config:
                is_directory = False
            elif item_config.get("type") == "directory":
                is_directory = True
            elif item_config.get("type") == "file":
                is_directory = False
            else:
                continue

            if is_directory:
                contents[item_name] = self._load_directory_from_yaml_config(item_name, directory, item_config)
            else:
                contents[item_name] = File(
                    name=item_name,
                    content=item_config.get("content", ""),
                    parent=directory,
                )
        directory.contents = contents
        return directory

    def ls(self, path: Optional[str] = None) -> Dict:
        """List directory contents.

        Lists ``path`` when given, otherwise the current directory. Returns
        ``{"current_directory": name, "contents": {entry: {"type": ...}}}`` or
        ``{"error": ...}`` when ``path`` is not an existing directory.
        """
        target_dir: Directory = self.current_dir
        if path:
            found_node = self._find_path(path)
            if not isinstance(found_node, Directory):
                return {"error": f"Path not found or not a directory: {path}"}
            target_dir = found_node

        items: Dict[str, Dict[str, str]] = {}
        for name, item in target_dir.contents.items():
            if isinstance(item, Directory):
                items[name] = {"type": "directory"}
            elif isinstance(item, File):
                items[name] = {"type": "file"}

        return {"current_directory": target_dir.name, "contents": items}

    def cd(self, folder: str) -> Dict:
        """Change current directory.

        ``folder`` is either ``".."`` or the name of a direct child directory;
        multi-component paths are not resolved here.
        """
        if folder == "..":
            parent_dir = self.current_dir.parent
            if parent_dir is None:  # no parent means we are at the root
                return {"status": "error", "message": "Already at root directory"}
            self.current_dir = parent_dir
            return {
                "status": "success",
                "message": f"Changed to {self.current_dir.name}",
            }

        target_item = self.current_dir.contents.get(folder)
        if isinstance(target_item, Directory):
            self.current_dir = target_item
            return {"status": "success", "message": f"Changed to {folder}"}

        return {"status": "error", "message": f"Directory {folder} not found"}

    def mkdir(self, dir_name: str) -> Dict:
        """Create a new directory in the current directory.

        Fails when an entry (file or directory) named ``dir_name`` already exists.
        """
        if dir_name in self.current_dir.contents:
            return {
                "status": "error",
                "message": f"Directory {dir_name} already exists",
            }

        self.current_dir.contents[dir_name] = Directory(name=dir_name, parent=self.current_dir)
        return {"status": "success", "message": f"Created directory {dir_name}"}

    def cat(self, file_name: str) -> Dict:
        """Display the contents of a file in the current directory."""
        item = self.current_dir.contents.get(file_name)
        if not isinstance(item, File):
            return {"status": "error", "message": f"File {file_name} not found"}

        return {
            "status": "success",
            "content": item.content,
        }

    @staticmethod
    def _is_within(node: Optional[Directory], ancestor: Directory) -> bool:
        """Return True if ``node`` is ``ancestor`` or lies somewhere below it."""
        # Identity, not ==: Directory equality is structural and would match
        # unrelated directories that merely look alike.
        while node is not None:
            if node is ancestor:
                return True
            node = node.parent
        return False

    def mv(self, source: str, destination: str) -> Dict:
        """Move or rename a file or directory.

        ``source`` names an entry of the current directory. ``destination`` is
        the new name, optionally prefixed by the path of an existing target
        directory (``"dir/new_name"``, ``"/new_name"``). The move is refused when
        the source is missing, the destination name is empty / ``.`` / ``..``,
        the target directory does not exist, a directory would be moved into
        its own subtree, or the destination already exists.
        """
        source_item = self.current_dir.contents.get(source)
        if source_item is None:
            return {"status": "error", "message": f"Source {source} not found"}

        target_dir_path, separator, dest_name = destination.rpartition("/")
        if dest_name in ("", ".", ".."):
            # Such a name could never be addressed again through a path.
            return {
                "status": "error",
                "message": f"Destination {destination!r} does not name a file or directory",
            }
        if separator and not target_dir_path:
            # "/name": the target directory is the root, not the current dir.
            target_dir_path = "/"

        final_target_dir: Directory = self.current_dir
        if target_dir_path:  # If destination includes a path
            found_dir = self._find_path(target_dir_path)
            if not isinstance(found_dir, Directory):
                return {
                    "status": "error",
                    "message": f"Target directory path {target_dir_path} not found or not a directory",
                }
            final_target_dir = found_dir

        if isinstance(source_item, Directory) and self._is_within(final_target_dir, source_item):
            # Moving a directory below itself would detach it from the root and
            # create a parent/child cycle.
            return {
                "status": "error",
                "message": f"Cannot move {source} into itself or one of its subdirectories",
            }

        if dest_name in final_target_dir.contents:
            return {
                "status": "error",
                "message": f"Destination {destination} already exists",
            }

        # Move item
        del self.current_dir.contents[source]  # Remove from old location
        source_item.name = dest_name
        source_item.parent = final_target_dir
        final_target_dir.contents[dest_name] = source_item

        return {"status": "success", "message": f"Moved {source} to {destination}"}

    def grep(self, file_name: str, pattern: str) -> Dict:
        """Return the lines of a file that contain ``pattern`` as a plain substring."""
        item = self.current_dir.contents.get(file_name)
        if not isinstance(item, File):
            return {"status": "error", "message": f"File {file_name} not found"}

        matches = [line for line in item.content.split("\n") if pattern in line]
        return {"status": "success", "matches": matches, "count": len(matches)}

    def sort(self, file_name: str) -> Dict:
        """Sort the lines of a file in place (lexicographic order)."""
        item = self.current_dir.contents.get(file_name)
        if not isinstance(item, File):
            return {"status": "error", "message": f"File {file_name} not found"}

        item.content = "\n".join(sorted(item.content.split("\n")))
        return {"status": "success", "message": f"Sorted {file_name}"}

    def diff(self, file_name1: str, file_name2: str) -> Dict:
        """Compare two files of the current directory line by line.

        ``differences`` lists every 1-based line number at which the files
        disagree; a line missing from the shorter file is reported as None.
        """
        item1 = self.current_dir.contents.get(file_name1)
        item2 = self.current_dir.contents.get(file_name2)

        if not isinstance(item1, File):
            return {"status": "error", "message": f"File {file_name1} not found"}
        if not isinstance(item2, File):
            return {"status": "error", "message": f"File {file_name2} not found"}

        if item1.content == item2.content:
            return {
                "status": "success",
                "message": "Files are identical",
                "differences": [],
            }

        lines1 = item1.content.split("\n")
        lines2 = item2.content.split("\n")
        differences = []
        for i in range(max(len(lines1), len(lines2))):
            line1_val = lines1[i] if i < len(lines1) else None
            line2_val = lines2[i] if i < len(lines2) else None
            if line1_val != line2_val:
                differences.append({"line": i + 1, "file1": line1_val, "file2": line2_val})
        return {
            "status": "success",
            "message": f"Found {len(differences)} differences",
            "differences": differences,
        }

    def _find_path(self, path: str) -> Optional[Union[File, Directory]]:
        """Helper to find a File or Directory by path. Returns None if not found.

        Paths starting with ``/`` are resolved from the root, all others from
        the current directory. Empty components (repeated or trailing slashes)
        and ``.`` are ignored; ``..`` moves to the parent and yields None when
        used at the root. Descending through a file yields None.
        """
        if not path or path == ".":
            return self.current_dir

        node: Union[File, Directory] = self.root if path.startswith("/") else self.current_dir
        for part_name in path.split("/"):
            if part_name in ("", "."):
                continue
            if not isinstance(node, Directory):
                return None  # cannot step through a file
            if part_name == "..":
                if node.parent is None:  # tried to move up from the root
                    return None
                node = node.parent
                continue
            child = node.contents.get(part_name)
            if child is None:
                return None
            node = child
        return node
|