isage-benchmark-agent 0.1.0.1__cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration Loader for Agent Benchmark Experiments
|
|
3
|
+
|
|
4
|
+
Provides utilities for loading and parsing YAML configuration files
|
|
5
|
+
with environment variable substitution.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
import yaml
|
|
14
|
+
|
|
15
|
+
from sage.benchmark.benchmark_agent.experiments.base_experiment import (
|
|
16
|
+
ExperimentConfig,
|
|
17
|
+
create_config,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ConfigLoader:
    """
    Loader for experiment YAML configurations.

    Supports:
    - Environment variable substitution (${VAR} or $VAR)
    - Special ${PROJECT_ROOT} variable
    - Type-safe config object creation
    """

    # Matches ${VAR} (group 1) or a bare $VAR (group 2); compiled once.
    _VAR_PATTERN = re.compile(r"\$\{([^}]+)\}|\$([A-Za-z_][A-Za-z0-9_]*)")

    @staticmethod
    def _find_project_root() -> Path:
        """Find project root directory (where .git exists).

        Walks upward from the current working directory; falls back to
        the current working directory if no .git directory is found.
        """
        current = Path.cwd()
        while current.parent != current:
            if (current / ".git").exists():
                return current
            current = current.parent
        return Path.cwd()

    @staticmethod
    def _expand_vars(value: Any, context: Optional[dict[str, str]] = None) -> Any:
        """
        Recursively expand environment variables in config values.

        Args:
            value: Config value (can be str, dict, list, etc.)
            context: Additional variable context. The caller's dict is
                never mutated; an internal copy is used.

        Returns:
            Value with expanded variables. Unknown variables are left
            as-is (the literal ``$VAR`` / ``${VAR}`` text is kept).
        """
        # Copy so the caller's context dict is not mutated as a side effect
        # (the original implementation injected PROJECT_ROOT into it).
        context = dict(context) if context else {}

        # Make ${PROJECT_ROOT} always resolvable.
        context.setdefault("PROJECT_ROOT", str(ConfigLoader._find_project_root()))

        if isinstance(value, str):

            def replacer(match):
                var_name = match.group(1) or match.group(2)
                # Context takes precedence over the process environment;
                # names found in neither expand to their original text.
                return context.get(var_name, os.environ.get(var_name, match.group(0)))

            return ConfigLoader._VAR_PATTERN.sub(replacer, value)

        if isinstance(value, dict):
            return {k: ConfigLoader._expand_vars(v, context) for k, v in value.items()}

        if isinstance(value, list):
            return [ConfigLoader._expand_vars(item, context) for item in value]

        return value

    @classmethod
    def load_yaml(cls, config_path) -> dict[str, Any]:
        """
        Load and parse YAML config file.

        Args:
            config_path: Path to YAML config file (str or Path)

        Returns:
            Parsed config dictionary with expanded variables. An empty
            or comment-only YAML file yields an empty dict.

        Raises:
            FileNotFoundError: If the config file does not exist.
        """
        config_path = Path(config_path)
        if not config_path.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")

        with open(config_path, encoding="utf-8") as f:
            raw_config = yaml.safe_load(f)

        # safe_load returns None for an empty document; normalize to {}
        # so callers can always treat the result as a mapping.
        if raw_config is None:
            raw_config = {}

        # Expand environment variables
        return cls._expand_vars(raw_config)

    @classmethod
    def load_config(cls, config_path) -> ExperimentConfig:
        """
        Load YAML config and create typed config object.

        Args:
            config_path: Path to YAML config file (str or Path)

        Returns:
            ExperimentConfig subclass instance
        """
        config_dict = cls.load_yaml(config_path)
        return create_config(config_dict)

    @classmethod
    def load_default_config(cls) -> dict[str, Any]:
        """
        Load default configuration.

        Returns:
            Default config dictionary, or {} if the bundled
            default_config.yaml is missing.
        """
        default_path = Path(__file__).parent.parent / "config" / "default_config.yaml"
        if default_path.exists():
            return cls.load_yaml(default_path)
        return {}

    @classmethod
    def merge_configs(
        cls, base_config: dict[str, Any], override_config: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Merge two configuration dictionaries.

        Nested dicts are merged recursively; any other value in the
        override simply replaces the base value.

        Args:
            base_config: Base configuration
            override_config: Override configuration

        Returns:
            Merged configuration (override takes precedence)
        """
        merged = base_config.copy()

        for key, value in override_config.items():
            if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
                merged[key] = cls.merge_configs(merged[key], value)
            else:
                merged[key] = value

        return merged
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def load_config_with_defaults(config_path: Path) -> ExperimentConfig:
    """
    Load config with default values merged in.

    Args:
        config_path: Path to experiment config file

    Returns:
        ExperimentConfig with defaults applied
    """
    # Defaults form the base; the experiment file is layered on top.
    base = ConfigLoader.load_default_config()
    overrides = ConfigLoader.load_yaml(config_path)
    merged = ConfigLoader.merge_configs(base, overrides)
    return create_config(merged)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Default Configuration for Agent Benchmark Experiments
|
|
2
|
+
|
|
3
|
+
# Data settings
|
|
4
|
+
profile: "quick_eval" # agent_eval usage profile
|
|
5
|
+
split: "dev" # Data split: train/dev/test
|
|
6
|
+
max_samples: # Limit samples (null = all)
|
|
7
|
+
|
|
8
|
+
# Randomness
|
|
9
|
+
seed: 42
|
|
10
|
+
|
|
11
|
+
# Output and reporting
|
|
12
|
+
report:
|
|
13
|
+
format: ["json", "markdown"]
|
|
14
|
+
include_breakdowns: true
|
|
15
|
+
path: "${PROJECT_ROOT}/outputs/agent_benchmark"
|
|
16
|
+
markdown_template:
|
|
17
|
+
|
|
18
|
+
# Metrics (common defaults, overridden per experiment)
|
|
19
|
+
metrics:
|
|
20
|
+
- "accuracy"
|
|
21
|
+
|
|
22
|
+
# Logging
|
|
23
|
+
verbose: true
|
|
24
|
+
log_level: "INFO"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Planning Experiment Configuration
|
|
2
|
+
|
|
3
|
+
experiment: planning
|
|
4
|
+
|
|
5
|
+
# Data configuration
|
|
6
|
+
profile: "full_eval"
|
|
7
|
+
split: "dev"
|
|
8
|
+
max_samples:
|
|
9
|
+
|
|
10
|
+
# Strategy configuration
|
|
11
|
+
planner: "baseline.template"
|
|
12
|
+
min_steps: 5
|
|
13
|
+
max_steps: 10
|
|
14
|
+
planner_params:
|
|
15
|
+
allow_tool_reuse: true
|
|
16
|
+
enforce_sequence: true
|
|
17
|
+
|
|
18
|
+
# Metrics to evaluate
|
|
19
|
+
metrics:
|
|
20
|
+
- "plan_success_rate"
|
|
21
|
+
- "step_accuracy"
|
|
22
|
+
- "tool_sequence_match"
|
|
23
|
+
- "average_plan_length"
|
|
24
|
+
|
|
25
|
+
# Report configuration
|
|
26
|
+
report:
|
|
27
|
+
format: ["json", "markdown"]
|
|
28
|
+
include_breakdowns: true
|
|
29
|
+
path: "${PROJECT_ROOT}/outputs/agent_benchmark/planning"
|
|
30
|
+
markdown_template:
|
|
31
|
+
|
|
32
|
+
# Reproducibility
|
|
33
|
+
seed: 42
|
|
34
|
+
verbose: true
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Timing Detection Experiment Configuration
|
|
2
|
+
|
|
3
|
+
experiment: timing_detection
|
|
4
|
+
|
|
5
|
+
# Data configuration
|
|
6
|
+
profile: "full_eval"
|
|
7
|
+
split: "dev"
|
|
8
|
+
max_samples:
|
|
9
|
+
|
|
10
|
+
# Strategy configuration
|
|
11
|
+
detector: "baseline.threshold"
|
|
12
|
+
threshold: 0.5
|
|
13
|
+
detector_params:
|
|
14
|
+
use_context: true
|
|
15
|
+
confidence_threshold: 0.7
|
|
16
|
+
|
|
17
|
+
# Metrics to evaluate
|
|
18
|
+
metrics:
|
|
19
|
+
- "f1_score"
|
|
20
|
+
- "precision"
|
|
21
|
+
- "recall"
|
|
22
|
+
- "accuracy"
|
|
23
|
+
- "confusion_matrix"
|
|
24
|
+
|
|
25
|
+
# Report configuration
|
|
26
|
+
report:
|
|
27
|
+
format: ["json", "markdown"]
|
|
28
|
+
include_breakdowns: true
|
|
29
|
+
path: "${PROJECT_ROOT}/outputs/agent_benchmark/timing_detection"
|
|
30
|
+
markdown_template:
|
|
31
|
+
|
|
32
|
+
# Reproducibility
|
|
33
|
+
seed: 42
|
|
34
|
+
verbose: true
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Tool Selection Experiment Configuration
|
|
2
|
+
|
|
3
|
+
experiment: tool_selection
|
|
4
|
+
|
|
5
|
+
# Data configuration
|
|
6
|
+
profile: "quick_eval"
|
|
7
|
+
split: "dev"
|
|
8
|
+
max_samples:
|
|
9
|
+
|
|
10
|
+
# Strategy configuration
|
|
11
|
+
selector: "baseline.keyword"
|
|
12
|
+
top_k: 5
|
|
13
|
+
selector_params:
|
|
14
|
+
min_score: 0.1
|
|
15
|
+
|
|
16
|
+
# Metrics to evaluate
|
|
17
|
+
metrics:
|
|
18
|
+
- "top_k_accuracy"
|
|
19
|
+
- "recall@5"
|
|
20
|
+
- "precision@5"
|
|
21
|
+
- "mrr"
|
|
22
|
+
|
|
23
|
+
# Report configuration
|
|
24
|
+
report:
|
|
25
|
+
format: ["json", "markdown"]
|
|
26
|
+
include_breakdowns: true
|
|
27
|
+
path: "${PROJECT_ROOT}/outputs/agent_benchmark/tool_selection"
|
|
28
|
+
markdown_template:
|
|
29
|
+
|
|
30
|
+
# Reproducibility
|
|
31
|
+
seed: 42
|
|
32
|
+
verbose: true
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Paths Configuration for Agent Benchmark
|
|
3
|
+
|
|
4
|
+
This module provides centralized management of data paths for agent benchmark.
|
|
5
|
+
It follows SAGE's two-layer data architecture:
|
|
6
|
+
1. Source Layer: Original data accessed via DataManager
|
|
7
|
+
2. Runtime Layer: Generated data for specific experiments
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
from sage.benchmark.benchmark_agent.data_paths import (
|
|
11
|
+
get_source_paths,
|
|
12
|
+
get_runtime_paths,
|
|
13
|
+
ensure_runtime_dirs,
|
|
14
|
+
DataPathsConfig,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Get source data paths (read-only)
|
|
18
|
+
source = get_source_paths()
|
|
19
|
+
tools_file = source.tools_catalog
|
|
20
|
+
|
|
21
|
+
# Get runtime data paths
|
|
22
|
+
runtime = get_runtime_paths()
|
|
23
|
+
output_dir = runtime.tool_selection_dir
|
|
24
|
+
|
|
25
|
+
# Ensure runtime directories exist
|
|
26
|
+
ensure_runtime_dirs()
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Optional
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _find_sage_root() -> Path:
|
|
36
|
+
"""
|
|
37
|
+
Find SAGE project root directory.
|
|
38
|
+
|
|
39
|
+
Looks for .git directory or SAGE_ROOT environment variable.
|
|
40
|
+
"""
|
|
41
|
+
# Check environment variable first
|
|
42
|
+
if "SAGE_ROOT" in os.environ:
|
|
43
|
+
return Path(os.environ["SAGE_ROOT"])
|
|
44
|
+
|
|
45
|
+
# Walk up from current file to find project root
|
|
46
|
+
current = Path(__file__).resolve()
|
|
47
|
+
while current.parent != current:
|
|
48
|
+
if (current / ".git").exists() and (current / "packages").exists():
|
|
49
|
+
return current
|
|
50
|
+
current = current.parent
|
|
51
|
+
|
|
52
|
+
# Fallback to current working directory
|
|
53
|
+
return Path.cwd()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _find_package_root() -> Path:
    """Find the sage-benchmark package root directory."""
    # Walk upward until a directory literally named "sage-benchmark".
    node = Path(__file__).resolve()
    while node.parent != node:
        if node.name == "sage-benchmark":
            return node
        node = node.parent

    # Fallback: derive the location from the SAGE repository root.
    return _find_sage_root() / "packages" / "sage-benchmark"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class SourcePaths:
    """
    Paths to source data (read-only, via DataManager).

    These are the original data files and must not be modified directly;
    use DataManager for standard access.
    """

    # Top-level data root
    data_root: Path

    # Agent benchmark data
    benchmark_dir: Path
    benchmark_splits_dir: Path
    benchmark_metadata_dir: Path

    # Agent tools data
    tools_dir: Path
    tools_data_dir: Path

    # Agent SFT data
    sft_dir: Path
    sft_data_dir: Path

    @property
    def tool_selection_file(self) -> Path:
        """JSONL file holding the tool-selection benchmark split."""
        return self.benchmark_splits_dir / "tool_selection.jsonl"

    @property
    def task_planning_file(self) -> Path:
        """JSONL file holding the task-planning benchmark split."""
        return self.benchmark_splits_dir / "task_planning.jsonl"

    @property
    def timing_judgment_file(self) -> Path:
        """JSONL file holding the timing-judgment benchmark split."""
        return self.benchmark_splits_dir / "timing_judgment.jsonl"

    @property
    def tools_catalog(self) -> Path:
        """JSONL catalog of available tools."""
        return self.tools_data_dir / "tool_catalog.jsonl"

    @property
    def tools_categories(self) -> Path:
        """JSON file mapping tools to categories."""
        return self.tools_data_dir / "categories.json"

    @property
    def sft_conversations(self) -> Path:
        """JSONL file with SFT conversation data."""
        return self.sft_data_dir / "sft_conversations.jsonl"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass
class RuntimePaths:
    """
    Paths to runtime/generated data.

    Generated by prepare_*.py scripts for specific experiments and
    stored under .sage/benchmark/data/ (gitignored).
    """

    # Top-level roots
    data_root: Path
    results_root: Path

    # Per-task data directories
    tool_selection_dir: Path
    task_planning_dir: Path
    timing_judgment_dir: Path

    # Per-task results directories
    tool_selection_results: Path
    task_planning_results: Path
    timing_judgment_results: Path

    @property
    def tool_selection_base(self) -> Path:
        """Base tool-selection data file."""
        return self.tool_selection_dir / "tool_selection.jsonl"

    def tool_selection_with_candidates(self, num_candidates: int) -> Path:
        """Tool-selection data file for a given candidate pool size."""
        return self.tool_selection_dir / f"tool_selection_{num_candidates}.jsonl"

    @property
    def task_planning_base(self) -> Path:
        """Base task-planning data file."""
        return self.task_planning_dir / "task_planning.jsonl"

    def timing_split_file(self, split: str) -> Path:
        """Timing-judgment data file for a specific split name."""
        return self.timing_judgment_dir / f"{split}.jsonl"
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@dataclass
class DataPathsConfig:
    """
    Complete data paths configuration.

    Bundles source (read-only) and runtime (generated) path sets.
    """

    source: SourcePaths
    runtime: RuntimePaths

    @property
    def sage_root(self) -> Path:
        """SAGE project root directory."""
        return _find_sage_root()

    @property
    def package_root(self) -> Path:
        """sage-benchmark package root directory."""
        return _find_package_root()
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Module-level cached config
|
|
191
|
+
_config: Optional[DataPathsConfig] = None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def get_source_paths() -> SourcePaths:
    """
    Get source data paths.

    Returns:
        SourcePaths object with paths to original data files.
    """
    data_root = _find_package_root() / "src" / "sage" / "data" / "sources"

    # Hoist the three sub-tree roots so each path is written once.
    benchmark_dir = data_root / "agent_benchmark"
    tools_dir = data_root / "agent_tools"
    sft_dir = data_root / "agent_sft"

    return SourcePaths(
        data_root=data_root,
        benchmark_dir=benchmark_dir,
        benchmark_splits_dir=benchmark_dir / "splits",
        benchmark_metadata_dir=benchmark_dir / "metadata",
        tools_dir=tools_dir,
        tools_data_dir=tools_dir / "data",
        sft_dir=sft_dir,
        sft_data_dir=sft_dir / "data",
    )
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def get_runtime_paths() -> RuntimePaths:
    """
    Get runtime/generated data paths.

    Returns:
        RuntimePaths object with paths to generated data files.
    """
    # Runtime artifacts live under the gitignored .sage directory.
    base = _find_sage_root() / ".sage" / "benchmark"
    data_root = base / "data"
    results_root = base / "results"

    return RuntimePaths(
        data_root=data_root,
        results_root=results_root,
        tool_selection_dir=data_root / "tool_selection",
        task_planning_dir=data_root / "task_planning",
        timing_judgment_dir=data_root / "timing_judgment",
        tool_selection_results=results_root / "tool_selection",
        task_planning_results=results_root / "task_planning",
        timing_judgment_results=results_root / "timing_judgment",
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def get_data_paths_config() -> DataPathsConfig:
    """
    Get complete data paths configuration.

    Returns the module-level cached config object, building it on the
    first call.
    """
    global _config
    if _config is None:
        _config = DataPathsConfig(
            source=get_source_paths(),
            runtime=get_runtime_paths(),
        )
    return _config
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def ensure_runtime_dirs() -> None:
    """
    Ensure all runtime directories exist.

    Call this before writing generated data.
    """
    runtime = get_runtime_paths()

    # Data directories plus their matching results directories.
    for directory in (
        runtime.tool_selection_dir,
        runtime.task_planning_dir,
        runtime.timing_judgment_dir,
        runtime.tool_selection_results,
        runtime.task_planning_results,
        runtime.timing_judgment_results,
    ):
        directory.mkdir(parents=True, exist_ok=True)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def print_data_paths_summary() -> None:
    """Print a summary of all data paths for debugging."""
    config = get_data_paths_config()

    separator = "=" * 60
    print("\n" + separator)
    print("SAGE Agent Benchmark Data Paths")
    print(separator)

    # NOTE(review): the original printed mojibake (mis-encoded emoji
    # bytes such as "š" and "ā…") in the section headers and status
    # markers; replaced with plain ASCII so output is encoding-safe.
    print("\n[Source] Source Data (read-only, via DataManager):")
    print(f"  Root: {config.source.data_root}")
    print(f"  Tool Selection: {config.source.tool_selection_file}")
    print(f"  Task Planning: {config.source.task_planning_file}")
    print(f"  Timing Judgment: {config.source.timing_judgment_file}")
    print(f"  Tools Catalog: {config.source.tools_catalog}")
    print(f"  SFT Data: {config.source.sft_conversations}")

    print("\n[Runtime] Runtime Data (generated, in .sage/):")
    print(f"  Root: {config.runtime.data_root}")
    print(f"  Tool Selection: {config.runtime.tool_selection_dir}")
    print(f"  Task Planning: {config.runtime.task_planning_dir}")
    print(f"  Timing Judgment: {config.runtime.timing_judgment_dir}")

    print("\n[Results]:")
    print(f"  Root: {config.runtime.results_root}")

    # Existence check for the key source file and runtime directory.
    print("\n[Status]:")
    source_exists = config.source.tool_selection_file.exists()
    runtime_exists = config.runtime.tool_selection_dir.exists()
    print(f"  Source data exists: {'yes' if source_exists else 'no'}")
    print(f"  Runtime data exists: {'yes' if runtime_exists else 'no'}")
    print(separator)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# Backwards compatibility aliases
|
|
315
|
+
def get_data_paths() -> dict:
    """
    Legacy function for backwards compatibility.

    Returns a dict with source data paths. Prefer calling
    get_source_paths() or get_runtime_paths() directly.
    """
    source = get_source_paths()
    runtime = get_runtime_paths()
    return {
        "tools_dir": source.tools_data_dir,
        "benchmark_dir": source.benchmark_splits_dir,
        "output_dir": runtime.tool_selection_dir,
    }
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
if __name__ == "__main__":
    # Running the module directly prints the resolved paths for inspection.
    print_data_paths_summary()
|