isage-benchmark-agent 0.1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +59 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +6 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/__init__.pyc +0 -0
  9. sage/benchmark/__init__.py +0 -0
  10. sage/benchmark/__init__.pyc +0 -0
  11. sage/benchmark/benchmark_agent/__init__.py +108 -0
  12. sage/benchmark/benchmark_agent/__init__.pyc +0 -0
  13. sage/benchmark/benchmark_agent/__main__.pyc +0 -0
  14. sage/benchmark/benchmark_agent/acebench_loader.pyc +0 -0
  15. sage/benchmark/benchmark_agent/adapter_registry.pyc +0 -0
  16. sage/benchmark/benchmark_agent/config/config_loader.pyc +0 -0
  17. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  18. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  19. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  20. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  21. sage/benchmark/benchmark_agent/data_paths.pyc +0 -0
  22. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  23. sage/benchmark/benchmark_agent/evaluation/__init__.pyc +0 -0
  24. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  25. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.pyc +0 -0
  26. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.pyc +0 -0
  27. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.pyc +0 -0
  28. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.pyc +0 -0
  29. sage/benchmark/benchmark_agent/evaluation/evaluator.pyc +0 -0
  30. sage/benchmark/benchmark_agent/evaluation/metrics.pyc +0 -0
  31. sage/benchmark/benchmark_agent/evaluation/report_builder.pyc +0 -0
  32. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.pyc +0 -0
  33. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  34. sage/benchmark/benchmark_agent/experiments/__init__.pyc +0 -0
  35. sage/benchmark/benchmark_agent/experiments/base_experiment.pyc +0 -0
  36. sage/benchmark/benchmark_agent/experiments/method_comparison.pyc +0 -0
  37. sage/benchmark/benchmark_agent/experiments/planning_exp.pyc +0 -0
  38. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.pyc +0 -0
  39. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.pyc +0 -0
  40. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  41. sage/benchmark/benchmark_agent/scripts/__init__.pyc +0 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/__init__.pyc +0 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.pyc +0 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.pyc +0 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.pyc +0 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.pyc +0 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.pyc +0 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.pyc +0 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.pyc +0 -0
  51. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.pyc +0 -0
  52. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.pyc +0 -0
  53. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.pyc +0 -0
  54. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.pyc +0 -0
  55. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.pyc +0 -0
  56. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.pyc +0 -0
  57. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.pyc +0 -0
  58. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.pyc +0 -0
  59. sage/benchmark/benchmark_agent/tools_loader.pyc +0 -0
@@ -0,0 +1,91 @@
+ Metadata-Version: 2.4
+ Name: isage-benchmark-agent
+ Version: 0.1.0.1
+ Summary: SAGE Benchmark Agent - Agent capability evaluation framework
+ Author-email: IntelliStream Team <shuhao_zhang@hust.edu.cn>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/intellistream/sage-agent-benchmark
+ Project-URL: Documentation, https://github.com/intellistream/sage-agent-benchmark#readme
+ Project-URL: Repository, https://github.com/intellistream/sage-agent-benchmark
+ Project-URL: Issues, https://github.com/intellistream/sage-agent-benchmark/issues
+ Keywords: sage,benchmark,agent,tool-selection,planning,evaluation,intellistream
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: ==3.11.*
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: isage-common
+ Requires-Dist: isage-libs
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: numpy<2.3.0,>=1.26.0
+ Requires-Dist: typer<1.0.0,>=0.15.0
+ Requires-Dist: rich<14.0.0,>=13.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: ruff==0.14.6; extra == "dev"
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
+ Provides-Extra: all
+ Requires-Dist: isage-benchmark-agent[dev]; extra == "all"
+ Dynamic: license-file
+
+ # SAGE Benchmark Agent
+
+ Configuration-driven experiment framework for evaluating agent capabilities.
+
+ ## Features
+
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
+ - **Planning Evaluation**: Multi-step planning with tool composition
+ - **Timing Detection**: Timing judgment for tool invocation decisions
+
+ ## Quick Start
+
+ ```bash
+ # Install
+ pip install isage-benchmark-agent
+
+ # Run tool selection experiment
+ sage-agent-bench tool-selection --config config/tool_selection_exp.yaml
+
+ # Run planning experiment
+ sage-agent-bench planning --config config/planning_exp.yaml
+ ```
+
+ ## Documentation
+
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
+
+ ## Development
+
+ ```bash
+ # Clone
+ git clone https://github.com/intellistream/sage-agent-benchmark.git
+ cd sage-agent-benchmark
+
+ # Set up virtual environment
+ python -m venv .venv
+ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+
+ # Install in development mode
+ pip install -e ".[dev]"
+
+ # Run tests
+ pytest
+ ```
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,59 @@
+ isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE,sha256=vBNVIGkYYZY0B8f0Ui1ITYwRu7WNtSwyxvIAVGYS6jU,1075
+ sage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sage/__init__.pyc,sha256=8sSJ7mfq8oAAPGQurhAtP-2HOTLofIZVpwYzrJJD1YM,125
+ sage/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sage/benchmark/__init__.pyc,sha256=3hwYkavnuAMyMha9hrX3i5bNBHfVAYz9I3TEeE1Ru0k,135
+ sage/benchmark/benchmark_agent/__init__.py,sha256=Bnlx9jxIxIq0xo1Q-3Hrr2VRJZq0g6nHhXcmffotBYc,2807
+ sage/benchmark/benchmark_agent/__init__.pyc,sha256=5wbDpUu6ZFmlHb8R_U1Hy9RzHkijQC7K4y9rfzjsl8Y,2820
+ sage/benchmark/benchmark_agent/__main__.pyc,sha256=wpjYGWkeFldKbzeiJGUkgk3JU0QYDLE7T7nIog0z5AU,8148
+ sage/benchmark/benchmark_agent/acebench_loader.pyc,sha256=Qyg66TJIa-CqZYusCguWKCvl2yVcwR0GpZ2BBNl8kNM,15940
+ sage/benchmark/benchmark_agent/adapter_registry.pyc,sha256=BvfWmqPx7cQyrn_Ql7oUZNDAPfwouoqUZOFTFCKygug,140109
+ sage/benchmark/benchmark_agent/data_paths.pyc,sha256=bcjsP7pvFRM0Wip9uCpVI3E2jzibqUD6FRDgrrCZSUQ,13674
+ sage/benchmark/benchmark_agent/tools_loader.pyc,sha256=5e2RdIfloId-kA5ahUPojOfNUu0blrkP5yC8tfnND-0,9898
+ sage/benchmark/benchmark_agent/config/config_loader.pyc,sha256=dQuSmaXDSbfkE17S_Cjy84_L4aV31OGy0Pvms4xURMg,7986
+ sage/benchmark/benchmark_agent/config/default_config.yaml,sha256=iHNVAkou8UxcIOMBEuJheNeVI0fAlfe-BV9inzxw9_s,532
+ sage/benchmark/benchmark_agent/config/planning_exp.yaml,sha256=nyimmpQ725skpvEF1eNJpYudkCxYuh2hz6u2e1velNE,607
+ sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml,sha256=058TwEeeP8RkzjV7hUbYWFBctrzpcbAXKAlZzw6fJdI,604
+ sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml,sha256=dfar6m9XJ1AHsoqVqEN3iN5x8Ka2M3UpvzhpXlCkc6o,544
+ sage/benchmark/benchmark_agent/evaluation/__init__.py,sha256=Ypoer04sin7kTryIJWOqIImifonmXvhHWhOAwrhHK10,6615
+ sage/benchmark/benchmark_agent/evaluation/__init__.pyc,sha256=LGkSU9NogatJm6IPN9KiXib5GNwU-QthoiI8bapO7nk,9188
+ sage/benchmark/benchmark_agent/evaluation/evaluator.pyc,sha256=m2DvOAQNdghDbx0pa9YdzaV4JYX8nzXJorZRqyG_3Bs,11834
+ sage/benchmark/benchmark_agent/evaluation/metrics.pyc,sha256=OBE68eXcN9flhN85cjO_XhcJN0Ra8HJxjmaKdxTYHv4,26906
+ sage/benchmark/benchmark_agent/evaluation/report_builder.pyc,sha256=kz0QCGqe-YbtTFZAWC8q93WogXCsYONFa4frmQUtOS8,12968
+ sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.pyc,sha256=7cbQ-ESoBow0Y3WaUk5eDZW3-q-fHn3lH9Y1gzps0X8,30176
+ sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py,sha256=e_kzn4nWKe6l4L0OtSd4-V7Tt064bzwvlz3BsfQXBZI,282
+ sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.pyc,sha256=V06E_4Y5JBZXxwl5hZX4flDqQJrJeCpCiU71PRux89U,502
+ sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.pyc,sha256=P8kgxnATy9X4goe8HgfXehFH5CVTnLPx3BItPEf8fCM,5747
+ sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.pyc,sha256=j58rnbJPOzrjYMUrTsqznUBeGDfeASHJuOkE9bzz3zA,5336
+ sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.pyc,sha256=YFhM8485b10cNQF9H7U-sKngrivS4_i8WpwQp5tIjpQ,5213
+ sage/benchmark/benchmark_agent/experiments/__init__.py,sha256=w3aKDv8nfgXceZX81KFtOwxuRXhbETLSxUeQDD7HONY,1574
+ sage/benchmark/benchmark_agent/experiments/__init__.pyc,sha256=RjkbMzkZ9hRmnYOgz08sJHQV1HKeDeyDVIyqJLo-33A,1649
+ sage/benchmark/benchmark_agent/experiments/base_experiment.pyc,sha256=tQdZ_cLBgBX8NW7Fbb59czSA46xYp7Tfp_Lt-PcjhVM,14444
+ sage/benchmark/benchmark_agent/experiments/method_comparison.pyc,sha256=tjV8lCsbCM9X-nS3HqZ1o3Gr9olshnDoLWiCVDhEtDo,35466
+ sage/benchmark/benchmark_agent/experiments/planning_exp.pyc,sha256=9_P2NcZiQDvRmMDJf19lgVSOq779L_r9Mn2PVmiM3SE,12677
+ sage/benchmark/benchmark_agent/experiments/timing_detection_exp.pyc,sha256=aUcKpUrchwe4Lq-pQfURKmVk4cBLLuxQ-0ZuZ1uKX9U,8773
+ sage/benchmark/benchmark_agent/experiments/tool_selection_exp.pyc,sha256=d2_jIGC8cNVN-_7W7a3bzJDtSsATMNYcBjfUlsuLWgA,12461
+ sage/benchmark/benchmark_agent/scripts/__init__.py,sha256=C8YqglL5eDKIyB8fKg7mC5NZJZ_Fn7LlmEFl2e8RVRw,668
+ sage/benchmark/benchmark_agent/scripts/__init__.pyc,sha256=hns6MbbPoHu8GNDw8s2XY0DGPf-JMqI1XI73po2s1ng,845
+ sage/benchmark/benchmark_agent/scripts/experiments/__init__.py,sha256=d3VxS2Qfuz7WujRnREz25IUUOVOOatokoD5rhAExy14,1125
+ sage/benchmark/benchmark_agent/scripts/experiments/__init__.pyc,sha256=MQOkJMGUuYrYITQ225WvtrKDiNFCvAuDE-Pk_v6K4rc,1322
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.pyc,sha256=J-pb1e3K8yDfIEnydkZ3MVZE-nQjCRAVrmGi0KET7JI,14701
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.pyc,sha256=MPoQzFjEAKaPvpq6eeVzkvfxkc21ZpnrV9LqbcR7Zh4,18167
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.pyc,sha256=U4eC1dOMGx34k_c4AYJ0TBh_dY6ifL-cHqZkIdmxyro,16056
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.pyc,sha256=V6FL4YHPDFddTBAvszsghCY7MpgnK6E8Tlcu5pU7eGU,23045
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.pyc,sha256=rNQuqNYSbc-MpPU7_l5GXBGJ-VaeecQm8avdYJsiT0k,16312
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.pyc,sha256=q7WSoKmHhTrt9uSsNDq6S0iLflcBEktZ0zWlHuOgkI4,14665
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.pyc,sha256=7SxuvuZI0W-iL48yF6jfdEmAZeG5V6gYnBy0_GQ22mA,16696
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.pyc,sha256=UPzcsz8HF8rQgENNDgKEbDGvFARO0u55atKQnvtuQ9k,10934
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.pyc,sha256=2AqFj1NpfCvOyryt9PlMBzWNsm3ASyJchixHVmzwOM4,26165
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.pyc,sha256=JhcOpoFow2eFjJgX9VlxLXGKkSltJFYueUsyO3j9APk,18357
+ sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.pyc,sha256=Iodn1OUeH8K2RGVu5IVWjCpeC2XurVXC0hz5H46YgRM,29757
+ sage/benchmark/benchmark_agent/scripts/experiments/llm_service.pyc,sha256=GsEOHw1Vz5JLohLAfHWcA4LIfEP3z2EubN5KwbstLlg,13530
+ sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.pyc,sha256=qJHju7y2tWFX66c9wFRs-LbqkvEatGVr7CW4wcV1LF8,25575
+ sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.pyc,sha256=W9Q6L4spqf-KS2bMM-wnAu5dDKme0FS0rXKhpBqoosE,16987
+ sage/benchmark/benchmark_agent/scripts/experiments/table_generator.pyc,sha256=z11-4HOgnsn-JtP-nodk-zzWIE30YvCBF3DTzCYqGGk,17114
+ isage_benchmark_agent-0.1.0.1.dist-info/METADATA,sha256=2tuTsOY7txE2XqvJcAF0nDeB6gV_jRJDtia5aHKndjc,3018
+ isage_benchmark_agent-0.1.0.1.dist-info/WHEEL,sha256=Mk1ST5gDzEO5il5kYREiBnzzM469m5sI8ESPl7TRhJY,110
+ isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt,sha256=g34HO224bwnCvIklVp2JQw4wTNDgNW7u61F-cL02pSA,82
+ isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt,sha256=hibFyzQHiLOMK68qL1OWsNKaXOmSXqZjeLTBem6Yy7I,5
+ isage_benchmark_agent-0.1.0.1.dist-info/RECORD,,
@@ -0,0 +1,6 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py2-none-any
+ Tag: py3-none-any
+
@@ -0,0 +1,2 @@
+ [console_scripts]
+ sage-agent-bench = sage.benchmark.benchmark_agent.__main__:main
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 IntelliStream Team
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
sage/__init__.py ADDED
File without changes
sage/__init__.pyc ADDED
Binary file
sage/benchmark/__init__.py ADDED
File without changes
sage/benchmark/__init__.pyc ADDED
Binary file
@@ -0,0 +1,108 @@
+ """
+ Agent Capability Benchmark Module
+
+ This module provides infrastructure for evaluating agent capabilities including:
+ - Tool selection
+ - Task planning
+ - Timing detection
+
+ Architecture:
+     config/              Configuration files and loaders
+     experiments/         Experiment runners and base classes
+     adapter_registry.py  Strategy adapter registry
+
+ Usage:
+     # Via CLI
+     python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml
+
+     # Programmatic
+     from sage.benchmark.benchmark_agent import ToolSelectionExperiment
+     from sage.benchmark.benchmark_agent.config import ConfigLoader
+     from sage.benchmark.benchmark_agent.adapter_registry import get_adapter_registry
+     from sage.data import DataManager
+
+     loader = ConfigLoader()
+     config = loader.load_config("config/tool_selection_exp.yaml")
+
+     dm = DataManager.get_instance()
+     registry = get_adapter_registry()
+
+     exp = ToolSelectionExperiment(config, data_manager=dm, adapter_registry=registry)
+     exp.prepare()
+     result = exp.run()
+     exp.finalize()
+ """
+
+ from sage.benchmark.benchmark_agent.adapter_registry import (
+     AdapterRegistry,
+     PlannerAdapter,
+     SelectorAdapter,
+     TimingAdapter,
+     get_adapter_registry,
+     register_strategy,
+ )
+
+ # Data paths management
+ from sage.benchmark.benchmark_agent.data_paths import (
+     DataPathsConfig,
+     RuntimePaths,
+     SourcePaths,
+     ensure_runtime_dirs,
+     get_data_paths_config,
+     get_runtime_paths,
+     get_source_paths,
+ )
+ from sage.benchmark.benchmark_agent.experiments import (  # Base classes; Configs; Experiments
+     BaseExperiment,
+     ExperimentConfig,
+     ExperimentResult,
+     PlanningConfig,
+     PlanningExperiment,
+     TimingDetectionConfig,
+     TimingDetectionExperiment,
+     ToolSelectionConfig,
+     ToolSelectionExperiment,
+ )
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
+     ExperimentResult as ComparisonResult,
+ )
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
+     MethodComparisonExperiment,
+     MethodConfig,
+     MethodRegistry,
+     run_full_comparison,
+     run_quick_comparison,
+ )
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "__version__",
+     # Experiments
+     "ToolSelectionExperiment",
+     "PlanningExperiment",
+     "TimingDetectionExperiment",
+     # Base
+     "BaseExperiment",
+     "ExperimentConfig",
+     "ExperimentResult",
+     # Configs
+     "ToolSelectionConfig",
+     "PlanningConfig",
+     "TimingDetectionConfig",
+     # Adapter Registry
+     "AdapterRegistry",
+     "SelectorAdapter",
+     "PlannerAdapter",
+     "TimingAdapter",
+     "get_adapter_registry",
+     "register_strategy",
+     # Data Paths
+     "get_source_paths",
+     "get_runtime_paths",
+     "get_data_paths_config",
+     "ensure_runtime_dirs",
+     "SourcePaths",
+     "RuntimePaths",
+     "DataPathsConfig",
+ ]
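The adapter registry exported above is the package's public extension point for custom strategies. As a rough illustration, here is a minimal sketch of registering a custom selector; the actual `SelectorAdapter` interface and `register_strategy` signature are not visible in this diff, so the `select` method name, the `(name, class)` argument order, and the tool-dict shape are all assumptions:

```python
# Hypothetical sketch -- the real adapter interface is not shown in this diff.
from sage.benchmark.benchmark_agent import SelectorAdapter, register_strategy


class KeywordOverlapSelector(SelectorAdapter):
    """Toy selector: rank tools by keyword overlap with the query (illustrative only)."""

    def select(self, query: str, tools: list[dict], top_k: int = 5) -> list[dict]:
        words = set(query.lower().split())
        scored = sorted(
            tools,
            key=lambda t: len(words & set(t.get("description", "").lower().split())),
            reverse=True,
        )
        return scored[:top_k]


# Assumed registration call, named to mirror the "baseline.keyword" convention
# used in the YAML configs below.
register_strategy("custom.keyword_overlap", KeywordOverlapSelector)
```

Presumably a strategy registered this way could then be referenced from a config file via its `selector:` key, the same way `baseline.keyword` is.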
@@ -0,0 +1,24 @@
+ # Default Configuration for Agent Benchmark Experiments
+
+ # Data settings
+ profile: "quick_eval"  # agent_eval usage profile
+ split: "dev"  # Data split: train/dev/test
+ max_samples:  # Limit samples (null = all)
+
+ # Randomness
+ seed: 42
+
+ # Output and reporting
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark"
+   markdown_template:
+
+ # Metrics (common defaults, overridden per experiment)
+ metrics:
+   - "accuracy"
+
+ # Logging
+ verbose: true
+ log_level: "INFO"
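Per the package docstring quoted earlier, configs are loaded through `ConfigLoader.load_config`. A short sketch of that call; whether per-experiment files are layered over these defaults automatically is an assumption based on the header comment above, not something this diff confirms:

```python
from sage.benchmark.benchmark_agent.config import ConfigLoader

loader = ConfigLoader()

# load_config is shown in the package docstring; the merge of
# default_config.yaml underneath an experiment file is an assumption.
config = loader.load_config("config/tool_selection_exp.yaml")

# Note: blank YAML values such as `max_samples:` parse as None,
# which the comment above documents as "use all samples".
```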
@@ -0,0 +1,34 @@
+ # Planning Experiment Configuration
+
+ experiment: planning
+
+ # Data configuration
+ profile: "full_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ planner: "baseline.template"
+ min_steps: 5
+ max_steps: 10
+ planner_params:
+   allow_tool_reuse: true
+   enforce_sequence: true
+
+ # Metrics to evaluate
+ metrics:
+   - "plan_success_rate"
+   - "step_accuracy"
+   - "tool_sequence_match"
+   - "average_plan_length"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/planning"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
@@ -0,0 +1,34 @@
+ # Timing Detection Experiment Configuration
+
+ experiment: timing_detection
+
+ # Data configuration
+ profile: "full_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ detector: "baseline.threshold"
+ threshold: 0.5
+ detector_params:
+   use_context: true
+   confidence_threshold: 0.7
+
+ # Metrics to evaluate
+ metrics:
+   - "f1_score"
+   - "precision"
+   - "recall"
+   - "accuracy"
+   - "confusion_matrix"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/timing_detection"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
@@ -0,0 +1,32 @@
+ # Tool Selection Experiment Configuration
+
+ experiment: tool_selection
+
+ # Data configuration
+ profile: "quick_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ selector: "baseline.keyword"
+ top_k: 5
+ selector_params:
+   min_score: 0.1
+
+ # Metrics to evaluate
+ metrics:
+   - "top_k_accuracy"
+   - "recall@5"
+   - "precision@5"
+   - "mrr"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/tool_selection"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
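For orientation, the ranking metrics requested here (`recall@5`, `precision@5`, `mrr`) follow their standard information-retrieval definitions. A small self-contained sketch of two of them, independent of the package's own `MetricRegistry` implementations:

```python
def recall_at_k(pred: list[str], ref: list[str], k: int = 5) -> float:
    """Fraction of ground-truth tools recovered in the top-k predictions."""
    if not ref:
        return 0.0
    return len(set(pred[:k]) & set(ref)) / len(ref)


def mrr(pred: list[str], ref: list[str]) -> float:
    """Reciprocal rank of the first relevant prediction (0.0 if none appears)."""
    for rank, tool in enumerate(pred, start=1):
        if tool in ref:
            return 1.0 / rank
    return 0.0


# Ground truth {search, calc}; the selector ranked a distractor first.
print(recall_at_k(["wiki", "search", "calc"], ["search", "calc"]))  # 1.0
print(mrr(["wiki", "search", "calc"], ["search", "calc"]))          # 0.5
```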
@@ -0,0 +1,217 @@
+ """
+ Evaluation module for Agent Capability Benchmark.
+
+ This module provides metrics, analyzers, and report builders for evaluating
+ agent performance across three capabilities: tool selection, task planning,
+ and timing judgment.
+ """
+
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Protocol, Sequence
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+ __all__ = [
+     "MetricOutput",
+     "EvaluationReport",
+     "Metric",
+     "Analyzer",
+     "ReportBuilder",
+     "compute_metrics",
+     "MetricRegistry",
+ ]
+
+
+ class MetricOutput(BaseModel):
+     """Output from a metric computation."""
+
+     value: float
+     details: dict[str, Any] = Field(default_factory=dict)
+
+
+ class EvaluationReport(BaseModel):
+     """Complete evaluation report with metrics, breakdowns, and artifacts."""
+
+     task: str
+     experiment_id: str
+     metrics: dict[str, float]
+     breakdowns: dict[str, Any] = Field(default_factory=dict)
+     artifacts: dict[str, Path] = Field(default_factory=dict)
+     timestamp: str
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+ class Metric(Protocol):
+     """Protocol for metric implementations."""
+
+     name: str
+
+     def compute(self, predictions: Sequence[Any], references: Sequence[Any]) -> MetricOutput:
+         """
+         Compute metric from predictions and references.
+
+         Args:
+             predictions: Model predictions
+             references: Ground truth references
+
+         Returns:
+             MetricOutput with value and optional details
+         """
+         ...
+
+
+ class Analyzer(Protocol):
+     """Protocol for analyzer implementations."""
+
+     name: str
+
+     def analyze(
+         self, predictions: Sequence[Any], references: Sequence[Any], metadata: dict[str, Any]
+     ) -> dict[str, Any]:
+         """
+         Analyze predictions and produce breakdowns.
+
+         Args:
+             predictions: Model predictions
+             references: Ground truth references
+             metadata: Additional context from experiment
+
+         Returns:
+             Dictionary with analysis results
+         """
+         ...
+
+
+ class ReportBuilder(Protocol):
+     """Protocol for report builder implementations."""
+
+     def build(self, report: EvaluationReport, output_path: Path) -> Path:
+         """
+         Build and save report to file.
+
+         Args:
+             report: EvaluationReport to format
+             output_path: Path to save report
+
+         Returns:
+             Path to saved report file
+         """
+         ...
+
+
+ # Import metric registry after defining base classes
+ from sage.benchmark.benchmark_agent.evaluation.metrics import MetricRegistry
+
+
+ def compute_metrics(
+     task: str,
+     predictions: list[dict[str, Any]],
+     references: list[dict[str, Any]],
+     metrics: list[str],
+     k: int = 5,
+ ) -> dict[str, float]:
+     """
+     Compute evaluation metrics for experiment results.
+
+     Args:
+         task: Task type ('tool_selection', 'planning', 'timing_detection')
+         predictions: List of prediction dictionaries
+         references: List of reference dictionaries
+         metrics: List of metric names to compute
+         k: Top-k parameter for ranking metrics
+
+     Returns:
+         Dictionary mapping metric names to values
+     """
+     results = {}
+
+     if task == "tool_selection":
+         # Extract tool lists from predictions and references
+         pred_tools = []
+         ref_tools = []
+
+         for pred, ref in zip(predictions, references):
+             # Get predicted tool IDs
+             if "predicted_tools" in pred:
+                 tools = pred["predicted_tools"]
+                 if tools and isinstance(tools[0], dict):
+                     pred_tools.append([t["tool_id"] for t in tools])
+                 else:
+                     pred_tools.append(tools if tools else [])
+             else:
+                 pred_tools.append([])
+
+             # Get reference tool IDs
+             if "ground_truth_tools" in ref:
+                 ref_tools.append(ref["ground_truth_tools"])
+             elif "top_k" in ref:
+                 ref_tools.append(ref["top_k"])
+             else:
+                 ref_tools.append([])
+
+         # Compute each metric
+         for metric_name in metrics:
+             try:
+                 if metric_name in ("top_k_accuracy", "recall_at_k", "precision_at_k"):
+                     metric = MetricRegistry.get(metric_name, k=k)
+                 elif metric_name == "mrr":
+                     metric = MetricRegistry.get("mrr")
+                 else:
+                     continue
+
+                 output = metric.compute(pred_tools, ref_tools)
+                 results[metric_name] = output.value
+             except Exception as e:
+                 results[metric_name] = 0.0
+                 results[f"{metric_name}_error"] = str(e)
+
+     elif task == "timing_detection":
+         # Extract boolean decisions
+         pred_decisions = []
+         ref_decisions = []
+
+         for pred, ref in zip(predictions, references):
+             pred_decisions.append(pred.get("should_call_tool", False))
+             ref_decisions.append(ref.get("should_call_tool", False))
+
+         # Metric name mapping for timing detection
+         timing_metric_map = {
+             "accuracy": "timing_accuracy",
+             "precision": "timing_precision",
+             "recall": "timing_recall",
+             "f1": "timing_f1",
+         }
+
+         for metric_name in metrics:
+             try:
+                 # Map simple names to full metric names
+                 registry_name = timing_metric_map.get(metric_name, metric_name)
+                 metric = MetricRegistry.get(registry_name)
+                 output = metric.compute(pred_decisions, ref_decisions)
+                 results[metric_name] = output.value
+                 # Include details if available
+                 if hasattr(output, "details") and output.details:
+                     results[f"{metric_name}_details"] = output.details
+             except Exception as e:
+                 results[metric_name] = 0.0
+                 results[f"{metric_name}_error"] = str(e)
+
+     elif task == "planning":
+         # Extract tool sequences
+         pred_sequences = []
+         ref_sequences = []
+
+         for pred, ref in zip(predictions, references):
+             pred_sequences.append(pred.get("tool_sequence", []))
+             ref_sequences.append(ref.get("tool_sequence", []))
+
+         for metric_name in metrics:
+             try:
+                 metric = MetricRegistry.get(metric_name)
+                 output = metric.compute(pred_sequences, ref_sequences)
+                 results[metric_name] = output.value
+             except Exception:
+                 results[metric_name] = 0.0
+
+     return results
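Because `Metric` is a `typing.Protocol`, any class with a `name` attribute and a matching `compute` method satisfies it structurally; no subclassing or registration is needed for standalone use. A minimal sketch (plugging a metric into `compute_metrics` would additionally require adding it to `MetricRegistry`, whose registration API is not shown in this diff):

```python
from typing import Any, Sequence

from sage.benchmark.benchmark_agent.evaluation import MetricOutput


class ExactMatch:
    """Satisfies the Metric protocol: fraction of prediction/reference pairs that match exactly."""

    name = "exact_match"

    def compute(self, predictions: Sequence[Any], references: Sequence[Any]) -> MetricOutput:
        if not references:
            return MetricOutput(value=0.0)
        hits = sum(p == r for p, r in zip(predictions, references))
        return MetricOutput(value=hits / len(references), details={"hits": hits})


print(ExactMatch().compute([1, 2, 3], [1, 0, 3]).value)  # 0.666...
```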
@@ -0,0 +1,11 @@
+ """Analyzers package initialization."""
+
+ from .planning_analyzer import PlanningAnalyzer
+ from .timing_analyzer import TimingAnalyzer
+ from .tool_selection_analyzer import ToolSelectionAnalyzer
+
+ __all__ = [
+     "ToolSelectionAnalyzer",
+     "PlanningAnalyzer",
+     "TimingAnalyzer",
+ ]
@@ -0,0 +1,63 @@
+ """
+ Experiment implementations for agent benchmark evaluation.
+
+ Available experiments:
+ - ToolSelectionExperiment: Tool retrieval and ranking
+ - PlanningExperiment: Multi-step planning with tool composition
+ - TimingDetectionExperiment: Timing judgment for tool invocation
+ """
+
+ from sage.benchmark.benchmark_agent.experiments.base_experiment import (
+     CONFIG_TYPES,
+     BaseExperiment,
+     ExperimentConfig,
+     ExperimentResult,
+     PlanningConfig,
+     PlanningPrediction,
+     PlanStep,
+     ReportConfig,
+     TimingDecision,
+     TimingDetectionConfig,
+     ToolPrediction,
+     ToolSelectionConfig,
+     create_config,
+ )
+ from sage.benchmark.benchmark_agent.experiments.planning_exp import (
+     PlanningExperiment,
+     PlanningTask,
+ )
+ from sage.benchmark.benchmark_agent.experiments.timing_detection_exp import (
+     TimingDetectionExperiment,
+     TimingMessage,
+ )
+ from sage.benchmark.benchmark_agent.experiments.tool_selection_exp import (
+     ToolSelectionExperiment,
+     ToolSelectionQuery,
+ )
+
+ __all__ = [
+     # Base classes
+     "BaseExperiment",
+     "ExperimentConfig",
+     "ExperimentResult",
+     # Config models
+     "ToolSelectionConfig",
+     "PlanningConfig",
+     "TimingDetectionConfig",
+     "ReportConfig",
+     # Result/task models
+     "ToolPrediction",
+     "PlanStep",
+     "PlanningPrediction",
+     "TimingDecision",
+     "ToolSelectionQuery",
+     "PlanningTask",
+     "TimingMessage",
+     # Utilities
+     "CONFIG_TYPES",
+     "create_config",
+     # Experiment implementations
+     "ToolSelectionExperiment",
+     "PlanningExperiment",
+     "TimingDetectionExperiment",
+ ]
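All three experiment classes exported here share the prepare/run/finalize lifecycle shown in the top-level package docstring. A condensed sketch follows; `create_config` and `CONFIG_TYPES` are exported above but their exact shapes are not visible in this diff, so the dict-based dispatch and the single-argument constructor below are assumptions:

```python
from sage.benchmark.benchmark_agent.experiments import (
    CONFIG_TYPES,
    ToolSelectionExperiment,
    create_config,
)

# Presumably maps experiment names ("tool_selection", ...) to config models.
print(sorted(CONFIG_TYPES))

# Hypothetical call mirroring tool_selection_exp.yaml; the real signature may differ.
config = create_config(
    {"experiment": "tool_selection", "selector": "baseline.keyword", "top_k": 5}
)

# The package docstring also shows data_manager= and adapter_registry= kwargs.
exp = ToolSelectionExperiment(config)
exp.prepare()
result = exp.run()
exp.finalize()
```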
@@ -0,0 +1,26 @@
+ """
+ SAGE-Bench Scripts
+
+ Unified entry point for the benchmark experiment scripts.
+
+ All experiment functionality lives in the experiments/ subpackage:
+ - run_paper1_experiments.py: unified entry point for the Paper 1 experiments
+ - exp_main_*.py: Section 5.2 main experiments
+ - exp_analysis_*.py: Section 5.3 analysis experiments
+ - exp_cross_dataset.py: Section 5.4 cross-dataset generalization
+ - exp_training_comparison.py: Section 5.5 training method comparison
+
+ Usage:
+     # CLI entry points
+     sage-bench run --quick
+     sage-bench eval --dataset all
+     sage-bench train --dry-run
+     sage-bench llm status
+
+     # Direct invocation
+     python -m sage.benchmark.benchmark_agent.scripts.experiments.run_paper1_experiments --quick
+ """
+
+ __all__ = [
+     "experiments",
+ ]
@@ -0,0 +1,40 @@
+ """
+ SAGE-Bench Paper 1 Experiments Package
+
+ A collection of experiment scripts organized by the paper's experiment sections:
+
+ - Section 5.2 (Main Results):
+     - exp_main_timing.py          # RQ1: Timing Detection
+     - exp_main_planning.py        # RQ2: Task Planning
+     - exp_main_selection.py       # RQ3: Tool Selection
+
+ - Section 5.3 (Analysis):
+     - exp_analysis_error.py       # 5.3.1 Error Analysis
+     - exp_analysis_scaling.py     # 5.3.2 Scaling Analysis
+     - exp_analysis_robustness.py  # 5.3.3 Robustness Analysis
+     - exp_analysis_ablation.py    # 5.3.4 Ablation Studies
+
+ - Section 5.4 (Generalization):
+     - exp_cross_dataset.py        # Cross-dataset evaluation
+
+ Usage:
+     sage-bench paper1 run                 # Run all experiments
+     sage-bench paper1 run --section 5.2   # Main experiments only
+     sage-bench paper1 timing              # A single experiment
+ """
+
+ from .exp_utils import (
+     get_embedding_client,
+     get_llm_client,
+     load_benchmark_data,
+     save_results,
+     setup_experiment_env,
+ )
+
+ __all__ = [
+     "setup_experiment_env",
+     "load_benchmark_data",
+     "save_results",
+     "get_llm_client",
+     "get_embedding_client",
+ ]