ftl-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ftl_bench-0.1.0/.gitignore +31 -0
- ftl_bench-0.1.0/PKG-INFO +66 -0
- ftl_bench-0.1.0/README.md +40 -0
- ftl_bench-0.1.0/hatch_build.py +54 -0
- ftl_bench-0.1.0/pyproject.toml +60 -0
- ftl_bench-0.1.0/src/ftl_bench/__init__.py +68 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/LICENSE +33 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/README.md +102 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/README.md +67 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/baseline_agent.py +338 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/capture.py +105 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/eval.py +78 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/ftl_mcp_server.py +209 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/llm_agent.py +373 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/play_cli.py +490 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/adapter/run_benchmark.py +315 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/README.md +13 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/data/bridge.lua +18 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/data/hyperspace.xml.append +11 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/data/json.lua +140 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/data/observation.lua +125 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/dev/ftl_bench_dev.lua +1595 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/mod/ftl_bench_bridge/mod-appendix/metadata.xml +7 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/prompts/ftl_agent_v1.md +146 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/prompts/ftl_agent_v2.md +171 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/prompts/ftl_agent_v3.md +85 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scenarios/README.md +9 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scenarios/full_game.json +15 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scenarios/suite_v1.json +59 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scripts/deploy_dev.sh +15 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scripts/install_macos.sh +68 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scripts/restart_ftl.sh +83 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scripts/setup_pc.sh +294 -0
- ftl_bench-0.1.0/src/ftl_bench/_bundled/scripts/verify_observation.sh +50 -0
- ftl_bench-0.1.0/src/ftl_bench/aggregate.py +58 -0
- ftl_bench-0.1.0/src/ftl_bench/cli.py +130 -0
- ftl_bench-0.1.0/src/ftl_bench/observation.py +91 -0
- ftl_bench-0.1.0/src/ftl_bench/scenario.py +83 -0
- ftl_bench-0.1.0/src/ftl_bench/scoring.py +192 -0
- ftl_bench-0.1.0/src/ftl_bench/session.py +441 -0
- ftl_bench-0.1.0/src/ftl_bench/trajectory.py +42 -0
- ftl_bench-0.1.0/tests/fixtures/observation_sample.json +28 -0
- ftl_bench-0.1.0/tests/test_aggregate.py +383 -0
- ftl_bench-0.1.0/tests/test_cli.py +248 -0
- ftl_bench-0.1.0/tests/test_observation_client.py +68 -0
- ftl_bench-0.1.0/tests/test_observation_extra.py +346 -0
- ftl_bench-0.1.0/tests/test_scoring.py +651 -0
- ftl_bench-0.1.0/tests/test_session.py +113 -0
- ftl_bench-0.1.0/tests/test_session_platform.py +366 -0
- ftl_bench-0.1.0/tests/test_trajectory.py +328 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
.venv/
|
|
5
|
+
venv/
|
|
6
|
+
*.egg-info/
|
|
7
|
+
.pytest_cache/
|
|
8
|
+
.coverage
|
|
9
|
+
htmlcov/
|
|
10
|
+
|
|
11
|
+
# Env / secrets
|
|
12
|
+
.env
|
|
13
|
+
*.local
|
|
14
|
+
|
|
15
|
+
# Runtime artifacts
|
|
16
|
+
runs/
|
|
17
|
+
logs/
|
|
18
|
+
*.log
|
|
19
|
+
trajectories/
|
|
20
|
+
state.json
|
|
21
|
+
action.json
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
|
|
26
|
+
# Build
|
|
27
|
+
build/
|
|
28
|
+
dist/
|
|
29
|
+
# vendored into the package at build time by harness/hatch_build.py (not a source of truth)
|
|
30
|
+
src/ftl_bench/_bundled/
|
|
31
|
+
harness/src/ftl_bench/_bundled/
|
ftl_bench-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ftl-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A turn-based, intent-level environment + scenario suite for evaluating LLM agents on FTL: Faster Than Light
|
|
5
|
+
Project-URL: Homepage, https://github.com/ogabrielluiz/ftl_bench
|
|
6
|
+
Project-URL: Repository, https://github.com/ogabrielluiz/ftl_bench
|
|
7
|
+
Author: ogabrielluiz
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: agents,benchmark,evaluation,ftl,game-playing,llm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Operating System :: MacOS
|
|
13
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Provides-Extra: screenshot
|
|
24
|
+
Requires-Dist: pyobjc-framework-quartz>=10.0; (sys_platform == 'darwin') and extra == 'screenshot'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# ftl-bench
|
|
28
|
+
|
|
29
|
+
An agent-evaluation benchmark that has an LLM agent play **FTL: Faster Than Light** through a
|
|
30
|
+
turn-based, intent-level interface built on the FTL-Hyperspace Lua API. The agent reads a
|
|
31
|
+
decision-complete JSON observation and replies with one command; the harness scores how far it
|
|
32
|
+
gets on a suite of reproducible, seed-pinned scenarios.
|
|
33
|
+
|
|
34
|
+
This package ships the Python harness, the scenario suite, the agents, and the `ftlbench`
|
|
35
|
+
command line. Driving the real game additionally needs FTL installed (via Steam) plus the bench
|
|
36
|
+
Hyperspace mod.
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install ftl-bench
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Use
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
ftlbench run --agent scripted # run the scenario suite with the scripted baseline
|
|
48
|
+
ftlbench run --agent random --tier public # the legal-move floor on the public tier
|
|
49
|
+
ftlbench run --agent llm --backend anthropic --model claude-sonnet-4-6 # a model plays the suite
|
|
50
|
+
ftlbench play obs # print the live observation the agent sees
|
|
51
|
+
ftlbench install-mod --url <release-asset> # install the prebuilt bench Hyperspace mod into FTL
|
|
52
|
+
ftlbench version
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
`ftlbench run --help` and `ftlbench play` show the full options. Results and a reproducibility
|
|
56
|
+
manifest are written under `runs/benchmark/`.
|
|
57
|
+
|
|
58
|
+
## Platforms
|
|
59
|
+
|
|
60
|
+
The harness runs on native Windows, WSL, or macOS and launches FTL for you (via Steam on
|
|
61
|
+
Windows). It reads/writes the FTL user folder, resolved per OS or overridden with `FTL_SAVE_DIR`.
|
|
62
|
+
|
|
63
|
+
## More
|
|
64
|
+
|
|
65
|
+
Full design, architecture, and the in-game bridge live in the project repository:
|
|
66
|
+
<https://github.com/ogabrielluiz/ftl_bench>.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# ftl-bench
|
|
2
|
+
|
|
3
|
+
An agent-evaluation benchmark that has an LLM agent play **FTL: Faster Than Light** through a
|
|
4
|
+
turn-based, intent-level interface built on the FTL-Hyperspace Lua API. The agent reads a
|
|
5
|
+
decision-complete JSON observation and replies with one command; the harness scores how far it
|
|
6
|
+
gets on a suite of reproducible, seed-pinned scenarios.
|
|
7
|
+
|
|
8
|
+
This package ships the Python harness, the scenario suite, the agents, and the `ftlbench`
|
|
9
|
+
command line. Driving the real game additionally needs FTL installed (via Steam) plus the bench
|
|
10
|
+
Hyperspace mod.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install ftl-bench
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Use
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
ftlbench run --agent scripted # run the scenario suite with the scripted baseline
|
|
22
|
+
ftlbench run --agent random --tier public # the legal-move floor on the public tier
|
|
23
|
+
ftlbench run --agent llm --backend anthropic --model claude-sonnet-4-6 # a model plays the suite
|
|
24
|
+
ftlbench play obs # print the live observation the agent sees
|
|
25
|
+
ftlbench install-mod --url <release-asset> # install the prebuilt bench Hyperspace mod into FTL
|
|
26
|
+
ftlbench version
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`ftlbench run --help` and `ftlbench play` show the full options. Results and a reproducibility
|
|
30
|
+
manifest are written under `runs/benchmark/`.
|
|
31
|
+
|
|
32
|
+
## Platforms
|
|
33
|
+
|
|
34
|
+
The harness runs on native Windows, WSL, or macOS and launches FTL for you (via Steam on
|
|
35
|
+
Windows). It reads/writes the FTL user folder, resolved per OS or overridden with `FTL_SAVE_DIR`.
|
|
36
|
+
|
|
37
|
+
## More
|
|
38
|
+
|
|
39
|
+
Full design, architecture, and the in-game bridge live in the project repository:
|
|
40
|
+
<https://github.com/ogabrielluiz/ftl_bench>.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Hatchling build hook: vendor the repo's sibling data dirs INTO the package tree so that the
|
|
2
|
+
wheel AND the sdist are both self-contained.
|
|
3
|
+
|
|
4
|
+
The runnable benchmark needs files that live in repo top-level dirs (adapter/, scenarios/,
|
|
5
|
+
prompts/, mod/, scripts/) and the README, all OUTSIDE this package root (harness/). A wheel-only
|
|
6
|
+
`force-include "../adapter"` can reach them when building from the repo, but the sdist cannot
|
|
7
|
+
(paths above the project root are not shipped, and the sdist->wheel rebuild then fails trying to
|
|
8
|
+
reach a `../adapter` sibling that no longer exists). So instead we copy them under
|
|
9
|
+
`src/ftl_bench/_bundled/` at build time and ship that as package data (declared via `artifacts`),
|
|
10
|
+
which both targets include cleanly without reaching above the project root.
|
|
11
|
+
|
|
12
|
+
Idempotent: when building from an unpacked sdist the siblings are absent but `_bundled/` was
|
|
13
|
+
already shipped, so we keep it as-is.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import shutil
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
|
|
21
|
+
|
|
22
|
+
# Repo top-level entries to vendor under src/ftl_bench/_bundled/. Names are preserved so each
# module's own `REPO = __file__.parent.parent` path logic keeps resolving them.
_VENDOR = ["adapter", "scenarios", "prompts", "mod", "scripts", "README.md", "LICENSE"]
# Names/globs that are skipped while copying a vendored directory tree.
_IGNORE = shutil.ignore_patterns(
    "__pycache__", "*.pyc", "runs", "dist", "build", ".venv", "node_modules", "*.log"
)


class CustomBuildHook(BuildHookInterface):
    """Build hook that vendors the repo's sibling data dirs into ``src/ftl_bench/_bundled/``."""

    PLUGIN_NAME = "custom"

    def initialize(self, version, build_data):
        """Refresh or validate the vendored ``_bundled/`` tree before the build runs.

        ``version`` and ``build_data`` are part of the hook signature and are not used here.

        Raises:
            RuntimeError: if neither a repo checkout (sibling ``adapter/run_benchmark.py``) nor
                a pre-vendored ``_bundled/adapter/run_benchmark.py`` is present.
        """
        root = Path(self.root)  # project root = harness/
        repo = root.parent  # repo root: the siblings live here
        bundled = root / "src" / "ftl_bench" / "_bundled"
        # The same marker file identifies a usable adapter/ tree in both locations. Checking the
        # file (not just an `adapter/` directory) keeps an unrelated sibling directory next to
        # an unpacked sdist from triggering a wipe of the `_bundled/` copy that was shipped.
        marker = Path("adapter") / "run_benchmark.py"

        if (repo / marker).is_file():
            # building from the repo working tree: refresh a clean vendored copy
            if bundled.exists():
                shutil.rmtree(bundled)
            bundled.mkdir(parents=True)
            for name in _VENDOR:
                src, dst = repo / name, bundled / name
                if src.is_dir():
                    shutil.copytree(src, dst, ignore=_IGNORE)
                elif src.is_file():
                    shutil.copy2(src, dst)
                # entries that are neither a directory nor a file are skipped
        elif not (bundled / marker).exists():
            # not in the repo and nothing was pre-vendored: cannot produce a runnable package
            raise RuntimeError(
                "ftl_bench build: neither the repo data dirs nor a pre-vendored "
                "src/ftl_bench/_bundled/ are present — build from the repo or a complete sdist."
            )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ftl-bench"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A turn-based, intent-level environment + scenario suite for evaluating LLM agents on FTL: Faster Than Light"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
authors = [{ name = "ogabrielluiz" }]
|
|
9
|
+
keywords = ["benchmark", "agents", "llm", "evaluation", "ftl", "game-playing"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Programming Language :: Python :: 3.13",
|
|
17
|
+
"Operating System :: Microsoft :: Windows",
|
|
18
|
+
"Operating System :: MacOS",
|
|
19
|
+
"Operating System :: POSIX :: Linux",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = []
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/ogabrielluiz/ftl_bench"
|
|
26
|
+
Repository = "https://github.com/ogabrielluiz/ftl_bench"
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
ftlbench = "ftl_bench.cli:main"
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = ["pytest>=8.0"]
|
|
33
|
+
# Occlusion-proof FTL window capture for the `screenshot` action (macOS). Without it the
|
|
34
|
+
# capture falls back to an AppleScript-bounds region grab (may catch an overlapping window).
|
|
35
|
+
screenshot = ["pyobjc-framework-Quartz>=10.0; sys_platform=='darwin'"]
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["hatchling"]
|
|
39
|
+
build-backend = "hatchling.build"
|
|
40
|
+
|
|
41
|
+
# A build hook vendors the repo's sibling data dirs (adapter/, scenarios/, prompts/, mod/,
|
|
42
|
+
# scripts/, README, LICENSE) into src/ftl_bench/_bundled/ so BOTH the wheel and the sdist are
|
|
43
|
+
# self-contained. See harness/hatch_build.py for why force-include alone cannot do this.
|
|
44
|
+
[tool.hatch.build.hooks.custom]
|
|
45
|
+
path = "hatch_build.py"
|
|
46
|
+
|
|
47
|
+
# Include the build-vendored data even though it is .gitignored.
|
|
48
|
+
[tool.hatch.build]
|
|
49
|
+
artifacts = ["src/ftl_bench/_bundled/**"]
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.wheel]
|
|
52
|
+
packages = ["src/ftl_bench"]
|
|
53
|
+
exclude = ["**/__pycache__", "**/*.pyc", "**/runs/**"]
|
|
54
|
+
|
|
55
|
+
[tool.hatch.build.targets.sdist]
|
|
56
|
+
include = ["/src", "/tests", "/hatch_build.py", "/README.md", "/pyproject.toml"]
|
|
57
|
+
|
|
58
|
+
[tool.pytest.ini_options]
|
|
59
|
+
testpaths = ["tests"]
|
|
60
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from ftl_bench.observation import (
|
|
2
|
+
Observation,
|
|
3
|
+
ObservationClient,
|
|
4
|
+
ObservationValidationError,
|
|
5
|
+
)
|
|
6
|
+
from ftl_bench.scenario import Scenario, SubObjective, load_suite
|
|
7
|
+
from ftl_bench.scoring import (
|
|
8
|
+
achieved_metrics,
|
|
9
|
+
score_instance,
|
|
10
|
+
score_observation,
|
|
11
|
+
score_trajectory,
|
|
12
|
+
)
|
|
13
|
+
from ftl_bench.session import (
|
|
14
|
+
AgentSession,
|
|
15
|
+
choose_event,
|
|
16
|
+
fire_weapon,
|
|
17
|
+
jump,
|
|
18
|
+
move_crew,
|
|
19
|
+
set_system_power,
|
|
20
|
+
start_game,
|
|
21
|
+
store_buy,
|
|
22
|
+
store_sell,
|
|
23
|
+
upgrade_system,
|
|
24
|
+
cloak,
|
|
25
|
+
set_doors,
|
|
26
|
+
mind_control,
|
|
27
|
+
battery,
|
|
28
|
+
fire_beam,
|
|
29
|
+
hack_system,
|
|
30
|
+
deploy_drone,
|
|
31
|
+
recall_drones,
|
|
32
|
+
teleport_crew,
|
|
33
|
+
)
|
|
34
|
+
from ftl_bench.trajectory import TrajectoryRecorder, load_trajectory
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"Observation",
|
|
38
|
+
"ObservationClient",
|
|
39
|
+
"ObservationValidationError",
|
|
40
|
+
"AgentSession",
|
|
41
|
+
"set_system_power",
|
|
42
|
+
"move_crew",
|
|
43
|
+
"jump",
|
|
44
|
+
"choose_event",
|
|
45
|
+
"fire_weapon",
|
|
46
|
+
"start_game",
|
|
47
|
+
"store_buy",
|
|
48
|
+
"store_sell",
|
|
49
|
+
"upgrade_system",
|
|
50
|
+
"cloak",
|
|
51
|
+
"set_doors",
|
|
52
|
+
"mind_control",
|
|
53
|
+
"battery",
|
|
54
|
+
"fire_beam",
|
|
55
|
+
"hack_system",
|
|
56
|
+
"deploy_drone",
|
|
57
|
+
"recall_drones",
|
|
58
|
+
"teleport_crew",
|
|
59
|
+
"TrajectoryRecorder",
|
|
60
|
+
"load_trajectory",
|
|
61
|
+
"score_observation",
|
|
62
|
+
"score_trajectory",
|
|
63
|
+
"Scenario",
|
|
64
|
+
"SubObjective",
|
|
65
|
+
"load_suite",
|
|
66
|
+
"score_instance",
|
|
67
|
+
"achieved_metrics",
|
|
68
|
+
]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ftl_bench Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
License notes (informational; not part of the MIT grant above)
|
|
26
|
+
|
|
27
|
+
This license covers the ftl_bench source code in this repository only. It does
|
|
28
|
+
not grant rights to, and explicitly excludes:
|
|
29
|
+
|
|
30
|
+
- FTL: Faster Than Light, a commercial game by Subset Games, including any of
|
|
31
|
+
its assets or data. You must own and install the game separately.
|
|
32
|
+
- FTL-Hyperspace, the third-party modding framework this project builds on,
|
|
33
|
+
which is distributed under its own separate license and terms.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# ftl_bench
|
|
2
|
+
|
|
3
|
+
An agent-evaluation benchmark that lets LLM coding agents **play FTL: Faster Than Light** through a clean, intent-level interface built on the [FTL-Hyperspace](https://github.com/FTL-Hyperspace/FTL-Hyperspace) Lua API.
|
|
4
|
+
|
|
5
|
+
FTL is a real-time-with-pause roguelike: resource management, risk under uncertainty, combat micro, and long-horizon planning across a branching map. That makes it a rich substrate for measuring agent decision-making. `ftl_bench` wraps it into a reproducible, turn-based environment with structured observations, an intent-level action space, seed-pinned runs, and full trajectory logging.
|
|
6
|
+
|
|
7
|
+
## The benchmark: a scenario suite scored by goal achievement
|
|
8
|
+
|
|
9
|
+
`ftl_bench` evaluates agents on a suite of reproducible **scenario instances**, not on raw play.
|
|
10
|
+
|
|
11
|
+
- **Instance** = a fully-specified, seeded scenario `(seed, ship, difficulty, goal)`. The seed pins the map + events; the goal is a set of weighted sub-objectives.
|
|
12
|
+
- **The agent decides everything in-game** (fight, flee, target, power, repair, navigate). The harness scores **only goal achievement** — no decision policy is baked into the env or scoring.
|
|
13
|
+
- **Goal-conditioned partial credit**: each instance earns `r ∈ [0,1]` = the weighted intersection of achieved vs. requested sub-objectives, × a legitimacy gate that collapses metric-gaming (e.g. jumping in place). `Score = 100·r`.
|
|
14
|
+
- **Headline metric: FTL score** = the mean of FTL's own native run score (scrap, kills, sectors, flagship, times difficulty) over the suite (± seed SE), alongside a strict **Solve / Win Rate** and an **efficiency** axis (jumps/turns per instance).
|
|
15
|
+
- **Anti-memorization split**: a `public` tier to tune against and a held-out `semi_private` tier that is the leaderboard number.
|
|
16
|
+
- **Baseline ladder**: a `random`-legal floor and a `scripted` heuristic floor, so a high agent score is interpretable.
|
|
17
|
+
|
|
18
|
+
**Scenario types:** `survive_n_jumps` (make N jumps alive), `reach_sector` (advance to sector K), `reach_sector_healthy` (reach K with hull + crew intact — a multi-attribute goal), `full_run` (milestone progress toward beating the flagship — the unsaturated ceiling). Higher-signal micro-encounters (`win_this_combat`, `escape_a_crisis`, `event_risk_choice`) and the flagship/store tiers are next.
|
|
19
|
+
|
|
20
|
+
**Run it** (the harness runs on native Windows, WSL, or macOS, and drives FTL for you):
|
|
21
|
+
```bash
|
|
22
|
+
# One-time setup, per platform:
|
|
23
|
+
# Windows: install FTL via Steam + the bench Hyperspace mod (scripts/setup_pc.sh). The runner
|
|
24
|
+
# launches and restarts FTL through Steam itself, so no env vars are needed.
|
|
25
|
+
# macOS: defaults write com.example.FTL NSAppSleepDisabled -bool YES # keep it ticking unfocused
|
|
26
|
+
# scripts/restart_ftl.sh none # launch FTL to the menu
|
|
27
|
+
|
|
28
|
+
cd harness && uv run python ../adapter/run_benchmark.py --agent scripted # scripted floor
|
|
29
|
+
cd harness && uv run python ../adapter/run_benchmark.py --agent random # random floor
|
|
30
|
+
cd harness && uv run python ../adapter/run_benchmark.py --agent scripted --tier semi_private # held-out leaderboard number
|
|
31
|
+
# A real frontier model plays the suite (the LLM track), two backends:
|
|
32
|
+
cd harness && uv run python ../adapter/run_benchmark.py --agent llm --backend anthropic --model claude-sonnet-4-6 # needs ANTHROPIC_API_KEY
|
|
33
|
+
cd harness && uv run python ../adapter/run_benchmark.py --agent llm --backend claude-cli --model claude-opus-4-8 # no key: local `claude -p`
|
|
34
|
+
```
|
|
35
|
+
The **LLM track** (`adapter/llm_agent.py`) drives the model over the same intent-level surface the baselines use: each turn it gets the decision-complete observation + the scenario goal + a short action history and replies with one command, dispatched through the shared `apply_command()` in `play_cli.py`. It decides everything — no scripted policy. `--backend anthropic` is the canonical, portable track (Anthropic Messages API); `--backend claude-cli` shells out to a local `claude -p` so you can run it with no API key. The agent's rules/instructions are a **version-controlled operating manual** at `prompts/ftl_agent_<v>.md` (select with `--prompt-version`); the version is recorded in each run's manifest and agent label, so a manual change is a distinct, comparable agent — not a silent drift.
|
|
36
|
+
Output: per-instance `ftl_score` + breakdown, then the aggregate `FTL score ± SE | Solve N/M` with per-type/tier breakdown. Each instance's trajectory + a reproducibility manifest (seed, ship, schema, runner/agent version) is saved under `runs/benchmark/`.
|
|
37
|
+
|
|
38
|
+
**Native baseline (scripted heuristic floor, 12-instance v1 suite, native Windows + Steam, no WSL).** The headline metric is FTL's own native run score (mean over the suite, ± seed SE):
|
|
39
|
+
|
|
40
|
+
| Agent | FTL score | Solve | survive_n_jumps | reach_sector | reach_sector_healthy | full_run |
|
|
41
|
+
|---|---|---|---|---|---|---|
|
|
42
|
+
| **scripted** (heuristic floor) | **143.75 ± 13.05** | 3/12 | 133.3 | 124.0 | 195.0 | 157.5 |
|
|
43
|
+
|
|
44
|
+
Median 11 jumps per instance; public tier 142.9, held-out `semi_private` tier 145.0. The full suite runs end to end on native Windows with no crashes. A native `random` floor and a frontier-LLM row (`--agent llm`, above, scored identically over the same observe/act surface) are the next rows to fill. Earlier macOS/Rosetta numbers used a goal-conditioned 0-100 score (scripted 70.2, random 5.2) and are not comparable to FTL's native score here.
|
|
45
|
+
|
|
46
|
+
## Why Hyperspace
|
|
47
|
+
|
|
48
|
+
Hyperspace is an open-source C++ "exe mod" that exposes FTL's engine to **Lua via SWIG bindings**. It already lets scripts *read* full game state (`Hyperspace.ships.player`/`.enemy`, crew, systems, weapons, map) and *drive* much of the simulation (move crew, allocate power, teleport, toggle cloak). It also supports **seeded runs** with the seed readable from Lua, the basis for reproducibility. Where capabilities aren't yet bound — the harness **transport** (the Lua sandbox disables `io`/sockets), JSON serialization, and a few UI-driven actions (weapon room-targeting, event-choice confirm, jump trigger, store) — we extend Hyperspace itself with new SWIG bindings rather than resorting to brittle screen/input automation. The source-grounded map of what's exposed vs. what we build is in [`docs/deepdive/hyperspace-lua-surface.md`](docs/deepdive/hyperspace-lua-surface.md).
|
|
49
|
+
|
|
50
|
+
## Architecture (four layers)
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
54
|
+
│ Coding agent (LLM) │
|
|
55
|
+
│ observes JSON, returns intent-level actions │
|
|
56
|
+
└───────────────▲──────────────────────────┬──────────────────┘
|
|
57
|
+
│ tools (MCP / func-calling)│
|
|
58
|
+
┌───────────────┴──────────────────────────▼──────────────────┐
|
|
59
|
+
│ adapter/ — exposes env as agent tools │
|
|
60
|
+
├──────────────────────────────────────────────────────────────┤
|
|
61
|
+
│ harness/ — gym-like env server (reset/observe/step), │
|
|
62
|
+
│ episode + seed + scoring + trajectory logging │
|
|
63
|
+
└───────────────▲──────────────────────────┬──────────────────┘
|
|
64
|
+
│ transport (file / socket) │
|
|
65
|
+
┌───────────────┴──────────────────────────▼──────────────────┐
|
|
66
|
+
│ mod/ftl_bench_bridge — Hyperspace Lua mod inside FTL: │
|
|
67
|
+
│ • per-frame hook gates the sim (event-driven pause) │
|
|
68
|
+
│ • serializes Observation JSON │
|
|
69
|
+
│ • applies Action commands via the Lua API │
|
|
70
|
+
│ (+ extended Hyperspace C++/SWIG bindings for action gaps) │
|
|
71
|
+
└──────────────────────────────────────────────────────────────┘
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
| Dir | Purpose |
|
|
75
|
+
|-----|---------|
|
|
76
|
+
| `mod/ftl_bench_bridge/` | In-game Hyperspace Lua mod: state serialization, action application, sim gating |
|
|
77
|
+
| `harness/` | External environment server (Python): `reset()/observe()/step()`, episodes, seeds, scoring, logging |
|
|
78
|
+
| `adapter/` | Exposes the env to a coding agent as MCP / function-calling tools |
|
|
79
|
+
| `scenarios/` | Benchmark scenario definitions + pinned seeds (full runs and cheap micro-encounters) |
|
|
80
|
+
| `docs/deepdive/` | Source-grounded analysis of the Hyperspace Lua surface |
|
|
81
|
+
|
|
82
|
+
## Core idea: making a real-time game turn-based
|
|
83
|
+
|
|
84
|
+
The harness keeps the game **paused by default** and unpauses in controlled increments. The default **event-driven gating** mode runs the sim until the next significant decision point (enemy weapon about to fire, system damaged, projectile incoming, event/store/jump screen) then re-pauses and requests an action — mirroring how a skilled human micro-pauses. A simpler **fixed-tick** mode is available for cheaper runs.
|
|
85
|
+
|
|
86
|
+
## Platform notes
|
|
87
|
+
|
|
88
|
+
On **native Windows**, FTL must be launched through Steam (`steam.exe -applaunch 212680`), which the runner does for you; a direct executable launch skips the Hyperspace injection and the bridge never loads. Windows Defender can briefly lock the observation/action files, so the harness retries those file operations. On **macOS**, keep FTL from being App-Napped so it keeps ticking when unfocused: `defaults write com.example.FTL NSAppSleepDisabled -bool YES`.
|
|
89
|
+
|
|
90
|
+
## Documentation
|
|
91
|
+
|
|
92
|
+
A documentation site covering the scoring model, the action set and observation schema, the per-platform install guides, and the architecture is in [`site/`](site/) (Astro Starlight). The Hyperspace Lua state/action surface is mapped in [`docs/deepdive/hyperspace-lua-surface.md`](docs/deepdive/hyperspace-lua-surface.md).
|
|
93
|
+
|
|
94
|
+
## Known gaps
|
|
95
|
+
|
|
96
|
+
- **Beam weapons** are not yet in the action set; they need two-point, room-to-room targeting.
|
|
97
|
+
- The higher-signal micro-encounter scenarios and the flagship tier are planned but not yet in the suite.
|
|
98
|
+
|
|
99
|
+
## Related
|
|
100
|
+
|
|
101
|
+
- [FTL-Hyperspace](https://github.com/FTL-Hyperspace/FTL-Hyperspace) — the modding API this is built on
|
|
102
|
+
- [FTLAV](https://github.com/Niels-NTG/FTLAV) — savefile parser (basis for the state fallback)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# adapter
|
|
2
|
+
|
|
3
|
+
Exposes the `ftl_bench` environment to an LLM agent as **MCP tools**, so a
|
|
4
|
+
tool-capable model can play FTL directly.
|
|
5
|
+
|
|
6
|
+
`ftl_mcp_server.py` wraps `AgentSession` and serves these tools (each returns a
|
|
7
|
+
compact, agent-readable summary of the resulting state):
|
|
8
|
+
|
|
9
|
+
| Tool | What it does |
|
|
10
|
+
|------|--------------|
|
|
11
|
+
| `observe()` | Current state: context (menu/in_space/combat/event), your ship (hull, reactor, systems+power, crew, weapons+charge), enemy (hull, shields, rooms), jump beacons, event choices |
|
|
12
|
+
| `reset(mode)` | Start a run — `'new'` (fresh seeded) or `'continue'` (resume the save) |
|
|
13
|
+
| `do_jump(beacon_index)` | FTL-jump to a connected beacon |
|
|
14
|
+
| `pick_choice(choice_index)` | Choose an event option |
|
|
15
|
+
| `power_system(system_id, level)` | Set a system's power (0=shields 1=engines 3=weapons …) |
|
|
16
|
+
| `send_crew(crew_id, room_id)` | Move a crew member to one of your rooms |
|
|
17
|
+
| `shoot(weapon_slot, enemy_room_id)` | Aim+fire a weapon at an enemy room (auto-fires as it charges) |
|
|
18
|
+
| `advance(frames)` | Let game time pass (charge weapons, finish a jump/combat) then re-pause |
|
|
19
|
+
| `run_strategy(code)` | **Code mode**: run agent-authored Python against the env (loops, whole combats) in one call — fewer round-trips than per-action tool calls |
|
|
20
|
+
|
|
21
|
+
## Prerequisites
|
|
22
|
+
|
|
23
|
+
The game must be running with the bridge live and not napping in the background:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
defaults write com.example.FTL NSAppSleepDisabled -bool YES # one-time: keep ticking unfocused
|
|
27
|
+
scripts/restart_ftl.sh continue # launch + start a run
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Run the server
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd harness && uv run --with "mcp[cli]" python ../adapter/ftl_mcp_server.py
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
To register with Claude Code (`.mcp.json` / `claude mcp add`), use the command above as the
|
|
37
|
+
server's launch command. The server drives a single live FTL instance.
|
|
38
|
+
|
|
39
|
+
## Baseline agent
|
|
40
|
+
|
|
41
|
+
`baseline_agent.py` is a scripted heuristic agent (no LLM) that plays a few jumps —
|
|
42
|
+
powers shields/weapons, resolves events, and fights — useful as a smoke test and a
|
|
43
|
+
scoring baseline:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
cd harness && uv run python ../adapter/baseline_agent.py --jumps 5
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Eval harness
|
|
50
|
+
|
|
51
|
+
`eval.py` runs N seeded episodes (fresh `reset_episode` between each — no FTL restart),
|
|
52
|
+
records a trajectory per episode, and aggregates scores (survival rate, mean kills/hull/…):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
cd harness && uv run python ../adapter/eval.py --seeds 1,2,3 --jumps 6
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Two ways an agent plays
|
|
59
|
+
|
|
60
|
+
- **Per-tool MCP** — the model calls `observe`/`do_jump`/`shoot`/… step-by-step (good for
|
|
61
|
+
introspection). No built-in Claude Code "code mode" toggle exists for MCP; tools are normal calls.
|
|
62
|
+
- **Code mode** — the model writes Python against this env and runs it. Either via `run_strategy`
|
|
63
|
+
(an MCP tool that execs agent code with `session` + action builders in scope) or, in a
|
|
64
|
+
code-execution agent (Claude Code), by writing+running a script that imports `ftl_bench` directly.
|
|
65
|
+
Recommended for FTL: one script can drive a whole combat without flooding context with
|
|
66
|
+
intermediate observations.
|
|
67
|
+
|