sib-agent 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sib_agent-0.1.0/PKG-INFO +66 -0
- sib_agent-0.1.0/README.md +47 -0
- sib_agent-0.1.0/pyproject.toml +31 -0
- sib_agent-0.1.0/setup.cfg +4 -0
- sib_agent-0.1.0/sib/__init__.py +8 -0
- sib_agent-0.1.0/sib/generations.py +101 -0
- sib_agent-0.1.0/sib/metrics.py +77 -0
- sib_agent-0.1.0/sib_agent.egg-info/PKG-INFO +66 -0
- sib_agent-0.1.0/sib_agent.egg-info/SOURCES.txt +9 -0
- sib_agent-0.1.0/sib_agent.egg-info/dependency_links.txt +1 -0
- sib_agent-0.1.0/sib_agent.egg-info/top_level.txt +1 -0
sib_agent-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sib-agent
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SIB: Self-Improving Benchmark agent framework
|
|
5
|
+
Author-email: Hexo <sib@hexo.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/hexo/sib
|
|
8
|
+
Project-URL: Repository, https://github.com/hexo/sib
|
|
9
|
+
Keywords: ai,benchmarks,self-improving,agents
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# SIB (Self-Improving Benchmark)
|
|
21
|
+
|
|
22
|
+
A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install sib-agent
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from sib import Generation, GenerationLog, ScoreTracker
|
|
34
|
+
|
|
35
|
+
# Track generations in a self-improvement loop
|
|
36
|
+
log = GenerationLog()
|
|
37
|
+
|
|
38
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
|
|
39
|
+
# ... run your agent ...
|
|
40
|
+
gen.finish(result={"accuracy": 0.82})
|
|
41
|
+
|
|
42
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
|
|
43
|
+
# ... run improved agent ...
|
|
44
|
+
gen.finish(result={"accuracy": 0.91})
|
|
45
|
+
|
|
46
|
+
# Find the best generation
|
|
47
|
+
best = log.best("accuracy")
|
|
48
|
+
print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
|
|
49
|
+
|
|
50
|
+
# Save and reload logs
|
|
51
|
+
log.save("run_log.json")
|
|
52
|
+
log = GenerationLog.load("run_log.json")
|
|
53
|
+
|
|
54
|
+
# Track scores across generations
|
|
55
|
+
tracker = ScoreTracker()
|
|
56
|
+
tracker.record("accuracy", 0.82)
|
|
57
|
+
tracker.record("accuracy", 0.91)
|
|
58
|
+
|
|
59
|
+
summary = tracker.summarise("accuracy")
|
|
60
|
+
print(summary.improvement)  # ~0.09 (subject to floating-point rounding)
|
|
61
|
+
print(summary.is_improving) # True
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
MIT
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# SIB (Self-Improving Benchmark)
|
|
2
|
+
|
|
3
|
+
A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install sib-agent
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from sib import Generation, GenerationLog, ScoreTracker
|
|
15
|
+
|
|
16
|
+
# Track generations in a self-improvement loop
|
|
17
|
+
log = GenerationLog()
|
|
18
|
+
|
|
19
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
|
|
20
|
+
# ... run your agent ...
|
|
21
|
+
gen.finish(result={"accuracy": 0.82})
|
|
22
|
+
|
|
23
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
|
|
24
|
+
# ... run improved agent ...
|
|
25
|
+
gen.finish(result={"accuracy": 0.91})
|
|
26
|
+
|
|
27
|
+
# Find the best generation
|
|
28
|
+
best = log.best("accuracy")
|
|
29
|
+
print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
|
|
30
|
+
|
|
31
|
+
# Save and reload logs
|
|
32
|
+
log.save("run_log.json")
|
|
33
|
+
log = GenerationLog.load("run_log.json")
|
|
34
|
+
|
|
35
|
+
# Track scores across generations
|
|
36
|
+
tracker = ScoreTracker()
|
|
37
|
+
tracker.record("accuracy", 0.82)
|
|
38
|
+
tracker.record("accuracy", 0.91)
|
|
39
|
+
|
|
40
|
+
summary = tracker.summarise("accuracy")
|
|
41
|
+
print(summary.improvement)  # ~0.09 (subject to floating-point rounding)
|
|
42
|
+
print(summary.is_improving) # True
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## License
|
|
46
|
+
|
|
47
|
+
MIT
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sib-agent"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "SIB: Self-Improving Benchmark agent framework"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Hexo", email = "sib@hexo.ai" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
]
|
|
24
|
+
keywords = ["ai", "benchmarks", "self-improving", "agents"]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/hexo/sib"
|
|
28
|
+
Repository = "https://github.com/hexo/sib"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
include = ["sib*"]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Track and manage iterative agent generations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Generation:
    """One iteration of a self-improvement loop.

    Holds the configuration the iteration ran with, the result it produced,
    and start/finish timestamps taken from ``time.time`` so that iterations
    can be compared with one another later.
    """

    generation_id: int
    config: dict[str, Any] = field(default_factory=dict)
    result: dict[str, Any] = field(default_factory=dict)
    started_at: float = field(default_factory=time.time)
    finished_at: float | None = None

    def finish(self, result: dict[str, Any] | None = None) -> None:
        """Stamp the finish time; replace ``result`` only when one is given."""
        if result is not None:
            self.result = result
        self.finished_at = time.time()

    @property
    def elapsed_seconds(self) -> float | None:
        """Seconds between start and finish, or ``None`` while unfinished."""
        end = self.finished_at
        return None if end is None else end - self.started_at

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view, including the derived ``elapsed_seconds``."""
        stored_fields = (
            "generation_id",
            "config",
            "result",
            "started_at",
            "finished_at",
        )
        snapshot: dict[str, Any] = {
            name: getattr(self, name) for name in stored_fields
        }
        # Derived value goes last so the key order matches the stored layout.
        snapshot["elapsed_seconds"] = self.elapsed_seconds
        return snapshot
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class GenerationLog:
    """Append-only log of :class:`Generation` objects, persistable to JSON.

    Generations are kept in insertion order and are reachable through
    ``len()``, indexing, and iteration.
    """

    def __init__(self) -> None:
        self._generations: list[Generation] = []

    def new_generation(self, config: dict[str, Any] | None = None) -> Generation:
        """Create, register, and return the next generation.

        The new generation's id is the number of generations already in the
        log. A missing or empty *config* is replaced by a fresh empty dict.
        """
        gen = Generation(
            generation_id=len(self._generations),
            config=config or {},
        )
        self._generations.append(gen)
        return gen

    def __len__(self) -> int:
        """Return the number of generations in the log."""
        return len(self._generations)

    def __getitem__(self, idx: int) -> Generation:
        """Return the generation stored at position *idx*."""
        return self._generations[idx]

    def __iter__(self):
        """Iterate over the generations in insertion order."""
        return iter(self._generations)

    def best(self, metric: str, higher_is_better: bool = True) -> Generation | None:
        """Return the generation with the best value for *metric* in its result.

        Generations whose ``result`` lacks *metric* are ignored. Returns
        ``None`` when no generation has the metric. "Best" means the largest
        value when *higher_is_better* is true, otherwise the smallest.
        """
        scored = [g for g in self._generations if metric in g.result]
        if not scored:
            return None
        pick = max if higher_is_better else min
        return pick(scored, key=lambda g: g.result[metric])

    def save(self, path: str | Path) -> None:
        """Write the full log to *path* as an indented JSON array."""
        Path(path).write_text(
            json.dumps([g.to_dict() for g in self._generations], indent=2),
            # Pin the encoding so the file does not depend on the platform locale.
            encoding="utf-8",
        )

    @classmethod
    def load(cls, path: str | Path) -> GenerationLog:
        """Load a log from a JSON file previously created by *save*.

        Each entry must carry ``generation_id``; ``config`` and ``result``
        default to empty dicts, ``started_at`` to ``0``, and ``finished_at``
        to ``None`` when absent.
        """
        log = cls()
        # Decode as UTF-8 (the JSON default) instead of the locale encoding.
        for entry in json.loads(Path(path).read_text(encoding="utf-8")):
            gen = Generation(
                generation_id=entry["generation_id"],
                config=entry.get("config", {}),
                result=entry.get("result", {}),
                started_at=entry.get("started_at", 0),
                finished_at=entry.get("finished_at"),
            )
            log._generations.append(gen)
        return log
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Lightweight score tracking across self-improvement generations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import statistics
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class ScoreSummary:
    """Frozen record of summary statistics for one metric's score history."""

    metric: str
    values: tuple[float, ...]
    mean: float
    median: float
    stdev: float | None
    best: float
    worst: float
    higher_is_better: bool

    @property
    def latest(self) -> float:
        """The last element of ``values``."""
        return self.values[-1]

    @property
    def improvement(self) -> float:
        """Signed difference: last value minus first value."""
        newest = self.values[-1]
        oldest = self.values[0]
        return newest - oldest

    @property
    def is_improving(self) -> bool:
        """Whether the last value beats the first, given ``higher_is_better``."""
        change = self.improvement
        if self.higher_is_better:
            return change > 0
        return change < 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ScoreTracker:
    """Accumulate scores per metric name and produce summary statistics."""

    def __init__(self, higher_is_better: bool = True) -> None:
        self._scores: dict[str, list[float]] = {}
        self.higher_is_better = higher_is_better

    def record(self, metric: str, value: float) -> None:
        """Add *value* to the end of *metric*'s history, creating it if new."""
        history = self._scores.get(metric)
        if history is None:
            history = self._scores[metric] = []
        history.append(value)

    @property
    def metrics(self) -> list[str]:
        """Names of all recorded metrics, in the order they were first recorded."""
        return [*self._scores]

    def values(self, metric: str) -> list[float]:
        """Return a copy of the scores recorded for *metric* (empty if unknown)."""
        return [*self._scores.get(metric, ())]

    def summarise(self, metric: str) -> ScoreSummary:
        """Build a :class:`ScoreSummary` for *metric*.

        ``stdev`` is ``None`` when only one score has been recorded.

        Raises ``KeyError`` if the metric has never been recorded.
        """
        history = self._scores[metric]
        # Which extreme counts as "best" depends on the tracker's direction.
        pick_best, pick_worst = (max, min) if self.higher_is_better else (min, max)
        return ScoreSummary(
            metric=metric,
            values=tuple(history),
            mean=statistics.mean(history),
            median=statistics.median(history),
            stdev=None if len(history) <= 1 else statistics.stdev(history),
            best=pick_best(history),
            worst=pick_worst(history),
            higher_is_better=self.higher_is_better,
        )

    def summarise_all(self) -> dict[str, ScoreSummary]:
        """Return a mapping from every recorded metric to its summary."""
        summaries = {}
        for name in self._scores:
            summaries[name] = self.summarise(name)
        return summaries
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sib-agent
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SIB: Self-Improving Benchmark agent framework
|
|
5
|
+
Author-email: Hexo <sib@hexo.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/hexo/sib
|
|
8
|
+
Project-URL: Repository, https://github.com/hexo/sib
|
|
9
|
+
Keywords: ai,benchmarks,self-improving,agents
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# SIB (Self-Improving Benchmark)
|
|
21
|
+
|
|
22
|
+
A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install sib-agent
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from sib import Generation, GenerationLog, ScoreTracker
|
|
34
|
+
|
|
35
|
+
# Track generations in a self-improvement loop
|
|
36
|
+
log = GenerationLog()
|
|
37
|
+
|
|
38
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
|
|
39
|
+
# ... run your agent ...
|
|
40
|
+
gen.finish(result={"accuracy": 0.82})
|
|
41
|
+
|
|
42
|
+
gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
|
|
43
|
+
# ... run improved agent ...
|
|
44
|
+
gen.finish(result={"accuracy": 0.91})
|
|
45
|
+
|
|
46
|
+
# Find the best generation
|
|
47
|
+
best = log.best("accuracy")
|
|
48
|
+
print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
|
|
49
|
+
|
|
50
|
+
# Save and reload logs
|
|
51
|
+
log.save("run_log.json")
|
|
52
|
+
log = GenerationLog.load("run_log.json")
|
|
53
|
+
|
|
54
|
+
# Track scores across generations
|
|
55
|
+
tracker = ScoreTracker()
|
|
56
|
+
tracker.record("accuracy", 0.82)
|
|
57
|
+
tracker.record("accuracy", 0.91)
|
|
58
|
+
|
|
59
|
+
summary = tracker.summarise("accuracy")
|
|
60
|
+
print(summary.improvement)  # ~0.09 (subject to floating-point rounding)
|
|
61
|
+
print(summary.is_improving) # True
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sib
|