sib-agent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: sib-agent
3
+ Version: 0.1.0
4
+ Summary: SIB: Self-Improving Benchmark agent framework
5
+ Author-email: Hexo <sib@hexo.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/hexo/sib
8
+ Project-URL: Repository, https://github.com/hexo/sib
9
+ Keywords: ai,benchmarks,self-improving,agents
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+
20
+ # SIB (Self-Improving Benchmark)
21
+
22
+ A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install sib-agent
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from sib import Generation, GenerationLog, ScoreTracker
34
+
35
+ # Track generations in a self-improvement loop
36
+ log = GenerationLog()
37
+
38
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
39
+ # ... run your agent ...
40
+ gen.finish(result={"accuracy": 0.82})
41
+
42
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
43
+ # ... run improved agent ...
44
+ gen.finish(result={"accuracy": 0.91})
45
+
46
+ # Find the best generation
47
+ best = log.best("accuracy")
48
+ print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
49
+
50
+ # Save and reload logs
51
+ log.save("run_log.json")
52
+ log = GenerationLog.load("run_log.json")
53
+
54
+ # Track scores across generations
55
+ tracker = ScoreTracker()
56
+ tracker.record("accuracy", 0.82)
57
+ tracker.record("accuracy", 0.91)
58
+
59
+ summary = tracker.summarise("accuracy")
60
+ print(summary.improvement)    # ~0.09 (exact digits subject to float rounding)
61
+ print(summary.is_improving) # True
62
+ ```
63
+
64
+ ## License
65
+
66
+ MIT
@@ -0,0 +1,47 @@
1
+ # SIB (Self-Improving Benchmark)
2
+
3
+ A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install sib-agent
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from sib import Generation, GenerationLog, ScoreTracker
15
+
16
+ # Track generations in a self-improvement loop
17
+ log = GenerationLog()
18
+
19
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
20
+ # ... run your agent ...
21
+ gen.finish(result={"accuracy": 0.82})
22
+
23
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
24
+ # ... run improved agent ...
25
+ gen.finish(result={"accuracy": 0.91})
26
+
27
+ # Find the best generation
28
+ best = log.best("accuracy")
29
+ print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
30
+
31
+ # Save and reload logs
32
+ log.save("run_log.json")
33
+ log = GenerationLog.load("run_log.json")
34
+
35
+ # Track scores across generations
36
+ tracker = ScoreTracker()
37
+ tracker.record("accuracy", 0.82)
38
+ tracker.record("accuracy", 0.91)
39
+
40
+ summary = tracker.summarise("accuracy")
41
+ print(summary.improvement)    # ~0.09 (exact digits subject to float rounding)
42
+ print(summary.is_improving) # True
43
+ ```
44
+
45
+ ## License
46
+
47
+ MIT
@@ -0,0 +1,31 @@
1
[build-system]
# The SPDX string form `license = "MIT"` used under [project] (PEP 639) is
# only understood by setuptools 77.0.3 and newer; older releases reject it
# during pyproject.toml validation, so the minimum must not be lower.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sib-agent"
7
+ version = "0.1.0"
8
+ description = "SIB: Self-Improving Benchmark agent framework"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Hexo", email = "sib@hexo.ai" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Science/Research",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ ]
24
+ keywords = ["ai", "benchmarks", "self-improving", "agents"]
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/hexo/sib"
28
+ Repository = "https://github.com/hexo/sib"
29
+
30
+ [tool.setuptools.packages.find]
31
+ include = ["sib*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ """SIB: Self-Improving Benchmark agent framework."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from sib.generations import Generation, GenerationLog
6
+ from sib.metrics import ScoreTracker
7
+
8
+ __all__ = ["Generation", "GenerationLog", "ScoreTracker"]
@@ -0,0 +1,101 @@
1
+ """Track and manage iterative agent generations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+
12
@dataclass
class Generation:
    """One iteration of a self-improvement loop.

    Holds the configuration the iteration ran with, the result it produced,
    and ``time.time()`` timestamps for its start and finish, so separate
    generations can be compared afterwards.
    """

    # Identifier assigned by whoever creates the generation.
    generation_id: int
    # Settings used for this iteration; a fresh dict per instance by default.
    config: dict[str, Any] = field(default_factory=dict)
    # Outcome of this iteration; a fresh dict per instance by default.
    result: dict[str, Any] = field(default_factory=dict)
    # Creation timestamp, taken from time.time() unless supplied explicitly.
    started_at: float = field(default_factory=time.time)
    # Completion timestamp; stays None until finish() is called.
    finished_at: float | None = None

    def finish(self, result: dict[str, Any] | None = None) -> None:
        """Stamp the finish time and, unless *result* is ``None``, store it."""
        self.finished_at = time.time()
        if result is None:
            return
        self.result = result

    @property
    def elapsed_seconds(self) -> float | None:
        """Seconds between start and finish, or ``None`` while unfinished."""
        end = self.finished_at
        return None if end is None else end - self.started_at

    def to_dict(self) -> dict[str, Any]:
        """Return every field plus ``elapsed_seconds`` as a plain dict.

        ``config`` and ``result`` are included by reference, not copied.
        """
        return dict(
            generation_id=self.generation_id,
            config=self.config,
            result=self.result,
            started_at=self.started_at,
            finished_at=self.finished_at,
            elapsed_seconds=self.elapsed_seconds,
        )
47
+
48
+
49
class GenerationLog:
    """Append-only log of generations, persistable to JSON."""

    def __init__(self) -> None:
        """Start with an empty log."""
        self._generations: list[Generation] = []

    def new_generation(self, config: dict[str, Any] | None = None) -> Generation:
        """Create, register, and return the next generation.

        The new generation's id is the current length of the log. When
        *config* is ``None`` a fresh empty dict is used; any dict the caller
        passes is stored by reference.
        """
        gen = Generation(
            generation_id=len(self._generations),
            # Test against None rather than truthiness so a caller-supplied
            # empty dict is kept (by reference) just like a non-empty one.
            config={} if config is None else config,
        )
        self._generations.append(gen)
        return gen

    def __len__(self) -> int:
        """Return the number of generations in the log."""
        return len(self._generations)

    def __getitem__(self, idx: int) -> Generation:
        """Return the generation stored at position *idx*."""
        return self._generations[idx]

    def __iter__(self):
        """Iterate over the generations in insertion order."""
        return iter(self._generations)

    def best(self, metric: str, higher_is_better: bool = True) -> Generation | None:
        """Return the generation with the best value for *metric* in its result.

        Generations whose ``result`` lacks *metric* are ignored. Returns
        ``None`` when no generation has the metric. With
        ``higher_is_better=False`` the smallest value wins. Ties go to the
        earliest generation, since ``max``/``min`` keep the first match.
        """
        scored = [g for g in self._generations if metric in g.result]
        if not scored:
            return None
        pick = max if higher_is_better else min
        return pick(scored, key=lambda g: g.result[metric])

    def save(self, path: str | Path) -> None:
        """Write the full log to *path* as a UTF-8 encoded JSON array."""
        payload = json.dumps([g.to_dict() for g in self._generations], indent=2)
        # Pin the encoding so the file does not depend on the platform's
        # locale default.
        Path(path).write_text(payload, encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> GenerationLog:
        """Load a log from a JSON file previously created by *save*.

        Each entry must have ``generation_id`` (``KeyError`` otherwise).
        Missing ``config``/``result`` default to ``{}``, ``started_at`` to
        ``0`` and ``finished_at`` to ``None``. A stored ``elapsed_seconds``
        is ignored because it is derived from the two timestamps.
        """
        log = cls()
        # Read as UTF-8 (the JSON interchange encoding) instead of the
        # locale default, so logs load identically on every platform.
        for entry in json.loads(Path(path).read_text(encoding="utf-8")):
            gen = Generation(
                generation_id=entry["generation_id"],
                config=entry.get("config", {}),
                result=entry.get("result", {}),
                started_at=entry.get("started_at", 0),
                finished_at=entry.get("finished_at"),
            )
            log._generations.append(gen)
        return log
@@ -0,0 +1,77 @@
1
+ """Lightweight score tracking across self-improvement generations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import statistics
6
+ from dataclasses import dataclass, field
7
+
8
+
9
@dataclass(frozen=True)
class ScoreSummary:
    """Read-only snapshot of descriptive statistics for one metric's scores."""

    # Name of the metric being summarised.
    metric: str
    # Recorded scores in the order they were recorded.
    values: tuple[float, ...]
    mean: float
    median: float
    # None when the producer could not compute a standard deviation.
    stdev: float | None
    best: float
    worst: float
    # Direction used to interpret "best" and "improving".
    higher_is_better: bool

    @property
    def latest(self) -> float:
        """Most recent score, i.e. the last element of ``values``."""
        return self.values[-1]

    @property
    def improvement(self) -> float:
        """Signed difference: last value minus first value."""
        last = self.values[-1]
        first = self.values[0]
        return last - first

    @property
    def is_improving(self) -> bool:
        """Whether the last value beats the first in the preferred direction."""
        change = self.improvement
        if self.higher_is_better:
            return change > 0
        return change < 0
36
+
37
+
38
class ScoreTracker:
    """Accumulate scores per metric name and produce summary statistics.

    A single ``higher_is_better`` flag, set at construction, is applied to
    every metric the tracker holds.
    """

    def __init__(self, higher_is_better: bool = True) -> None:
        """Create an empty tracker with the given score direction."""
        self._scores: dict[str, list[float]] = {}
        self.higher_is_better = higher_is_better

    def record(self, metric: str, value: float) -> None:
        """Add *value* to the end of the series kept for *metric*."""
        if metric not in self._scores:
            self._scores[metric] = []
        self._scores[metric].append(value)

    @property
    def metrics(self) -> list[str]:
        """Names of all recorded metrics, in first-recorded order."""
        return [*self._scores]

    def values(self, metric: str) -> list[float]:
        """Return a copy of the scores for *metric* (empty if unknown)."""
        return self._scores.get(metric, [])[:]

    def summarise(self, metric: str) -> ScoreSummary:
        """Build a :class:`ScoreSummary` from the scores recorded for *metric*.

        Raises ``KeyError`` if the metric has never been recorded. ``stdev``
        is ``None`` when only one score exists.
        """
        series = self._scores[metric]
        if self.higher_is_better:
            pick_best, pick_worst = max, min
        else:
            pick_best, pick_worst = min, max

        snapshot = tuple(series)
        average = statistics.mean(series)
        middle = statistics.median(series)
        # A sample standard deviation needs at least two data points.
        if len(series) > 1:
            spread = statistics.stdev(series)
        else:
            spread = None

        return ScoreSummary(
            metric=metric,
            values=snapshot,
            mean=average,
            median=middle,
            stdev=spread,
            best=pick_best(series),
            worst=pick_worst(series),
            higher_is_better=self.higher_is_better,
        )

    def summarise_all(self) -> dict[str, ScoreSummary]:
        """Return a summary for each recorded metric, keyed by metric name."""
        summaries: dict[str, ScoreSummary] = {}
        for name in self._scores:
            summaries[name] = self.summarise(name)
        return summaries
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: sib-agent
3
+ Version: 0.1.0
4
+ Summary: SIB: Self-Improving Benchmark agent framework
5
+ Author-email: Hexo <sib@hexo.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/hexo/sib
8
+ Project-URL: Repository, https://github.com/hexo/sib
9
+ Keywords: ai,benchmarks,self-improving,agents
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+
20
+ # SIB (Self-Improving Benchmark)
21
+
22
+ A framework for building self-improving AI agents that autonomously refine their performance on benchmark tasks.
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install sib-agent
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from sib import Generation, GenerationLog, ScoreTracker
34
+
35
+ # Track generations in a self-improvement loop
36
+ log = GenerationLog()
37
+
38
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.7})
39
+ # ... run your agent ...
40
+ gen.finish(result={"accuracy": 0.82})
41
+
42
+ gen = log.new_generation(config={"model": "claude-sonnet", "temperature": 0.5})
43
+ # ... run improved agent ...
44
+ gen.finish(result={"accuracy": 0.91})
45
+
46
+ # Find the best generation
47
+ best = log.best("accuracy")
48
+ print(best.generation_id, best.result) # 1 {'accuracy': 0.91}
49
+
50
+ # Save and reload logs
51
+ log.save("run_log.json")
52
+ log = GenerationLog.load("run_log.json")
53
+
54
+ # Track scores across generations
55
+ tracker = ScoreTracker()
56
+ tracker.record("accuracy", 0.82)
57
+ tracker.record("accuracy", 0.91)
58
+
59
+ summary = tracker.summarise("accuracy")
60
+ print(summary.improvement)    # ~0.09 (exact digits subject to float rounding)
61
+ print(summary.is_improving) # True
62
+ ```
63
+
64
+ ## License
65
+
66
+ MIT
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ sib/__init__.py
4
+ sib/generations.py
5
+ sib/metrics.py
6
+ sib_agent.egg-info/PKG-INFO
7
+ sib_agent.egg-info/SOURCES.txt
8
+ sib_agent.egg-info/dependency_links.txt
9
+ sib_agent.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ sib