loopgain 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {loopgain-0.3.0 → loopgain-0.4.0}/PKG-INFO +34 -3
- {loopgain-0.3.0 → loopgain-0.4.0}/README.md +33 -2
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/_version.py +1 -1
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/classifier.py +37 -3
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/core.py +16 -3
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/PKG-INFO +34 -3
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/SOURCES.txt +2 -1
- {loopgain-0.3.0 → loopgain-0.4.0}/pyproject.toml +1 -1
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_classifier_synthetic.py +14 -4
- loopgain-0.4.0/tests/test_termination_safety.py +115 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/LICENSE +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/__init__.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/__main__.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/cli.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/funnel.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/__init__.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/autogen.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/claude_agent_sdk.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/crewai.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/langchain.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/langgraph.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/integrations/openai_agents.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain/telemetry.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/dependency_links.txt +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/entry_points.txt +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/requires.txt +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/loopgain.egg-info/top_level.txt +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/setup.cfg +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_classifier_mock_validation.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_core.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_funnel.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_integrations.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_stress.py +0 -0
- {loopgain-0.3.0 → loopgain-0.4.0}/tests/test_telemetry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: loopgain
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Barkhausen stability monitor for AI agent loops. Real-time loop-gain (Aβ) monitoring with five named threshold bands, best-so-far rollback, and ETA prediction.
|
|
5
5
|
Author-email: Dave Fitzsimmons <hello@loopgain.ai>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -108,6 +108,28 @@ print(result.savings_vs_fixed_cap)
|
|
|
108
108
|
|
|
109
109
|
---
|
|
110
110
|
|
|
111
|
+
## Defining your error signal
|
|
112
|
+
|
|
113
|
+
The one thing you provide is the **error signal**: a single non-negative number, every iteration, that says how wrong the current output is. **Lower is better; zero means done.** LoopGain doesn't know what your loop does — it just watches that number's trajectory and decides whether to keep going, stop, or roll back.
|
|
114
|
+
|
|
115
|
+
Your loop already has some way of knowing the output isn't good yet (or it wouldn't keep revising). Turn that into a number:
|
|
116
|
+
|
|
117
|
+
| Loop | Error signal = |
|
|
118
|
+
| --- | --- |
|
|
119
|
+
| Agentic coding (write code → run tests) | number of **failing tests** (10 → 3 → 0) |
|
|
120
|
+
| JSON / structured extraction | number of **schema violations** |
|
|
121
|
+
| RAG with self-correction | number of **required facts still missing** |
|
|
122
|
+
| Self-refinement with an LLM judge | judge's **gap to target** (e.g. `10 − quality_score`) |
|
|
123
|
+
| Lint / format loop | **lint error count** |
|
|
124
|
+
|
|
125
|
+
The only rules: non-negative, and **smaller as the output gets better**. Returning the raw list of problems works directly — `observe()` uses its length as the magnitude (e.g. hand it the list of failing tests).
|
|
126
|
+
|
|
127
|
+
If your quality is fuzzy and has no natural "zero," run with `target_error=None`: LoopGain then stops when the number **stops improving**, wherever that plateau is, instead of waiting for an exact target.
|
|
128
|
+
|
|
129
|
+
Every stop/continue decision is made from this one number, so **LoopGain is only as good as the error signal you give it** — pick one that genuinely tracks output quality.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
111
133
|
## How it works
|
|
112
134
|
|
|
113
135
|
LoopGain measures empirical loop gain (`Aβ = E(n) / E(n-1)`) at every iteration and exposes it as a smoothed time series for visualization. The decision engine, however, classifies the **full error trajectory** using four features:
|
|
@@ -165,14 +187,23 @@ This transforms divergence detection from "abort with garbage" into "abort with
|
|
|
165
187
|
|
|
166
188
|
---
|
|
167
189
|
|
|
190
|
+
## What LoopGain does and doesn't guarantee
|
|
191
|
+
|
|
192
|
+
LoopGain saves money by stopping a loop once it stops improving — fewer iterations, fewer tokens. In our [public benchmark](https://github.com/loopgain-ai/loopgain-bench), that was a **93.5% median cut in API spend** vs `max_iterations=20`, with output quality preserved. Two honest limits:
|
|
193
|
+
|
|
194
|
+
- **Savings depend on your workload.** Loops that usually succeed fast save the most (~96%); adversarial, failure-prone loops save less (~84%). The headline is a blend — run the benchmark on your own loops before quoting a number.
|
|
195
|
+
- **LoopGain detects convergence, not correctness.** It stops when your error signal stops improving — which means more iterations won't help, *not* that the loop succeeded. On the benchmark this preserved quality (it rarely stopped early on a worse output; false-stop rate ≤3.5%), but a loop can stall with the error still above zero — a plateau at, say, 2 failing tests. So check `result.best_error` (or your own pass/fail) before you trust the output: if it plateaued short of your target, that's a quality gap LoopGain can't see, and a false stop that forces a rerun is the one way it eats into the savings. LoopGain decides *when to stop*; you decide *whether the answer is good enough*.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
168
199
|
## API reference
|
|
169
200
|
|
|
170
|
-
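### `LoopGain(target_error=0.0, max_iterations=None, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`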
|
|
201
|
+
### `LoopGain(target_error=0.0, max_iterations=50, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`
|
|
171
202
|
|
|
172
203
|
Construct the monitor.
|
|
173
204
|
|
|
174
205
|
- `target_error` — Stop when an observed error drops at or below this. Default `0.0` short-circuits on exactly zero error (the natural completion signal for verifier-driven loops). Pass `None` to disable the short-circuit entirely.
|
|
175
|
-
- `max_iterations` — Hard safety
|
|
206
|
+
- `max_iterations` — Hard safety backstop. Default `50` so the loop can never run unbounded; a stability verdict normally terminates it well before this. Pass `None` to opt into a fully unbounded loop (only safe if your loop is guaranteed to reach `target_error` or a stop-state), or a smaller integer to cap tighter.
|
|
176
207
|
- `thresholds` — Custom `ThresholdBands` for the legacy single-Aβ-band classifier. Ignored when `classifier='trajectory'`.
|
|
177
208
|
- `trajectory_thresholds` — Custom `TrajectoryThresholds` for the multi-feature classifier (the default). Override only with workload-specific evidence.
|
|
178
209
|
- `classifier` — `'trajectory'` (default, v0.2 multi-feature classifier) or `'legacy_bands'` (v0.1 single-Aβ-band classifier).
|
|
@@ -59,6 +59,28 @@ print(result.savings_vs_fixed_cap)
|
|
|
59
59
|
|
|
60
60
|
---
|
|
61
61
|
|
|
62
|
+
## Defining your error signal
|
|
63
|
+
|
|
64
|
+
The one thing you provide is the **error signal**: a single non-negative number, every iteration, that says how wrong the current output is. **Lower is better; zero means done.** LoopGain doesn't know what your loop does — it just watches that number's trajectory and decides whether to keep going, stop, or roll back.
|
|
65
|
+
|
|
66
|
+
Your loop already has some way of knowing the output isn't good yet (or it wouldn't keep revising). Turn that into a number:
|
|
67
|
+
|
|
68
|
+
| Loop | Error signal = |
|
|
69
|
+
| --- | --- |
|
|
70
|
+
| Agentic coding (write code → run tests) | number of **failing tests** (10 → 3 → 0) |
|
|
71
|
+
| JSON / structured extraction | number of **schema violations** |
|
|
72
|
+
| RAG with self-correction | number of **required facts still missing** |
|
|
73
|
+
| Self-refinement with an LLM judge | judge's **gap to target** (e.g. `10 − quality_score`) |
|
|
74
|
+
| Lint / format loop | **lint error count** |
|
|
75
|
+
|
|
76
|
+
The only rules: non-negative, and **smaller as the output gets better**. Returning the raw list of problems works directly — `observe()` uses its length as the magnitude (e.g. hand it the list of failing tests).
|
|
77
|
+
|
|
78
|
+
If your quality is fuzzy and has no natural "zero," run with `target_error=None`: LoopGain then stops when the number **stops improving**, wherever that plateau is, instead of waiting for an exact target.
|
|
79
|
+
|
|
80
|
+
Every stop/continue decision is made from this one number, so **LoopGain is only as good as the error signal you give it** — pick one that genuinely tracks output quality.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
62
84
|
## How it works
|
|
63
85
|
|
|
64
86
|
LoopGain measures empirical loop gain (`Aβ = E(n) / E(n-1)`) at every iteration and exposes it as a smoothed time series for visualization. The decision engine, however, classifies the **full error trajectory** using four features:
|
|
@@ -116,14 +138,23 @@ This transforms divergence detection from "abort with garbage" into "abort with
|
|
|
116
138
|
|
|
117
139
|
---
|
|
118
140
|
|
|
141
|
+
## What LoopGain does and doesn't guarantee
|
|
142
|
+
|
|
143
|
+
LoopGain saves money by stopping a loop once it stops improving — fewer iterations, fewer tokens. In our [public benchmark](https://github.com/loopgain-ai/loopgain-bench), that was a **93.5% median cut in API spend** vs `max_iterations=20`, with output quality preserved. Two honest limits:
|
|
144
|
+
|
|
145
|
+
- **Savings depend on your workload.** Loops that usually succeed fast save the most (~96%); adversarial, failure-prone loops save less (~84%). The headline is a blend — run the benchmark on your own loops before quoting a number.
|
|
146
|
+
- **LoopGain detects convergence, not correctness.** It stops when your error signal stops improving — which means more iterations won't help, *not* that the loop succeeded. On the benchmark this preserved quality (it rarely stopped early on a worse output; false-stop rate ≤3.5%), but a loop can stall with the error still above zero — a plateau at, say, 2 failing tests. So check `result.best_error` (or your own pass/fail) before you trust the output: if it plateaued short of your target, that's a quality gap LoopGain can't see, and a false stop that forces a rerun is the one way it eats into the savings. LoopGain decides *when to stop*; you decide *whether the answer is good enough*.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
119
150
|
## API reference
|
|
120
151
|
|
|
121
|
-
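### `LoopGain(target_error=0.0, max_iterations=None, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`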
|
|
152
|
+
### `LoopGain(target_error=0.0, max_iterations=50, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`
|
|
122
153
|
|
|
123
154
|
Construct the monitor.
|
|
124
155
|
|
|
125
156
|
- `target_error` — Stop when an observed error drops at or below this. Default `0.0` short-circuits on exactly zero error (the natural completion signal for verifier-driven loops). Pass `None` to disable the short-circuit entirely.
|
|
126
|
-
- `max_iterations` — Hard safety
|
|
157
|
+
- `max_iterations` — Hard safety backstop. Default `50` so the loop can never run unbounded; a stability verdict normally terminates it well before this. Pass `None` to opt into a fully unbounded loop (only safe if your loop is guaranteed to reach `target_error` or a stop-state), or a smaller integer to cap tighter.
|
|
127
158
|
- `thresholds` — Custom `ThresholdBands` for the legacy single-Aβ-band classifier. Ignored when `classifier='trajectory'`.
|
|
128
159
|
- `trajectory_thresholds` — Custom `TrajectoryThresholds` for the multi-feature classifier (the default). Override only with workload-specific evidence.
|
|
129
160
|
- `classifier` — `'trajectory'` (default, v0.2 multi-feature classifier) or `'legacy_bands'` (v0.1 single-Aβ-band classifier).
|
|
@@ -66,6 +66,20 @@ DEFAULT_OSC_STD_THRESHOLD = 0.30
|
|
|
66
66
|
# for the oscillation gate.
|
|
67
67
|
DEFAULT_SLOPE_TOL = 0.05
|
|
68
68
|
|
|
69
|
+
# Liveness gate: number of iterations a loop may go without achieving a new
|
|
70
|
+
# best (lowest) error before its "continue" verdicts (FAST_CONVERGE /
|
|
71
|
+
# CONVERGING) are withdrawn so it can reach STALLING / OSCILLATING and
|
|
72
|
+
# terminate. Without this, a loop that drops a lot and then plateaus or
|
|
73
|
+
# oscillates *below* the cumulative thresholds keeps its historical win
|
|
74
|
+
# forever and never terminates. Derivation: the continue-states are claims
|
|
75
|
+
# about *ongoing* progress; cumulative reduction (E_current/E_first) and a
|
|
76
|
+
# whole-history slope are claims about the *past* and do not expire. We treat
|
|
77
|
+
# "no new low in N steps" as the loop having stopped improving. N is small
|
|
78
|
+
# (3) so a sustained plateau is caught quickly, but the consecutive-STALLING
|
|
79
|
+
# termination rule (2 readings) still protects a loop that briefly stalls and
|
|
80
|
+
# then resumes hitting new lows.
|
|
81
|
+
DEFAULT_STALL_PATIENCE = 3
|
|
82
|
+
|
|
69
83
|
# Numerical floor to avoid log(0).
|
|
70
84
|
_EPS = 1e-12
|
|
71
85
|
|
|
@@ -85,6 +99,7 @@ class TrajectoryThresholds:
|
|
|
85
99
|
div_margin: float = DEFAULT_DIV_MARGIN
|
|
86
100
|
osc_std_threshold: float = DEFAULT_OSC_STD_THRESHOLD
|
|
87
101
|
slope_tol: float = DEFAULT_SLOPE_TOL
|
|
102
|
+
stall_patience: int = DEFAULT_STALL_PATIENCE
|
|
88
103
|
|
|
89
104
|
|
|
90
105
|
@dataclass(frozen=True)
|
|
@@ -276,6 +291,18 @@ def classify_trajectory(
|
|
|
276
291
|
|
|
277
292
|
f = extract_features(error_history)
|
|
278
293
|
|
|
294
|
+
# Liveness signal: how many iterations since the loop last achieved a new
|
|
295
|
+
# best (lowest) error. A genuinely converging loop keeps hitting new lows,
|
|
296
|
+
# so this stays small; a loop that dropped a lot and then plateaued (or is
|
|
297
|
+
# oscillating below the cumulative thresholds) has a large value. We use it
|
|
298
|
+
# to withdraw the "continue" verdicts (FAST_CONVERGE / CONVERGING) once a
|
|
299
|
+
# loop has stopped improving, so it can reach STALLING / OSCILLATING and
|
|
300
|
+
# terminate instead of riding its historical cumulative win forever. See
|
|
301
|
+
# DEFAULT_STALL_PATIENCE.
|
|
302
|
+
hist = list(error_history)
|
|
303
|
+
iters_since_best = (n - 1) - hist.index(min(hist))
|
|
304
|
+
still_improving = iters_since_best < th.stall_patience
|
|
305
|
+
|
|
279
306
|
# n == 2 special case: with two observations, the slope is well defined
|
|
280
307
|
# but its p-value is not (zero residual degrees of freedom). Fall back to
|
|
281
308
|
# the sign of the change. This is the same conservatism as a Wilcoxon
|
|
@@ -291,13 +318,20 @@ def classify_trajectory(
|
|
|
291
318
|
return STALLING
|
|
292
319
|
|
|
293
320
|
# Order matters: FAST_CONVERGE precedes CONVERGING; both precede the
|
|
294
|
-
# remaining gates.
|
|
295
|
-
|
|
321
|
+
# remaining gates. Both continue-verdicts are gated on `still_improving`:
|
|
322
|
+
# a loop that has stopped hitting new lows is no longer "converging" no
|
|
323
|
+
# matter how large its historical cumulative reduction was, and must be
|
|
324
|
+
# allowed to fall through to STALLING / OSCILLATING so it can terminate.
|
|
325
|
+
if f.e_ratio <= th.e_ratio_fast and still_improving:
|
|
296
326
|
return FAST_CONVERGE
|
|
297
327
|
|
|
298
328
|
slope_significant = f.slope_p < th.p_sig
|
|
299
329
|
|
|
300
|
-
if
|
|
330
|
+
if (
|
|
331
|
+
f.slope_log < 0
|
|
332
|
+
and still_improving
|
|
333
|
+
and (slope_significant or f.e_ratio <= th.e_ratio_conv)
|
|
334
|
+
):
|
|
301
335
|
return CONVERGING
|
|
302
336
|
|
|
303
337
|
if f.slope_log > 0 and slope_significant and f.e_ratio > 1.0 + th.div_margin:
|
|
@@ -40,6 +40,16 @@ DEFAULT_STALLING = 0.95
|
|
|
40
40
|
DEFAULT_OSCILLATING_UPPER = 1.05
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
# Bounded-by-default safety backstop. The loop should normally terminate on a
|
|
44
|
+
# stability verdict (target met / oscillating / diverging / stalled) long
|
|
45
|
+
# before this; it exists only so the library can never run truly unbounded if
|
|
46
|
+
# a loop never converges and never stalls (e.g. infinitesimal-but-real progress
|
|
47
|
+
# with target_error=None). Generous relative to typical loop lengths (the
|
|
48
|
+
# bench capped at 20). Pass max_iterations=None to opt into a fully unbounded
|
|
49
|
+
# loop, or a smaller integer to cap tighter.
|
|
50
|
+
DEFAULT_MAX_ITERATIONS = 50
|
|
51
|
+
|
|
52
|
+
|
|
43
53
|
# State names. Exported for use in switch/case in user code.
|
|
44
54
|
INIT = "INIT"
|
|
45
55
|
FAST_CONVERGE = "FAST_CONVERGE"
|
|
@@ -165,8 +175,11 @@ class LoopGain:
|
|
|
165
175
|
tests, no validation errors, etc.). Pass ``None`` to disable
|
|
166
176
|
the short-circuit entirely and rely only on stability
|
|
167
177
|
detection and ``max_iterations``.
|
|
168
|
-
max_iterations: Hard safety
|
|
169
|
-
|
|
178
|
+
max_iterations: Hard safety backstop. Default
|
|
179
|
+
``DEFAULT_MAX_ITERATIONS`` (50) so the loop can never run
|
|
180
|
+
unbounded; normally a stability verdict terminates it long
|
|
181
|
+
before this. Pass ``None`` to opt into a fully unbounded loop,
|
|
182
|
+
or a smaller integer to cap tighter.
|
|
170
183
|
thresholds: Custom ``ThresholdBands`` (legacy single-feature
|
|
171
184
|
classifier only). Default is the canonical 0.3 / 0.85 / 0.95 /
|
|
172
185
|
1.05. Ignored when ``classifier='trajectory'``.
|
|
@@ -190,7 +203,7 @@ class LoopGain:
|
|
|
190
203
|
def __init__(
|
|
191
204
|
self,
|
|
192
205
|
target_error: Optional[float] = 0.0,
|
|
193
|
-
max_iterations: Optional[int] = None,
|
|
206
|
+
max_iterations: Optional[int] = DEFAULT_MAX_ITERATIONS,
|
|
194
207
|
thresholds: Optional[ThresholdBands] = None,
|
|
195
208
|
trajectory_thresholds: Optional[TrajectoryThresholds] = None,
|
|
196
209
|
classifier: str = "trajectory",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: loopgain
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Barkhausen stability monitor for AI agent loops. Real-time loop-gain (Aβ) monitoring with five named threshold bands, best-so-far rollback, and ETA prediction.
|
|
5
5
|
Author-email: Dave Fitzsimmons <hello@loopgain.ai>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -108,6 +108,28 @@ print(result.savings_vs_fixed_cap)
|
|
|
108
108
|
|
|
109
109
|
---
|
|
110
110
|
|
|
111
|
+
## Defining your error signal
|
|
112
|
+
|
|
113
|
+
The one thing you provide is the **error signal**: a single non-negative number, every iteration, that says how wrong the current output is. **Lower is better; zero means done.** LoopGain doesn't know what your loop does — it just watches that number's trajectory and decides whether to keep going, stop, or roll back.
|
|
114
|
+
|
|
115
|
+
Your loop already has some way of knowing the output isn't good yet (or it wouldn't keep revising). Turn that into a number:
|
|
116
|
+
|
|
117
|
+
| Loop | Error signal = |
|
|
118
|
+
| --- | --- |
|
|
119
|
+
| Agentic coding (write code → run tests) | number of **failing tests** (10 → 3 → 0) |
|
|
120
|
+
| JSON / structured extraction | number of **schema violations** |
|
|
121
|
+
| RAG with self-correction | number of **required facts still missing** |
|
|
122
|
+
| Self-refinement with an LLM judge | judge's **gap to target** (e.g. `10 − quality_score`) |
|
|
123
|
+
| Lint / format loop | **lint error count** |
|
|
124
|
+
|
|
125
|
+
The only rules: non-negative, and **smaller as the output gets better**. Returning the raw list of problems works directly — `observe()` uses its length as the magnitude (e.g. hand it the list of failing tests).
|
|
126
|
+
|
|
127
|
+
If your quality is fuzzy and has no natural "zero," run with `target_error=None`: LoopGain then stops when the number **stops improving**, wherever that plateau is, instead of waiting for an exact target.
|
|
128
|
+
|
|
129
|
+
Every stop/continue decision is made from this one number, so **LoopGain is only as good as the error signal you give it** — pick one that genuinely tracks output quality.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
111
133
|
## How it works
|
|
112
134
|
|
|
113
135
|
LoopGain measures empirical loop gain (`Aβ = E(n) / E(n-1)`) at every iteration and exposes it as a smoothed time series for visualization. The decision engine, however, classifies the **full error trajectory** using four features:
|
|
@@ -165,14 +187,23 @@ This transforms divergence detection from "abort with garbage" into "abort with
|
|
|
165
187
|
|
|
166
188
|
---
|
|
167
189
|
|
|
190
|
+
## What LoopGain does and doesn't guarantee
|
|
191
|
+
|
|
192
|
+
LoopGain saves money by stopping a loop once it stops improving — fewer iterations, fewer tokens. In our [public benchmark](https://github.com/loopgain-ai/loopgain-bench), that was a **93.5% median cut in API spend** vs `max_iterations=20`, with output quality preserved. Two honest limits:
|
|
193
|
+
|
|
194
|
+
- **Savings depend on your workload.** Loops that usually succeed fast save the most (~96%); adversarial, failure-prone loops save less (~84%). The headline is a blend — run the benchmark on your own loops before quoting a number.
|
|
195
|
+
- **LoopGain detects convergence, not correctness.** It stops when your error signal stops improving — which means more iterations won't help, *not* that the loop succeeded. On the benchmark this preserved quality (it rarely stopped early on a worse output; false-stop rate ≤3.5%), but a loop can stall with the error still above zero — a plateau at, say, 2 failing tests. So check `result.best_error` (or your own pass/fail) before you trust the output: if it plateaued short of your target, that's a quality gap LoopGain can't see, and a false stop that forces a rerun is the one way it eats into the savings. LoopGain decides *when to stop*; you decide *whether the answer is good enough*.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
168
199
|
## API reference
|
|
169
200
|
|
|
170
|
-
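### `LoopGain(target_error=0.0, max_iterations=None, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`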
|
|
201
|
+
### `LoopGain(target_error=0.0, max_iterations=50, thresholds=None, trajectory_thresholds=None, classifier='trajectory', smoothing_window=3, assumed_fixed_cap=10)`
|
|
171
202
|
|
|
172
203
|
Construct the monitor.
|
|
173
204
|
|
|
174
205
|
- `target_error` — Stop when an observed error drops at or below this. Default `0.0` short-circuits on exactly zero error (the natural completion signal for verifier-driven loops). Pass `None` to disable the short-circuit entirely.
|
|
175
|
-
- `max_iterations` — Hard safety
|
|
206
|
+
- `max_iterations` — Hard safety backstop. Default `50` so the loop can never run unbounded; a stability verdict normally terminates it well before this. Pass `None` to opt into a fully unbounded loop (only safe if your loop is guaranteed to reach `target_error` or a stop-state), or a smaller integer to cap tighter.
|
|
176
207
|
- `thresholds` — Custom `ThresholdBands` for the legacy single-Aβ-band classifier. Ignored when `classifier='trajectory'`.
|
|
177
208
|
- `trajectory_thresholds` — Custom `TrajectoryThresholds` for the multi-feature classifier (the default). Override only with workload-specific evidence.
|
|
178
209
|
- `classifier` — `'trajectory'` (default, v0.2 multi-feature classifier) or `'legacy_bands'` (v0.1 single-Aβ-band classifier).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "loopgain"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Barkhausen stability monitor for AI agent loops. Real-time loop-gain (Aβ) monitoring with five named threshold bands, best-so-far rollback, and ETA prediction."
|
|
9
9
|
authors = [{name = "Dave Fitzsimmons", email = "hello@loopgain.ai"}]
|
|
10
10
|
readme = "README.md"
|
|
@@ -158,12 +158,22 @@ def test_pure_stall_no_trend():
|
|
|
158
158
|
)
|
|
159
159
|
|
|
160
160
|
|
|
161
|
-
def
|
|
162
|
-
"""
|
|
163
|
-
|
|
161
|
+
def test_floor_convergence_already_flat_at_floor_stalls():
|
|
162
|
+
"""A loop already pinned at the numerical floor from iteration 0, flat,
|
|
163
|
+
classifies as STALLING — not FAST_CONVERGE.
|
|
164
|
+
|
|
165
|
+
Updated 2026-06 with the liveness-gate fix (see DEFAULT_STALL_PATIENCE).
|
|
166
|
+
Previously this returned FAST_CONVERGE on the strength of cumulative
|
|
167
|
+
reduction alone — but FAST_CONVERGE is a *continue* verdict, so an
|
|
168
|
+
at-floor flat loop would have continued (and, with no max_iterations,
|
|
169
|
+
run unbounded) instead of stopping. STALLING is the correct verdict: the
|
|
170
|
+
loop has made no progress for `stall_patience` iterations, so it
|
|
171
|
+
terminates via the consecutive-stall rule and returns best-so-far (the
|
|
172
|
+
floor value — a fine answer). In real use the `target_error`
|
|
173
|
+
short-circuit (next test) handles the at-target case directly."""
|
|
164
174
|
trajectory = [1e-15] * 5
|
|
165
175
|
state = classify_trajectory(trajectory)
|
|
166
|
-
assert state == FAST_CONVERGE
|
|
176
|
+
assert state == STALLING
|
|
167
177
|
|
|
168
178
|
|
|
169
179
|
def test_target_met_short_circuit():
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Termination-safety tests: a loop must not run unbounded.
|
|
2
|
+
|
|
3
|
+
Regression coverage for the FAST_CONVERGE/CONVERGING liveness bug (2026-06):
|
|
4
|
+
the trajectory classifier used *cumulative* reduction (E_current/E_first) and a
|
|
5
|
+
*whole-history* slope to emit the "continue" verdicts FAST_CONVERGE and
|
|
6
|
+
CONVERGING. A loop that reduced its error and then plateaued (or oscillated)
|
|
7
|
+
*below* the cumulative thresholds kept its historical win forever — it was
|
|
8
|
+
pinned in a continue-state, never reached STALLING/OSCILLATING, and with the
|
|
9
|
+
(then-default) max_iterations=None it ran forever.
|
|
10
|
+
|
|
11
|
+
The fix has two independent layers, each tested here:
|
|
12
|
+
1. A liveness gate on the continue-verdicts: a loop that has not achieved a
|
|
13
|
+
new best error in `stall_patience` iterations is no longer treated as
|
|
14
|
+
"improving", so it can reach STALLING/OSCILLATING and terminate.
|
|
15
|
+
2. A bounded default max_iterations backstop, so the library can never run
|
|
16
|
+
truly unbounded even if a future classifier path regresses.
|
|
17
|
+
|
|
18
|
+
Output quality was never at risk (best-so-far rollback held the good answer);
|
|
19
|
+
the bug was a *liveness* failure — the loop never returned to hand it back.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import pytest
|
|
25
|
+
|
|
26
|
+
from loopgain import CONVERGING, FAST_CONVERGE, LoopGain, classify_trajectory
|
|
27
|
+
|
|
28
|
+
# Hard test guard: large enough that a *correctly* terminating loop never hits
|
|
29
|
+
# it, small enough that a regression (unbounded loop) fails fast instead of
|
|
30
|
+
# hanging the suite.
|
|
31
|
+
GUARD = 500
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _run_to_termination(lg: LoopGain, errors, guard: int = GUARD):
|
|
35
|
+
"""Drive a loop, plateauing/repeating the last error, until it terminates
|
|
36
|
+
or hits the guard. Returns (iterations_run, hit_guard)."""
|
|
37
|
+
i = 0
|
|
38
|
+
while lg.should_continue():
|
|
39
|
+
e = errors[i] if i < len(errors) else errors[-1]
|
|
40
|
+
lg.observe(e, output=f"o{i}")
|
|
41
|
+
i += 1
|
|
42
|
+
if i >= guard:
|
|
43
|
+
return i, True
|
|
44
|
+
return i, False
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ----- Layer 1: classifier liveness gate -----
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_plateau_below_fast_floor_terminates_without_max_iter():
|
|
51
|
+
"""Error drops to 8% of initial then plateaus. e_ratio<=0.1 used to pin
|
|
52
|
+
FAST_CONVERGE forever. Must now terminate via STALLING."""
|
|
53
|
+
lg = LoopGain(max_iterations=None, target_error=None)
|
|
54
|
+
n, hit_guard = _run_to_termination(lg, [100, 8, 8, 8, 8, 8, 8, 8])
|
|
55
|
+
assert not hit_guard, f"loop did not terminate within {GUARD} iters (unbounded)"
|
|
56
|
+
assert not lg.should_continue()
|
|
57
|
+
assert lg.result.best_error == 8.0 # best-so-far still returned
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_plateau_above_fast_floor_terminates_without_max_iter():
|
|
61
|
+
"""Error drops to 30% of initial (below E_RATIO_CONV=0.5) then plateaus.
|
|
62
|
+
e_ratio<=0.5 with a whole-history negative slope used to pin CONVERGING
|
|
63
|
+
forever. Must now terminate."""
|
|
64
|
+
lg = LoopGain(max_iterations=None, target_error=None)
|
|
65
|
+
n, hit_guard = _run_to_termination(lg, [100, 30, 30, 30, 30, 30, 30, 30])
|
|
66
|
+
assert not hit_guard, f"loop did not terminate within {GUARD} iters (unbounded)"
|
|
67
|
+
assert not lg.should_continue()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_oscillation_below_floor_terminates_without_max_iter():
|
|
71
|
+
"""Oscillation entirely below the 10% cumulative floor used to be shadowed
|
|
72
|
+
by FAST_CONVERGE. Must now terminate (OSCILLATING or STALLING)."""
|
|
73
|
+
lg = LoopGain(max_iterations=None, target_error=None)
|
|
74
|
+
n, hit_guard = _run_to_termination(lg, [100, 5, 8, 5, 8, 5, 8, 5, 8])
|
|
75
|
+
assert not hit_guard, f"loop did not terminate within {GUARD} iters (unbounded)"
|
|
76
|
+
assert not lg.should_continue()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_classifier_flags_plateau_after_big_drop_as_terminable():
|
|
80
|
+
"""Direct classifier check: a big drop followed by a flat tail must NOT be
|
|
81
|
+
reported as a continue-state (FAST_CONVERGE/CONVERGING)."""
|
|
82
|
+
plateau_low = [100, 8, 8, 8, 8, 8]
|
|
83
|
+
plateau_mid = [100, 30, 30, 30, 30, 30]
|
|
84
|
+
assert classify_trajectory(plateau_low) not in (FAST_CONVERGE, CONVERGING)
|
|
85
|
+
assert classify_trajectory(plateau_mid) not in (FAST_CONVERGE, CONVERGING)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_genuine_fast_converge_still_continues():
|
|
89
|
+
"""Guard against over-correction: a monotone steep decline that keeps
|
|
90
|
+
hitting new lows must still read FAST_CONVERGE (continue), not be
|
|
91
|
+
prematurely stalled."""
|
|
92
|
+
monotone = [100, 25, 6, 1.5, 0.4, 0.1] # new low every step
|
|
93
|
+
assert classify_trajectory(monotone) == FAST_CONVERGE
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_genuine_converging_still_continues():
|
|
97
|
+
"""A steady decline landing between the two cumulative thresholds must
|
|
98
|
+
still read CONVERGING while it is still hitting new lows."""
|
|
99
|
+
converging = [10.0, 8.0, 6.4, 5.1, 4.1, 3.3] # ~0.8x/step, new low every step
|
|
100
|
+
assert classify_trajectory(converging) == CONVERGING
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ----- Layer 2: bounded default backstop -----
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_default_max_iterations_is_a_bounded_backstop():
|
|
107
|
+
"""The default config must not be able to run unbounded. A never-improving
|
|
108
|
+
loop under all-default construction must terminate at the backstop."""
|
|
109
|
+
lg = LoopGain() # all defaults
|
|
110
|
+
assert lg.max_iterations is not None, "default max_iterations must be bounded"
|
|
111
|
+
# A strictly increasing error never converges/stalls into best-so-far early
|
|
112
|
+
# under every classifier path; the backstop must still stop it.
|
|
113
|
+
i, hit_guard = _run_to_termination(lg, list(range(1, GUARD + 5)))
|
|
114
|
+
assert not hit_guard, "default backstop failed to bound the loop"
|
|
115
|
+
assert not lg.should_continue()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|