llm-feedback-control 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_feedback_control-0.1.0/LICENSE +28 -0
- llm_feedback_control-0.1.0/PKG-INFO +262 -0
- llm_feedback_control-0.1.0/README.md +197 -0
- llm_feedback_control-0.1.0/pyproject.toml +74 -0
- llm_feedback_control-0.1.0/setup.cfg +4 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control/__init__.py +106 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control/__main__.py +98 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control/auditor.py +337 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control/feedback.py +151 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control/llm.py +143 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/PKG-INFO +262 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/SOURCES.txt +15 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/dependency_links.txt +1 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/entry_points.txt +2 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/requires.txt +8 -0
- llm_feedback_control-0.1.0/src/llm_feedback_control.egg-info/top_level.txt +1 -0
- llm_feedback_control-0.1.0/tests/test_core.py +125 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
llm-feedback-control
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Edward Chalk (sapientronic.ai)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to use,
|
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
8
|
+
the Software, and to permit persons to whom the Software is furnished to do
|
|
9
|
+
so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
1. The above copyright notice and this permission notice shall be included
|
|
12
|
+
in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
2. Attribution. Any publication, presentation, derivative work, or product
|
|
15
|
+
that uses or builds on this Software must include visible attribution to
|
|
16
|
+
Edward Chalk and sapientronic.ai. The phrase "Built with llm-feedback-control
|
|
17
|
+
by Edward Chalk (sapientronic.ai)" or equivalent is acceptable.
|
|
18
|
+
|
|
19
|
+
3. The Software is provided "AS IS", without warranty of any kind, express
|
|
20
|
+
or implied, including but not limited to the warranties of merchantability,
|
|
21
|
+
fitness for a particular purpose, and noninfringement. In no event shall
|
|
22
|
+
the authors or copyright holders be liable for any claim, damages, or
|
|
23
|
+
other liability, whether in an action of contract, tort, or otherwise,
|
|
24
|
+
arising from, out of, or in connection with the Software or the use or
|
|
25
|
+
other dealings in the Software.
|
|
26
|
+
|
|
27
|
+
This license is modeled on the MIT License with an explicit attribution
|
|
28
|
+
clause (clause 2).
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-feedback-control
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reliable, checkable structured output from a small local LLM, by wrapping it in a deterministic feedback loop: a regime gate + exact graph analysis + explicit refusal, plus a bounded re-extraction loop. Zero runtime dependencies; runs with no model at all.
|
|
5
|
+
Author-email: Edward Chalk <edward.chalk@sapientronic.ai>
|
|
6
|
+
License: llm-feedback-control
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Edward Chalk (sapientronic.ai)
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to use,
|
|
12
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
13
|
+
the Software, and to permit persons to whom the Software is furnished to do
|
|
14
|
+
so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
1. The above copyright notice and this permission notice shall be included
|
|
17
|
+
in all copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
2. Attribution. Any publication, presentation, derivative work, or product
|
|
20
|
+
that uses or builds on this Software must include visible attribution to
|
|
21
|
+
Edward Chalk and sapientronic.ai. The phrase "Built with llm-feedback-control
|
|
22
|
+
by Edward Chalk (sapientronic.ai)" or equivalent is acceptable.
|
|
23
|
+
|
|
24
|
+
3. The Software is provided "AS IS", without warranty of any kind, express
|
|
25
|
+
or implied, including but not limited to the warranties of merchantability,
|
|
26
|
+
fitness for a particular purpose, and noninfringement. In no event shall
|
|
27
|
+
the authors or copyright holders be liable for any claim, damages, or
|
|
28
|
+
other liability, whether in an action of contract, tort, or otherwise,
|
|
29
|
+
arising from, out of, or in connection with the Software or the use or
|
|
30
|
+
other dealings in the Software.
|
|
31
|
+
|
|
32
|
+
This license is modeled on the MIT License with an explicit attribution
|
|
33
|
+
clause (clause 2).
|
|
34
|
+
|
|
35
|
+
Project-URL: Homepage, https://github.com/pcoz/llm-feedback-control
|
|
36
|
+
Project-URL: Repository, https://github.com/pcoz/llm-feedback-control
|
|
37
|
+
Project-URL: Issues, https://github.com/pcoz/llm-feedback-control/issues
|
|
38
|
+
Project-URL: Changelog, https://github.com/pcoz/llm-feedback-control/blob/main/CHANGELOG.md
|
|
39
|
+
Keywords: llm,feedback-control,structured-extraction,state-machine,workflow,hallucination,reliability,ollama,small-language-model,auditable,refusal
|
|
40
|
+
Classifier: Development Status :: 4 - Beta
|
|
41
|
+
Classifier: Intended Audience :: Developers
|
|
42
|
+
Classifier: Intended Audience :: Science/Research
|
|
43
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
44
|
+
Classifier: Operating System :: OS Independent
|
|
45
|
+
Classifier: Programming Language :: Python :: 3
|
|
46
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
47
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
48
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
49
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
50
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
51
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
52
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
53
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
54
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
55
|
+
Requires-Python: >=3.8
|
|
56
|
+
Description-Content-Type: text/markdown
|
|
57
|
+
License-File: LICENSE
|
|
58
|
+
Provides-Extra: aws
|
|
59
|
+
Requires-Dist: boto3>=1.26; extra == "aws"
|
|
60
|
+
Provides-Extra: dev
|
|
61
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
63
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
64
|
+
Dynamic: license-file
|
|
65
|
+
|
|
66
|
+
# llm-feedback-control
|
|
67
|
+
|
|
68
|
+
**Get reliable, checkable structured output from a small, local language model —
|
|
69
|
+
by wrapping it in ordinary deterministic code.**
|
|
70
|
+
|
|
71
|
+
[![CI](https://github.com/pcoz/llm-feedback-control/actions/workflows/ci.yml/badge.svg)](https://github.com/pcoz/llm-feedback-control/actions/workflows/ci.yml)
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## What it actually does
|
|
76
|
+
|
|
77
|
+
You hand it a process written in plain English:
|
|
78
|
+
|
|
79
|
+
> "A claim enters Intake. From Intake it goes to Triage. Triage goes to FastTrack
|
|
80
|
+
> or to Investigation. FastTrack goes to Payout. Investigation goes to Payout or
|
|
81
|
+
> to Denied. Payout goes to Closed. Denied goes to Closed."
|
|
82
|
+
|
|
83
|
+
and it:
|
|
84
|
+
|
|
85
|
+
1. **turns that into a state machine** — the steps (states) and the arrows between
|
|
86
|
+
them (transitions);
|
|
87
|
+
2. **computes provable facts** about it — which steps are dead ends, whether
|
|
88
|
+
there are loops, which steps can't be reached from the start;
|
|
89
|
+
3. **writes a report where every statement is backed by one of those checked
|
|
90
|
+
facts** — so it can't quietly make things up;
|
|
91
|
+
4. **knows its own limits.** If the text isn't actually a finite step-by-step
|
|
92
|
+
process (e.g. *"prices drift up as confidence grows"*), it **refuses** instead
|
|
93
|
+
of inventing a fake state machine. And if the model's first pass missed part of
|
|
94
|
+
the process, it **loops to fill the gaps** — or refuses if it can't.
|
|
95
|
+
|
|
96
|
+
The point: you get **higher-quality, auditable structured output from a *small*
|
|
97
|
+
model**, trading a few extra passes (latency) for accuracy — no extra parameters,
|
|
98
|
+
no special mathematics, no cloud. It runs on a laptop, and the deterministic parts
|
|
99
|
+
run **with no model at all**.
|
|
100
|
+
|
|
101
|
+
## Quickstart (works with no model)
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install llm-feedback-control # zero dependencies — pulls nothing else
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from llm_feedback_control import run_audit
|
|
109
|
+
|
|
110
|
+
r = run_audit("A claim enters Intake. From Intake it goes to Triage. "
|
|
111
|
+
"Triage goes to FastTrack or to Investigation.")
|
|
112
|
+
print(r["result"]) # OK
|
|
113
|
+
print(r["report_facts"]) # terminals, loops, unreachable steps — all checked
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
That already works on a bare install: with no model reachable it uses a
|
|
117
|
+
deterministic regex extractor plus exact graph analysis. **Plug in a model and the
|
|
118
|
+
extraction quality goes up — nothing else changes.**
|
|
119
|
+
|
|
120
|
+
From the command line:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
lfc "A ticket opens in New. New goes to Assigned. Assigned goes to Resolved."
|
|
124
|
+
lfc --check # tells you exactly what backend is available and what to do
|
|
125
|
+
lfc --demo # runs the three worked demos
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Add a model (optional, recommended)
|
|
129
|
+
|
|
130
|
+
The library is **not tied to any provider.** Three ways to give it a model:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# 1. Local, free, private — install Ollama (https://ollama.com), then:
|
|
134
|
+
ollama pull phi3:mini
|
|
135
|
+
|
|
136
|
+
# 2. OpenAI (stdlib HTTP, no SDK):
|
|
137
|
+
export CEILING_BACKEND=openai OPENAI_API_KEY=sk-...
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
# 3. Bring your own: pass any callable f(prompt, fmt=None) -> str
|
|
142
|
+
def my_llm(prompt, fmt=None):
|
|
143
|
+
... # call Anthropic, a local server, anything
|
|
144
|
+
run_audit(text, generate=my_llm)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Run `lfc --check` any time to see what's wired up.
|
|
148
|
+
|
|
149
|
+
## How it works — "feedback control", explained
|
|
150
|
+
|
|
151
|
+
The design is borrowed from **electronics.** A raw LLM is like a very high-gain
|
|
152
|
+
amplifier: hugely powerful, but left to run "open-loop" it overshoots — fluent,
|
|
153
|
+
yet it drifts and hallucinates. Engineers tame such an amplifier by adding a
|
|
154
|
+
**feedback loop**: feed the output back, compare it against a stable reference,
|
|
155
|
+
and trade some raw power for precision and stability. This library is that
|
|
156
|
+
feedback loop for an LLM. The "reference" is plain deterministic code — graph
|
|
157
|
+
checks and schema rules — that the model's output is measured against.
|
|
158
|
+
|
|
159
|
+
There are two kinds of feedback, and the library uses both:
|
|
160
|
+
|
|
161
|
+
### Negative feedback — the stabilising checks (`run_audit`)
|
|
162
|
+
|
|
163
|
+
This is the half that *grounds and refuses*. In plain terms:
|
|
164
|
+
|
|
165
|
+
| step | what it means |
|
|
166
|
+
|---|---|
|
|
167
|
+
| **regime gate** | First decide whether the text is even the kind of thing we can analyse exactly (a finite, step-by-step process) versus something fuzzy and continuous. Refuse the fuzzy ones. |
|
|
168
|
+
| **extraction + schema** | Ask the model for the state machine, but force the answer into a strict shape — and fall back to a deterministic regex extractor if it won't comply (or if there's no model). |
|
|
169
|
+
| **exact analysis** | Compute provable facts about the graph: dead ends, loops, unreachable steps. (Plus an *optional* finite-field "spectral fingerprint" — see below.) |
|
|
170
|
+
| **grounded report** | Write the summary using only those verified facts, naming only real states. |
|
|
171
|
+
| **explicit refusal** | When the input is out of regime, or a result can't be made exact, say so — don't guess. |
|
|
172
|
+
|
|
173
|
+
### Positive feedback — the gap-filling loop (`extract_iterative`)
|
|
174
|
+
|
|
175
|
+
A one-shot extraction often silently **drops a branch** — the model says "OK"
|
|
176
|
+
while quietly missing *Investigation → Denied*. Positive feedback fixes that: it
|
|
177
|
+
**re-asks the model about anything the source text mentions that's missing from
|
|
178
|
+
the answer**, and repeats until nothing is missing (a *fixed point*).
|
|
179
|
+
|
|
180
|
+
Positive feedback is where capability *and* instability both live, so it's bounded
|
|
181
|
+
by two negative-feedback safeguards: a deterministic consistency check (does the
|
|
182
|
+
graph cover everything the text mentions?) and a **refusal clamp** — if it can't
|
|
183
|
+
converge within a few passes, it refuses to report a confident-but-incomplete
|
|
184
|
+
result rather than running away. This **refusal-as-stabilizer** is what makes the
|
|
185
|
+
regenerative loop safe.
|
|
186
|
+
|
|
187
|
+
## What's measured so far
|
|
188
|
+
|
|
189
|
+
Indicative results, not benchmarks — small corpora, a 3.8B local model
|
|
190
|
+
(`phi3:mini`), greedy decoding. See [`docs/results.md`](docs/results.md) for the
|
|
191
|
+
full tables and method.
|
|
192
|
+
|
|
193
|
+
**Headline (run on EC2 against a ~28 GB ceiling model, mixtral 8x7B):** on a
|
|
194
|
+
messy, branchy, distractor-laden workflow corpus, the small model **+ the feedback
|
|
195
|
+
loop essentially matches a model ~7× its size.**
|
|
196
|
+
|
|
197
|
+
| configuration | states F1 | transitions F1 |
|
|
198
|
+
|---|---|---|
|
|
199
|
+
| small model (phi3:mini), one-shot | 0.98 | 0.89 |
|
|
200
|
+
| **small model + feedback loop** | **1.00** | **0.90** |
|
|
201
|
+
| big ceiling model (mixtral, ~28 GB), one-shot | 1.00 | 0.91 |
|
|
202
|
+
|
|
203
|
+
→ the loop recovers **100%** of the small→big gap on states and **77%** on
|
|
204
|
+
transitions — and on several individual workflows the closed-loop small model
|
|
205
|
+
*beat* the big model, because the deterministic reference catches edges that raw
|
|
206
|
+
fluency invents or drops.
|
|
207
|
+
|
|
208
|
+
Other measured pieces: extraction states precision/recall ≈ 1.00 / 0.92; the
|
|
209
|
+
regime gate scores 1.00 precision/recall separating finite from continuous on a
|
|
210
|
+
clean corpus (it's brittle on deliberately *mixed* inputs — an open problem).
|
|
211
|
+
|
|
212
|
+
## Documentation
|
|
213
|
+
|
|
214
|
+
| doc | contents |
|
|
215
|
+
|---|---|
|
|
216
|
+
| [`docs/index.md`](docs/index.md) | overview and where to start |
|
|
217
|
+
| [`docs/architecture.md`](docs/architecture.md) | the op-amp model in depth; the pipeline; refusal-as-stabilizer |
|
|
218
|
+
| [`docs/usage.md`](docs/usage.md) | install, the API, the CLI, configuration, bring-your-own-backend |
|
|
219
|
+
| [`docs/results.md`](docs/results.md) | the measured results, method, and honest scope |
|
|
220
|
+
| [`docs/api.md`](docs/api.md) | reference for every public function |
|
|
221
|
+
| [`docs/faq.md`](docs/faq.md) | "do I need a GPU?", "what models?", "does it work offline?" … |
|
|
222
|
+
|
|
223
|
+
## Repository layout
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
src/llm_feedback_control/ the package (zero-dependency, pure standard library)
|
|
227
|
+
llm.py the LLM client + injectable backend + a doctor()
|
|
228
|
+
auditor.py the negative-feedback pipeline (run_audit)
|
|
229
|
+
feedback.py the bounded positive-feedback loop (extract_iterative)
|
|
230
|
+
__main__.py the `lfc` command-line tool
|
|
231
|
+
experiments/ repro scripts for the measured results (not shipped)
|
|
232
|
+
aws/ optional: run a large ceiling model on EC2 (not shipped)
|
|
233
|
+
docs/ the documentation suite
|
|
234
|
+
tests/ deterministic tests (no model / no network)
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Honest scope
|
|
238
|
+
|
|
239
|
+
- **A reliability architecture, not a model improvement.** The win is "the system
|
|
240
|
+
knows what it can compute exactly and refuses the rest" — orthogonal to model
|
|
241
|
+
scale. It helps on the *structured / verifiable slice* (workflows, state
|
|
242
|
+
machines, configs), not open-ended generation.
|
|
243
|
+
- **It uses no special mathematics.** The deterministic reference is plain
|
|
244
|
+
graph/text consistency. (The finite-field "spectral fingerprint" is an *optional*
|
|
245
|
+
extra exact check, honestly redundant with graph analysis for most workflow
|
|
246
|
+
audits — keep it or ignore it.)
|
|
247
|
+
- **Needs a deterministic reference.** Where there's nothing to check against, the
|
|
248
|
+
gate (correctly) refuses to claim exactness.
|
|
249
|
+
- **Results are indicative.** Small corpora; treat the numbers as direction, not
|
|
250
|
+
guarantees.
|
|
251
|
+
|
|
252
|
+
## Origin
|
|
253
|
+
|
|
254
|
+
This project is the practical, validated spin-off of an internal research
|
|
255
|
+
investigation. The investigation's grander mathematical claims did not hold up
|
|
256
|
+
under measurement; this engineering architecture — LLM feedback control with
|
|
257
|
+
refusal-as-stabilizer — is the part that did. It stands on its own.
|
|
258
|
+
|
|
259
|
+
## License
|
|
260
|
+
|
|
261
|
+
MIT with an attribution clause — see [`LICENSE`](LICENSE).
|
|
262
|
+
Built with llm-feedback-control by Edward Chalk (sapientronic.ai).
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# llm-feedback-control
|
|
2
|
+
|
|
3
|
+
**Get reliable, checkable structured output from a small, local language model —
|
|
4
|
+
by wrapping it in ordinary deterministic code.**
|
|
5
|
+
|
|
6
|
+
[![CI](https://github.com/pcoz/llm-feedback-control/actions/workflows/ci.yml/badge.svg)](https://github.com/pcoz/llm-feedback-control/actions/workflows/ci.yml)
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## What it actually does
|
|
11
|
+
|
|
12
|
+
You hand it a process written in plain English:
|
|
13
|
+
|
|
14
|
+
> "A claim enters Intake. From Intake it goes to Triage. Triage goes to FastTrack
|
|
15
|
+
> or to Investigation. FastTrack goes to Payout. Investigation goes to Payout or
|
|
16
|
+
> to Denied. Payout goes to Closed. Denied goes to Closed."
|
|
17
|
+
|
|
18
|
+
and it:
|
|
19
|
+
|
|
20
|
+
1. **turns that into a state machine** — the steps (states) and the arrows between
|
|
21
|
+
them (transitions);
|
|
22
|
+
2. **computes provable facts** about it — which steps are dead ends, whether
|
|
23
|
+
there are loops, which steps can't be reached from the start;
|
|
24
|
+
3. **writes a report where every statement is backed by one of those checked
|
|
25
|
+
facts** — so it can't quietly make things up;
|
|
26
|
+
4. **knows its own limits.** If the text isn't actually a finite step-by-step
|
|
27
|
+
process (e.g. *"prices drift up as confidence grows"*), it **refuses** instead
|
|
28
|
+
of inventing a fake state machine. And if the model's first pass missed part of
|
|
29
|
+
the process, it **loops to fill the gaps** — or refuses if it can't.
|
|
30
|
+
|
|
31
|
+
The point: you get **higher-quality, auditable structured output from a *small*
|
|
32
|
+
model**, trading a few extra passes (latency) for accuracy — no extra parameters,
|
|
33
|
+
no special mathematics, no cloud. It runs on a laptop, and the deterministic parts
|
|
34
|
+
run **with no model at all**.
|
|
35
|
+
|
|
36
|
+
## Quickstart (works with no model)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install llm-feedback-control # zero dependencies — pulls nothing else
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from llm_feedback_control import run_audit
|
|
44
|
+
|
|
45
|
+
r = run_audit("A claim enters Intake. From Intake it goes to Triage. "
|
|
46
|
+
"Triage goes to FastTrack or to Investigation.")
|
|
47
|
+
print(r["result"]) # OK
|
|
48
|
+
print(r["report_facts"]) # terminals, loops, unreachable steps — all checked
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
That already works on a bare install: with no model reachable it uses a
|
|
52
|
+
deterministic regex extractor plus exact graph analysis. **Plug in a model and the
|
|
53
|
+
extraction quality goes up — nothing else changes.**
|
|
54
|
+
|
|
55
|
+
From the command line:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
lfc "A ticket opens in New. New goes to Assigned. Assigned goes to Resolved."
|
|
59
|
+
lfc --check # tells you exactly what backend is available and what to do
|
|
60
|
+
lfc --demo # runs the three worked demos
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Add a model (optional, recommended)
|
|
64
|
+
|
|
65
|
+
The library is **not tied to any provider.** Three ways to give it a model:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# 1. Local, free, private — install Ollama (https://ollama.com), then:
|
|
69
|
+
ollama pull phi3:mini
|
|
70
|
+
|
|
71
|
+
# 2. OpenAI (stdlib HTTP, no SDK):
|
|
72
|
+
export CEILING_BACKEND=openai OPENAI_API_KEY=sk-...
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
# 3. Bring your own: pass any callable f(prompt, fmt=None) -> str
|
|
77
|
+
def my_llm(prompt, fmt=None):
|
|
78
|
+
... # call Anthropic, a local server, anything
|
|
79
|
+
run_audit(text, generate=my_llm)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Run `lfc --check` any time to see what's wired up.
|
|
83
|
+
|
|
84
|
+
## How it works — "feedback control", explained
|
|
85
|
+
|
|
86
|
+
The design is borrowed from **electronics.** A raw LLM is like a very high-gain
|
|
87
|
+
amplifier: hugely powerful, but left to run "open-loop" it overshoots — fluent,
|
|
88
|
+
yet it drifts and hallucinates. Engineers tame such an amplifier by adding a
|
|
89
|
+
**feedback loop**: feed the output back, compare it against a stable reference,
|
|
90
|
+
and trade some raw power for precision and stability. This library is that
|
|
91
|
+
feedback loop for an LLM. The "reference" is plain deterministic code — graph
|
|
92
|
+
checks and schema rules — that the model's output is measured against.
|
|
93
|
+
|
|
94
|
+
There are two kinds of feedback, and the library uses both:
|
|
95
|
+
|
|
96
|
+
### Negative feedback — the stabilizing checks (`run_audit`)
|
|
97
|
+
|
|
98
|
+
This is the half that *grounds and refuses*. In plain terms:
|
|
99
|
+
|
|
100
|
+
| step | what it means |
|
|
101
|
+
|---|---|
|
|
102
|
+
| **regime gate** | First decide whether the text is even the kind of thing we can analyse exactly (a finite, step-by-step process) versus something fuzzy and continuous. Refuse the fuzzy ones. |
|
|
103
|
+
| **extraction + schema** | Ask the model for the state machine, but force the answer into a strict shape — and fall back to a deterministic regex extractor if it won't comply (or if there's no model). |
|
|
104
|
+
| **exact analysis** | Compute provable facts about the graph: dead ends, loops, unreachable steps. (Plus an *optional* finite-field "spectral fingerprint" — see "Honest scope" below.) |
|
|
105
|
+
| **grounded report** | Write the summary using only those verified facts, naming only real states. |
|
|
106
|
+
| **explicit refusal** | When the input is out of regime, or a result can't be made exact, say so — don't guess. |
|
|
107
|
+
|
|
108
|
+
### Positive feedback — the gap-filling loop (`extract_iterative`)
|
|
109
|
+
|
|
110
|
+
A one-shot extraction often silently **drops a branch** — the model says "OK"
|
|
111
|
+
while quietly missing *Investigation → Denied*. Positive feedback fixes that: it
|
|
112
|
+
**re-asks the model about anything the source text mentions that's missing from
|
|
113
|
+
the answer**, and repeats until nothing is missing (a *fixed point*).
|
|
114
|
+
|
|
115
|
+
Positive feedback is where capability *and* instability both live, so it's bounded
|
|
116
|
+
by two negative-feedback safeguards: a deterministic consistency check (does the
|
|
117
|
+
graph cover everything the text mentions?) and a **refusal clamp** — if it can't
|
|
118
|
+
converge within a few passes, it refuses to report a confident-but-incomplete
|
|
119
|
+
result rather than running away. This **refusal-as-stabilizer** is what makes the
|
|
120
|
+
regenerative loop safe.
|
|
121
|
+
|
|
122
|
+
## What's measured so far
|
|
123
|
+
|
|
124
|
+
Indicative results, not benchmarks — small corpora, a 3.8B local model
|
|
125
|
+
(`phi3:mini`), greedy decoding. See [`docs/results.md`](docs/results.md) for the
|
|
126
|
+
full tables and method.
|
|
127
|
+
|
|
128
|
+
**Headline (run on EC2 against a ~28 GB ceiling model, mixtral 8x7B):** on a
|
|
129
|
+
messy, branchy, distractor-laden workflow corpus, the small model **+ the feedback
|
|
130
|
+
loop essentially matches a model ~7× its size.**
|
|
131
|
+
|
|
132
|
+
| configuration | states F1 | transitions F1 |
|
|
133
|
+
|---|---|---|
|
|
134
|
+
| small model (phi3:mini), one-shot | 0.98 | 0.89 |
|
|
135
|
+
| **small model + feedback loop** | **1.00** | **0.90** |
|
|
136
|
+
| big ceiling model (mixtral, ~28 GB), one-shot | 1.00 | 0.91 |
|
|
137
|
+
|
|
138
|
+
→ the loop recovers **100%** of the small→big gap on states and **77%** on
|
|
139
|
+
transitions — and on several individual workflows the closed-loop small model
|
|
140
|
+
*beat* the big model, because the deterministic reference catches edges that raw
|
|
141
|
+
fluency invents or drops.
|
|
142
|
+
|
|
143
|
+
Other measured pieces: extraction states precision/recall ≈ 1.00 / 0.92; the
|
|
144
|
+
regime gate scores 1.00 precision/recall separating finite from continuous on a
|
|
145
|
+
clean corpus (it's brittle on deliberately *mixed* inputs — an open problem).
|
|
146
|
+
|
|
147
|
+
## Documentation
|
|
148
|
+
|
|
149
|
+
| doc | contents |
|
|
150
|
+
|---|---|
|
|
151
|
+
| [`docs/index.md`](docs/index.md) | overview and where to start |
|
|
152
|
+
| [`docs/architecture.md`](docs/architecture.md) | the op-amp model in depth; the pipeline; refusal-as-stabilizer |
|
|
153
|
+
| [`docs/usage.md`](docs/usage.md) | install, the API, the CLI, configuration, bring-your-own-backend |
|
|
154
|
+
| [`docs/results.md`](docs/results.md) | the measured results, method, and honest scope |
|
|
155
|
+
| [`docs/api.md`](docs/api.md) | reference for every public function |
|
|
156
|
+
| [`docs/faq.md`](docs/faq.md) | "do I need a GPU?", "what models?", "does it work offline?" … |
|
|
157
|
+
|
|
158
|
+
## Repository layout
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
src/llm_feedback_control/ the package (zero-dependency, pure standard library)
|
|
162
|
+
llm.py the LLM client + injectable backend + a doctor()
|
|
163
|
+
auditor.py the negative-feedback pipeline (run_audit)
|
|
164
|
+
feedback.py the bounded positive-feedback loop (extract_iterative)
|
|
165
|
+
__main__.py the `lfc` command-line tool
|
|
166
|
+
experiments/ repro scripts for the measured results (not shipped)
|
|
167
|
+
aws/ optional: run a large ceiling model on EC2 (not shipped)
|
|
168
|
+
docs/ the documentation suite
|
|
169
|
+
tests/ deterministic tests (no model / no network)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Honest scope
|
|
173
|
+
|
|
174
|
+
- **A reliability architecture, not a model improvement.** The win is "the system
|
|
175
|
+
knows what it can compute exactly and refuses the rest" — orthogonal to model
|
|
176
|
+
scale. It helps on the *structured / verifiable slice* (workflows, state
|
|
177
|
+
machines, configs), not open-ended generation.
|
|
178
|
+
- **It uses no special mathematics.** The deterministic reference is plain
|
|
179
|
+
graph/text consistency. (The finite-field "spectral fingerprint" is an *optional*
|
|
180
|
+
extra exact check, honestly redundant with graph analysis for most workflow
|
|
181
|
+
audits — keep it or ignore it.)
|
|
182
|
+
- **Needs a deterministic reference.** Where there's nothing to check against, the
|
|
183
|
+
gate (correctly) refuses to claim exactness.
|
|
184
|
+
- **Results are indicative.** Small corpora; treat the numbers as direction, not
|
|
185
|
+
guarantees.
|
|
186
|
+
|
|
187
|
+
## Origin
|
|
188
|
+
|
|
189
|
+
This project is the practical, validated spin-off of an internal research
|
|
190
|
+
investigation. The investigation's grander mathematical claims did not hold up
|
|
191
|
+
under measurement; this engineering architecture — LLM feedback control with
|
|
192
|
+
refusal-as-stabilizer — is the part that did. It stands on its own.
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
MIT with an attribution clause — see [`LICENSE`](LICENSE).
|
|
197
|
+
Built with llm-feedback-control by Edward Chalk (sapientronic.ai).
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-feedback-control"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Reliable, checkable structured output from a small local LLM, by wrapping it in a deterministic feedback loop: a regime gate + exact graph analysis + explicit refusal, plus a bounded re-extraction loop. Zero runtime dependencies; runs with no model at all."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Edward Chalk", email = "edward.chalk@sapientronic.ai" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"llm",
|
|
17
|
+
"feedback-control",
|
|
18
|
+
"structured-extraction",
|
|
19
|
+
"state-machine",
|
|
20
|
+
"workflow",
|
|
21
|
+
"hallucination",
|
|
22
|
+
"reliability",
|
|
23
|
+
"ollama",
|
|
24
|
+
"small-language-model",
|
|
25
|
+
"auditable",
|
|
26
|
+
"refusal",
|
|
27
|
+
]
|
|
28
|
+
classifiers = [
|
|
29
|
+
"Development Status :: 4 - Beta",
|
|
30
|
+
"Intended Audience :: Developers",
|
|
31
|
+
"Intended Audience :: Science/Research",
|
|
32
|
+
"License :: Other/Proprietary License",
|
|
33
|
+
"Operating System :: OS Independent",
|
|
34
|
+
"Programming Language :: Python :: 3",
|
|
35
|
+
"Programming Language :: Python :: 3.8",
|
|
36
|
+
"Programming Language :: Python :: 3.9",
|
|
37
|
+
"Programming Language :: Python :: 3.10",
|
|
38
|
+
"Programming Language :: Python :: 3.11",
|
|
39
|
+
"Programming Language :: Python :: 3.12",
|
|
40
|
+
"Programming Language :: Python :: 3.13",
|
|
41
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
42
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
43
|
+
"Topic :: Text Processing :: Linguistic",
|
|
44
|
+
]
|
|
45
|
+
# No third-party runtime dependencies: the core is pure standard library.
|
|
46
|
+
dependencies = []
|
|
47
|
+
|
|
48
|
+
[project.optional-dependencies]
|
|
49
|
+
# An external LLM is NOT a pip dependency — install Ollama (https://ollama.com)
|
|
50
|
+
# separately, or pass your own generate=... callable (e.g. an OpenAI client).
|
|
51
|
+
aws = ["boto3>=1.26"] # only for the optional EC2 ceiling-model tooling in aws/
|
|
52
|
+
dev = ["pytest>=7.0", "build>=1.0", "twine>=4.0"]
|
|
53
|
+
|
|
54
|
+
[project.urls]
|
|
55
|
+
Homepage = "https://github.com/pcoz/llm-feedback-control"
|
|
56
|
+
Repository = "https://github.com/pcoz/llm-feedback-control"
|
|
57
|
+
Issues = "https://github.com/pcoz/llm-feedback-control/issues"
|
|
58
|
+
Changelog = "https://github.com/pcoz/llm-feedback-control/blob/main/CHANGELOG.md"
|
|
59
|
+
|
|
60
|
+
[project.scripts]
|
|
61
|
+
lfc = "llm_feedback_control.__main__:main"
|
|
62
|
+
|
|
63
|
+
[tool.setuptools.dynamic]
|
|
64
|
+
version = { attr = "llm_feedback_control.__version__" }
|
|
65
|
+
|
|
66
|
+
[tool.setuptools.packages.find]
|
|
67
|
+
where = ["src"]
|
|
68
|
+
include = ["llm_feedback_control*"]
|
|
69
|
+
|
|
70
|
+
[tool.pytest.ini_options]
|
|
71
|
+
minversion = "7.0"
|
|
72
|
+
testpaths = ["tests"]
|
|
73
|
+
python_files = ["test_*.py"]
|
|
74
|
+
addopts = ["-ra"]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""llm-feedback-control — get reliable, checkable structured output from a small
|
|
2
|
+
local language model by wrapping it in ordinary deterministic code.
|
|
3
|
+
|
|
4
|
+
WHAT IT DOES, concretely
|
|
5
|
+
------------------------
|
|
6
|
+
You hand it a process described in plain English::
|
|
7
|
+
|
|
8
|
+
"A claim enters Intake. From Intake it goes to Triage. Triage goes to
|
|
9
|
+
FastTrack or to Investigation. ..."
|
|
10
|
+
|
|
11
|
+
and it:
|
|
12
|
+
1. turns that into a state machine (states + transitions);
|
|
13
|
+
2. computes *provable* facts about it — which steps are dead ends, whether
|
|
14
|
+
there are loops, which steps can't be reached;
|
|
15
|
+
3. writes a report in which every statement is backed by one of those
|
|
16
|
+
checked facts (so it can't quietly make things up);
|
|
17
|
+
4. knows its own limits: if the text isn't actually a finite step-by-step
|
|
18
|
+
process (e.g. "prices drift up as confidence grows"), it REFUSES instead
|
|
19
|
+
of inventing a fake state machine; and if the model's first pass missed
|
|
20
|
+
part of the process, it loops to fill the gaps — or refuses if it can't.
|
|
21
|
+
|
|
22
|
+
WHY "FEEDBACK CONTROL" (the analogy, explained)
|
|
23
|
+
-----------------------------------------------
|
|
24
|
+
The design is borrowed from electronics. A raw LLM is like a very high-gain
|
|
25
|
+
amplifier: hugely powerful, but left to run "open-loop" it overshoots — fluent,
|
|
26
|
+
yet it drifts and hallucinates. Engineers tame such an amplifier by adding a
|
|
27
|
+
*feedback loop*: feed the output back, compare it to a stable reference, and
|
|
28
|
+
trade some raw power for precision and stability. This library is that feedback
|
|
29
|
+
loop for an LLM. The "reference" is plain, deterministic code — graph checks
|
|
30
|
+
and schema rules that the model's output is measured against.
|
|
31
|
+
|
|
32
|
+
Two kinds of feedback, in plain terms:
|
|
33
|
+
|
|
34
|
+
- NEGATIVE feedback = the stabilising checks (``run_audit``):
|
|
35
|
+
* decide first whether the text is even the kind of thing we can analyse
|
|
36
|
+
exactly, and refuse the fuzzy ones;
|
|
37
|
+
* force the model's answer into a strict shape (with a no-model fallback);
|
|
38
|
+
* compute the provable graph facts;
|
|
39
|
+
* say "I can't do this exactly" rather than guess.
|
|
40
|
+
- POSITIVE feedback = the gap-filling loop (``extract_iterative``):
|
|
41
|
+
re-ask the model about anything the text mentions that's missing from
|
|
42
|
+
its answer, repeating until nothing is missing (a "fixed point") — and
|
|
43
|
+
refuse if it never settles.
|
|
44
|
+
|
|
45
|
+
Zero third-party runtime dependencies. The deterministic core runs with no model
|
|
46
|
+
at all; an LLM is a pure upgrade and is fully injectable (pass ``generate=``).
|
|
47
|
+
|
|
48
|
+
Quickstart (works with no model)::
|
|
49
|
+
|
|
50
|
+
from llm_feedback_control import run_audit
|
|
51
|
+
r = run_audit("A claim enters Intake. From Intake it goes to Triage. "
|
|
52
|
+
"Triage goes to FastTrack or to Investigation.")
|
|
53
|
+
print(r["result"]); print(r["report_facts"])
|
|
54
|
+
"""
|
|
55
|
+
from .llm import gen, gen_ceiling, info, doctor, BackendError
|
|
56
|
+
from .auditor import (
|
|
57
|
+
run_audit,
|
|
58
|
+
regime_gate,
|
|
59
|
+
gate_heuristic,
|
|
60
|
+
extract_workflow,
|
|
61
|
+
exact_analysis,
|
|
62
|
+
graph_facts,
|
|
63
|
+
transfer_operator,
|
|
64
|
+
fp_orbit,
|
|
65
|
+
grounded_report,
|
|
66
|
+
valid,
|
|
67
|
+
fallback_extract,
|
|
68
|
+
norm,
|
|
69
|
+
)
|
|
70
|
+
from .feedback import (
|
|
71
|
+
extract_iterative,
|
|
72
|
+
consistency_gaps,
|
|
73
|
+
candidate_states,
|
|
74
|
+
candidate_trans,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
__version__ = "0.1.0"
|
|
78
|
+
|
|
79
|
+
__all__ = [
|
|
80
|
+
# headline
|
|
81
|
+
"run_audit",
|
|
82
|
+
"extract_iterative",
|
|
83
|
+
# negative-feedback pipeline parts
|
|
84
|
+
"regime_gate",
|
|
85
|
+
"gate_heuristic",
|
|
86
|
+
"extract_workflow",
|
|
87
|
+
"exact_analysis",
|
|
88
|
+
"graph_facts",
|
|
89
|
+
"transfer_operator",
|
|
90
|
+
"fp_orbit",
|
|
91
|
+
"grounded_report",
|
|
92
|
+
"valid",
|
|
93
|
+
"fallback_extract",
|
|
94
|
+
"norm",
|
|
95
|
+
# positive-feedback parts
|
|
96
|
+
"consistency_gaps",
|
|
97
|
+
"candidate_states",
|
|
98
|
+
"candidate_trans",
|
|
99
|
+
# client
|
|
100
|
+
"gen",
|
|
101
|
+
"gen_ceiling",
|
|
102
|
+
"info",
|
|
103
|
+
"doctor",
|
|
104
|
+
"BackendError",
|
|
105
|
+
"__version__",
|
|
106
|
+
]
|