ledge-lang-1.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ledge_lang-1.1.0/LICENSE +21 -0
- ledge_lang-1.1.0/PKG-INFO +418 -0
- ledge_lang-1.1.0/README.md +385 -0
- ledge_lang-1.1.0/ledge_lang/__init__.py +192 -0
- ledge_lang-1.1.0/ledge_lang/__main__.py +3 -0
- ledge_lang-1.1.0/ledge_lang/ai_types.py +931 -0
- ledge_lang-1.1.0/ledge_lang/ast_nodes.py +331 -0
- ledge_lang-1.1.0/ledge_lang/audit_store.py +534 -0
- ledge_lang-1.1.0/ledge_lang/backends.py +545 -0
- ledge_lang-1.1.0/ledge_lang/calibration.py +387 -0
- ledge_lang-1.1.0/ledge_lang/cli.py +586 -0
- ledge_lang-1.1.0/ledge_lang/comparison.py +353 -0
- ledge_lang-1.1.0/ledge_lang/compiler/__init__.py +27 -0
- ledge_lang-1.1.0/ledge_lang/compiler/ccodegen.py +943 -0
- ledge_lang-1.1.0/ledge_lang/compiler/codegen.py +656 -0
- ledge_lang-1.1.0/ledge_lang/compiler/targets.py +414 -0
- ledge_lang-1.1.0/ledge_lang/core_types.py +413 -0
- ledge_lang-1.1.0/ledge_lang/debugger.py +356 -0
- ledge_lang-1.1.0/ledge_lang/formatter.py +444 -0
- ledge_lang-1.1.0/ledge_lang/interpreter.py +1432 -0
- ledge_lang-1.1.0/ledge_lang/jit.py +226 -0
- ledge_lang-1.1.0/ledge_lang/lexer.py +483 -0
- ledge_lang-1.1.0/ledge_lang/linter.py +315 -0
- ledge_lang-1.1.0/ledge_lang/lsp.py +655 -0
- ledge_lang-1.1.0/ledge_lang/nl_interface.py +222 -0
- ledge_lang-1.1.0/ledge_lang/parser.py +913 -0
- ledge_lang-1.1.0/ledge_lang/profiler.py +179 -0
- ledge_lang-1.1.0/ledge_lang/stdlib.py +611 -0
- ledge_lang-1.1.0/ledge_lang/studio/__init__.py +0 -0
- ledge_lang-1.1.0/ledge_lang/studio/server.py +174 -0
- ledge_lang-1.1.0/ledge_lang/test_runner.py +103 -0
- ledge_lang-1.1.0/ledge_lang/typechecker.py +536 -0
- ledge_lang-1.1.0/ledge_lang/vm.py +940 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/PKG-INFO +418 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/SOURCES.txt +42 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/dependency_links.txt +1 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/entry_points.txt +2 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/requires.txt +9 -0
- ledge_lang-1.1.0/ledge_lang.egg-info/top_level.txt +1 -0
- ledge_lang-1.1.0/pyproject.toml +47 -0
- ledge_lang-1.1.0/setup.cfg +11 -0
- ledge_lang-1.1.0/tests/test_ledge.py +397 -0
- ledge_lang-1.1.0/tests/test_security.py +185 -0
ledge_lang-1.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ledge Language Project
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
ledge_lang-1.1.0/PKG-INFO
ADDED
@@ -0,0 +1,418 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ledge-lang
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Ledge is a programming language and governance runtime for auditable, uncertainty-aware AI decisions.
|
|
5
|
+
Author: Ledge Language Project
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ledge-lang/ledge
|
|
8
|
+
Project-URL: Documentation, https://github.com/ledge-lang/ledge/blob/main/README.md
|
|
9
|
+
Project-URL: Issue Tracker, https://github.com/ledge-lang/ledge/issues
|
|
10
|
+
Keywords: programming-language,interpreter,ai,language-design
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Interpreters
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
Requires-Dist: black; extra == "dev"
|
|
28
|
+
Requires-Dist: mypy; extra == "dev"
|
|
29
|
+
Provides-Extra: studio
|
|
30
|
+
Requires-Dist: flask>=3.0; extra == "studio"
|
|
31
|
+
Requires-Dist: flask-socketio>=5.0; extra == "studio"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# Ledge
|
|
35
|
+
|
|
36
|
+
**Ledge is a governance runtime and programming language for AI decisions.**
|
|
37
|
+
|
|
38
|
+
It does not trust model confidence blindly.
|
|
39
|
+
It forces confidence handling, records decisions, captures outcomes,
|
|
40
|
+
and calibrates thresholds from real-world accuracy.
|
|
41
|
+
|
|
42
|
+
> Ledge turns AI uncertainty from an informal engineering convention
|
|
43
|
+
> into a typed, auditable, empirically calibrated control system.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## What is Ledge?
|
|
48
|
+
|
|
49
|
+
**For non-technical readers:**
|
|
50
|
+
Every AI system — ChatGPT, Claude, any model — sometimes gives answers it is not sure about. The problem is it does not always tell you when. Ledge is a programming language where, if a developer tries to use an AI answer without first checking how confident the AI was, the program refuses to run. Not as a suggestion. As a hard rule. And every decision the system makes is recorded in a tamper-evident log that can be verified mathematically.
|
|
51
|
+
|
|
52
|
+
**For technical readers:**
|
|
53
|
+
Ledge enforces `Uncertain[T]` as a first-class type. Unsafe use of an uncertain value is caught by a static pre-execution typechecker — the program does not run. The type system, runtime, cryptographic audit trail, and calibration layer form a governance stack for AI decisions. Confidence is not trusted; it is recorded, compared against outcomes, and used to derive statistically grounded thresholds per model and domain.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Why does this matter?
|
|
58
|
+
|
|
59
|
+
Most general-purpose languages allow code like this to run with no error:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
# Python — valid code, no warning
|
|
63
|
+
result = model.classify(patient_symptoms)
|
|
64
|
+
send_treatment_recommendation(result["diagnosis"])
|
|
65
|
+
# What if the model was 30% confident? Nobody checked.
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
In Ledge, this is caught before the program runs:
|
|
69
|
+
|
|
70
|
+
```ledge
|
|
71
|
+
define result as classify(symptoms) using ["urgent", "routine", "monitor"]
|
|
72
|
+
show result
|
|
73
|
+
# STATIC ANALYSIS ERROR: Unsafe use of Uncertain value
|
|
74
|
+
# Confidence was never verified.
|
|
75
|
+
# Fix: check confidence_of(result) before using it.
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
The only way to use the result is to handle uncertainty explicitly:
|
|
79
|
+
|
|
80
|
+
```ledge
|
|
81
|
+
define result as classify(symptoms) using ["urgent", "routine", "monitor"]
|
|
82
|
+
if confidence_of(result) >= 0.85:
|
|
83
|
+
show value_of(result)
|
|
84
|
+
else:
|
|
85
|
+
show "Refer to specialist — confidence too low"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Core position
|
|
91
|
+
|
|
92
|
+
Ledge does not claim that model confidence is inherently trustworthy.
|
|
93
|
+
Although confidence quantification in LLMs remains an open research problem — with active challenges in scalability, interpretability, and robustness — Ledge takes a different stance:
|
|
94
|
+
|
|
95
|
+
**"Although confidence may be imperfect, your system cannot ignore it. And Ledge gives you infrastructure to measure empirically whether it is useful."**
|
|
96
|
+
|
|
97
|
+
That is the claim. It is defensible.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Install and run in 2 minutes
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install ledge-lang
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Then run a real example — no API key needed:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
ledge run examples/showcase/medical_triage.ledge
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Expected output:
|
|
114
|
+
```
|
|
115
|
+
=== MEDICAL TRIAGE SYSTEM ===
|
|
116
|
+
PATIENT P001: ESCALATE TO HUMAN (confidence=0)
|
|
117
|
+
PATIENT P002: ESCALATE TO HUMAN (confidence=0)
|
|
118
|
+
PATIENT P003: ESCALATE TO HUMAN (confidence=0)
|
|
119
|
+
Decisions logged in audit trail: 3
|
|
120
|
+
Cryptographic chain intact: true
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Without a real AI backend connected, every patient escalates to human review.
|
|
124
|
+
Connect a real backend and it will classify with backend-provided confidence estimates.
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Four verifiable guarantees
|
|
129
|
+
|
|
130
|
+
Run these yourself. No API key. No setup. Under 5 minutes.
|
|
131
|
+
|
|
132
|
+
### G1 — Zero confidence without a backend
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python demo_guarantee1.py
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
classify without backend -> confidence = 0
|
|
140
|
+
analyze without backend -> confidence = 0
|
|
141
|
+
generate without backend -> confidence = 0
|
|
142
|
+
Guarantee verified: without backend, confidence = 0
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Without a real model connected, confidence is always exactly zero. The system cannot invent certainty.
|
|
146
|
+
|
|
147
|
+
### G2 — Unsafe AI use is caught before execution
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
python demo_guarantee2.py
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
```
|
|
154
|
+
Unsafe code errors detected: 1
|
|
155
|
+
Message: Unsafe use of Uncertain value 'r' in 'show'
|
|
156
|
+
confidence was never verified
|
|
157
|
+
Safe code errors detected: 0
|
|
158
|
+
Guarantee verified.
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The static typechecker catches unsafe code before the program runs — not after something goes wrong in production. If the typechecker itself encounters an internal bug, it raises `TypecheckerInternalError` with a full stack trace — it never silently returns an empty result.
|
|
162
|
+
|
|
163
|
+
*Note on terminology: Ledge is currently interpreted, not compiled in the traditional sense. "Compile-time" refers to the pre-execution static analysis phase that runs before any code executes.*
|
|
164
|
+
|
|
165
|
+
### G3 — Cryptographic audit trail
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
python demo_guarantee3.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
Entries recorded: 3
|
|
173
|
+
Chain intact (initial): True
|
|
174
|
+
After modifying confidence: False
|
|
175
|
+
After inserting fake entry: False
|
|
176
|
+
Guarantee verified: any modification breaks the chain.
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Every AI decision is recorded with a SHA-256 hash chain. Changing any field — confidence, timestamp, result — breaks the chain and is detected immediately. An external anchor file (`~/.ledge/anchors.jsonl`) records chain state every 10 decisions — if the SQLite database is deleted and regenerated, the anchors detect the inconsistency.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
ledge audit --verify-anchors # verify anchor file against current database
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### G4 — Safe failure by design
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
python demo_guarantee4.py
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
```
|
|
192
|
+
Patients escalated to human: 3
|
|
193
|
+
Patients classified automatic: 0
|
|
194
|
+
Guarantee verified: without backend, zero automatic decisions.
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Without a backend, the system escalates to human. It does not approve. It does not classify. It waits.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Five layers of AI governance
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
Uncertain output → forced handling → logged decision
|
|
205
|
+
→ recorded outcome → calibrated threshold
|
|
206
|
+
→ safer future decision
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Layer 1 — Pre-execution uncertainty safety
|
|
210
|
+
|
|
211
|
+
Unsafe AI use is a static analysis error. The program does not run. *(See G2 above.)*
|
|
212
|
+
|
|
213
|
+
### Layer 2 — Runtime confidence provenance
|
|
214
|
+
|
|
215
|
+
Confidence comes from the backend you connect. Without one, it is always exactly 0.0. *(See G1 above.)*
|
|
216
|
+
|
|
217
|
+
**How confidence is computed per backend:**
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from ledge_lang.backends import openai_backend, anthropic_backend
|
|
221
|
+
|
|
222
|
+
# OpenAI: uses token log-probabilities over classification labels.
|
|
223
|
+
# This is a token-probability-derived confidence estimate.
|
|
224
|
+
# It is more grounded than self-reported confidence, but it is
|
|
225
|
+
# not assumed to be calibrated. The calibration layer (Layer 3)
|
|
226
|
+
# still measures it against real outcomes per model and domain.
|
|
227
|
+
backend = openai_backend(api_key="sk-...", model="gpt-4o-mini")
|
|
228
|
+
|
|
229
|
+
# Anthropic: uses structured self-assessment — the model is asked
|
|
230
|
+
# to return a confidence score alongside its answer as part of
|
|
231
|
+
# the structured output. This is not a native probability score
|
|
232
|
+
# from the model weights.
|
|
233
|
+
backend = anthropic_backend(api_key="sk-ant-...", model="claude-3-haiku-20240307")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**Important:** "Confidence from backend" does not mean "the model is 92% sure in an absolute sense." It means the backend returned a score using the method above. For OpenAI logprobs, this is a token-probability-derived confidence estimate — more grounded than self-reported confidence, but not assumed to be calibrated for your domain. For Anthropic structured output, it is a self-reported score. Treat both as signals, not ground truth. The calibration layer (Layer 3) exists precisely because these scores may not reflect real accuracy.
|
|
237
|
+
|
|
238
|
+
### Layer 3 — Domain calibration
|
|
239
|
+
|
|
240
|
+
Ledge compares declared confidence against real outcomes per model and domain:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
ledge audit --calibration gpt-4 medical
|
|
244
|
+
ledge audit --calibration-metrics gpt-4 medical
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Real output from the audit system:
|
|
248
|
+
|
|
249
|
+
```
|
|
250
|
+
Calibration Report: gpt-4 / medical (n=30)
|
|
251
|
+
RANGE COUNT MEAN_CONF ACCURACY CAL_ERROR
|
|
252
|
+
0.8-0.9 20 0.848 0.850 0.002
|
|
253
|
+
0.9-1.0 10 0.924 0.700 0.224 <- overconfident
|
|
254
|
+
WARNING: only 10 samples in 0.9-1.0 bucket
|
|
255
|
+
Threshold in this range is provisional (n < 30)
|
|
256
|
+
|
|
257
|
+
Brier score : 0.1711 (lower is better; 0.0 is perfect)
|
|
258
|
+
ECE : 0.0756 (lower is better; <0.10 is a rough heuristic)
|
|
259
|
+
False accept rate : 0.1429 (accepted when wrong)
|
|
260
|
+
False reject rate : 0.7826 (rejected when right)
|
|
261
|
+
Calibrated threshold: 0.921 (provisional — see warning above)
|
|
262
|
+
Well calibrated : False (overconfident in 0.9-1.0 range)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
The backend reports 0.9+ confidence in the medical domain but achieves only 70% accuracy there. Ledge detects this and raises the threshold. The warning flags that 10 samples is below the recommended minimum of 30 for reliable threshold estimation.
|
|
266
|
+
|
|
267
|
+
See [CALIBRATION_GUIDE.md](CALIBRATION_GUIDE.md) for minimum sample requirements, limitations of self-reported outcomes, and drift handling.
|
|
268
|
+
|
|
269
|
+
### Layer 4 — Adaptive thresholds
|
|
270
|
+
|
|
271
|
+
The default 0.85 is arbitrary. Replace it with a value derived from your actual outcomes:
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from ledge_lang.calibration import DomainCalibrator
|
|
275
|
+
|
|
276
|
+
threshold = calibrator.get_calibrated_threshold(
|
|
277
|
+
'gpt-4', 'medical', desired_accuracy=0.90, min_samples=30
|
|
278
|
+
)
|
|
279
|
+
# Returns calibrated threshold based on observed outcomes
|
|
280
|
+
# Returns default 0.85 with a warning if n < min_samples
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Layer 5 — Compliance-supporting audit trail
|
|
284
|
+
|
|
285
|
+
Every decision exports in a structured format designed for EU AI Act Article 12/13 evidence documentation:
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
ledge audit --export-regulatory report.json
|
|
289
|
+
ledge audit --validate-regulatory report.json
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
```
|
|
293
|
+
VALIDATION PASSED — compliance-supporting Article 12/13 export is structurally valid
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
*What this means:* The export matches Ledge's evidence schema for Articles 12 (logging and monitoring) and 13 (transparency). Article 12 covers automatic logging capabilities during the system lifecycle. Article 13 covers transparency sufficient for deployers to understand and appropriately use AI outputs. Generating a structurally valid JSON-LD export is a necessary but not sufficient condition for regulatory compliance. The schema structure is inspectable by running `--export-regulatory` and examining the output. Consult legal counsel for your specific use case and jurisdiction.
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
## Showcase examples
|
|
301
|
+
|
|
302
|
+
Each of these runs without an API key:
|
|
303
|
+
|
|
304
|
+
```bash
|
|
305
|
+
ledge run examples/showcase/financial_analysis.ledge # credit risk assessment
|
|
306
|
+
ledge run examples/showcase/legal_contracts.ledge # contract clause review
|
|
307
|
+
ledge run examples/showcase/email_scanner.ledge # phishing detection
|
|
308
|
+
ledge run examples/showcase/hiring_screen.ledge # candidate screening
|
|
309
|
+
ledge run examples/showcase/loan_approval.ledge # Basel III + EU AI Act Article 14
|
|
310
|
+
ledge run examples/showcase/medical_record.ledge # diagnosis with audit trail
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
All show the same pattern: without a real AI backend, every decision escalates to human review. No automatic decisions without evidence.
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
## Frequently asked questions
|
|
318
|
+
|
|
319
|
+
**Who is this for?**
|
|
320
|
+
Developers building systems that use AI models in production — especially in regulated industries: healthcare, finance, legal, insurance.
|
|
321
|
+
|
|
322
|
+
**Does it replace Python?**
|
|
323
|
+
No. Ledge is for the layer where AI decisions are made and must be governed. Python and other languages handle everything else.
|
|
324
|
+
|
|
325
|
+
**What about the confidence score — is it actually accurate?**
|
|
326
|
+
Ledge does not claim confidence scores are accurate. It enforces that they are handled, logged, and compared against real outcomes over time. The calibration layer measures how accurate they actually are. See [CALIBRATION_GUIDE.md](CALIBRATION_GUIDE.md).
|
|
327
|
+
|
|
328
|
+
**Is the audit trail legally acceptable?**
|
|
329
|
+
The export meets the structural requirements of Ledge's Article 12/13 evidence schema. Whether it satisfies legal compliance in your specific jurisdiction requires legal counsel.
|
|
330
|
+
|
|
331
|
+
**Is this production-ready?**
|
|
332
|
+
Honestly: it is a working prototype with real, verifiable guarantees.
|
|
333
|
+
|
|
334
|
+
What works today:
|
|
335
|
+
- The four guarantees (verified by 284 conformance tests + 338 unit tests)
|
|
336
|
+
- Cryptographic audit trail with hash chains and external anchor verification
|
|
337
|
+
- OpenAI backend using real token log-probabilities for confidence
|
|
338
|
+
- Domain calibration with Brier score, ECE, and false accept/reject rates
|
|
339
|
+
- Weighted chain confidence with position decay and weak-step penalization
|
|
340
|
+
- Compliance-supporting regulatory export
|
|
341
|
+
|
|
342
|
+
What does not yet exist:
|
|
343
|
+
- Distributed audit storage
|
|
344
|
+
- A mature package ecosystem
|
|
345
|
+
- Known production deployments
|
|
346
|
+
|
|
347
|
+
---
|
|
348
|
+
|
|
349
|
+
## How Ledge relates to existing work
|
|
350
|
+
|
|
351
|
+
**Turn** — Kizito, 2024
|
|
352
|
+
[arxiv:2603.08755](https://arxiv.org/abs/2603.08755)
|
|
353
|
+
Introduces typed LLM inference as a language primitive with a confidence operator. Designed for agentic systems where LLMs write code. Ledge targets developers building systems that *use* LLMs, and adds domain calibration, outcome tracking, and cryptographic audit trails not present in Turn.
|
|
354
|
+
|
|
355
|
+
**QUASAR** — 2025
|
|
356
|
+
[arxiv:2506.12202](https://arxiv.org/abs/2506.12202) | [OpenReview](https://openreview.net/forum?id=TvpaeQVTGQ)
|
|
357
|
+
A language for LLM code actions with uncertainty quantification via conformal prediction. Transpiles from Python written by LLMs. Ledge is written by developers and enforces confidence handling at analysis time.
|
|
358
|
+
|
|
359
|
+
**IMMACULATE** — Guo et al., 2026
|
|
360
|
+
[arxiv:2602.22700](https://arxiv.org/abs/2602.22700)
|
|
361
|
+
Audits whether LLM API providers execute the model they claim. Ledge audits whether the *code using* those models handles their output safely. Complementary, not competing.
|
|
362
|
+
|
|
363
|
+
**SAUP** — Zhao et al., 2024
|
|
364
|
+
[arxiv:2412.01033](https://arxiv.org/abs/2412.01033)
|
|
365
|
+
Uncertainty propagation through multi-step LLM agent reasoning at runtime using situational weights. Ledge implements transitive uncertainty propagation as `chain_confidence()` at the language level — using position-weighted confidence decay and weak-step penalization — and enforces it at analysis time.
|
|
366
|
+
|
|
367
|
+
**On the research gap:**
|
|
368
|
+
We found no published work combining pre-execution enforcement of AI confidence handling with empirical domain calibration and cryptographic audit trails in a single language. If you know of relevant work we missed, open an issue.
|
|
369
|
+
|
|
370
|
+
---
|
|
371
|
+
|
|
372
|
+
## Security model
|
|
373
|
+
|
|
374
|
+
No Python `eval()` or `exec()`. Ledge uses a custom tree-walker interpreter — Python's object introspection escape paths do not apply.
|
|
375
|
+
|
|
376
|
+
Python FFI imports are blocked by default:
|
|
377
|
+
|
|
378
|
+
```bash
|
|
379
|
+
ledge run program.ledge --safe-mode # blocks all imports
|
|
380
|
+
ledge run program.ledge --allow-import=math,json # whitelist specific modules
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
For server deployments where users submit Ledge code: run inside Docker.
|
|
384
|
+
`--safe-mode` is not a replacement for OS-level isolation.
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
## Tests
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
python tests/conformance.py # 284/284 passed
|
|
392
|
+
python -m pytest tests/unit/ # 338 passed, 1 pre-existing Windows encoding failure
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
---
|
|
396
|
+
|
|
397
|
+
## What Ledge does not do
|
|
398
|
+
|
|
399
|
+
- Does not prove that any individual confidence score is accurate at decision time — it measures and calibrates confidence empirically over time
|
|
400
|
+
- Does not replace Python for general-purpose programming
|
|
401
|
+
- No package ecosystem beyond 15 included packages
|
|
402
|
+
- Native compiler requires `gcc` (experimental)
|
|
403
|
+
- Ledge Studio requires `pip install "ledge-lang[studio]"`
|
|
404
|
+
- Known production deployments: zero
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
## License
|
|
409
|
+
|
|
410
|
+
MIT
|
|
411
|
+
|
|
412
|
+
---
|
|
413
|
+
|
|
414
|
+
## Questions and feedback
|
|
415
|
+
|
|
416
|
+
If something breaks, a claim does not hold up, or you know existing work that does this better — open an issue.
|
|
417
|
+
|
|
418
|
+
If you use Ledge in a real system, even experimentally — we want to hear about it.
|