ledge-lang 1.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. ledge_lang-1.1.0/LICENSE +21 -0
  2. ledge_lang-1.1.0/PKG-INFO +418 -0
  3. ledge_lang-1.1.0/README.md +385 -0
  4. ledge_lang-1.1.0/ledge_lang/__init__.py +192 -0
  5. ledge_lang-1.1.0/ledge_lang/__main__.py +3 -0
  6. ledge_lang-1.1.0/ledge_lang/ai_types.py +931 -0
  7. ledge_lang-1.1.0/ledge_lang/ast_nodes.py +331 -0
  8. ledge_lang-1.1.0/ledge_lang/audit_store.py +534 -0
  9. ledge_lang-1.1.0/ledge_lang/backends.py +545 -0
  10. ledge_lang-1.1.0/ledge_lang/calibration.py +387 -0
  11. ledge_lang-1.1.0/ledge_lang/cli.py +586 -0
  12. ledge_lang-1.1.0/ledge_lang/comparison.py +353 -0
  13. ledge_lang-1.1.0/ledge_lang/compiler/__init__.py +27 -0
  14. ledge_lang-1.1.0/ledge_lang/compiler/ccodegen.py +943 -0
  15. ledge_lang-1.1.0/ledge_lang/compiler/codegen.py +656 -0
  16. ledge_lang-1.1.0/ledge_lang/compiler/targets.py +414 -0
  17. ledge_lang-1.1.0/ledge_lang/core_types.py +413 -0
  18. ledge_lang-1.1.0/ledge_lang/debugger.py +356 -0
  19. ledge_lang-1.1.0/ledge_lang/formatter.py +444 -0
  20. ledge_lang-1.1.0/ledge_lang/interpreter.py +1432 -0
  21. ledge_lang-1.1.0/ledge_lang/jit.py +226 -0
  22. ledge_lang-1.1.0/ledge_lang/lexer.py +483 -0
  23. ledge_lang-1.1.0/ledge_lang/linter.py +315 -0
  24. ledge_lang-1.1.0/ledge_lang/lsp.py +655 -0
  25. ledge_lang-1.1.0/ledge_lang/nl_interface.py +222 -0
  26. ledge_lang-1.1.0/ledge_lang/parser.py +913 -0
  27. ledge_lang-1.1.0/ledge_lang/profiler.py +179 -0
  28. ledge_lang-1.1.0/ledge_lang/stdlib.py +611 -0
  29. ledge_lang-1.1.0/ledge_lang/studio/__init__.py +0 -0
  30. ledge_lang-1.1.0/ledge_lang/studio/server.py +174 -0
  31. ledge_lang-1.1.0/ledge_lang/test_runner.py +103 -0
  32. ledge_lang-1.1.0/ledge_lang/typechecker.py +536 -0
  33. ledge_lang-1.1.0/ledge_lang/vm.py +940 -0
  34. ledge_lang-1.1.0/ledge_lang.egg-info/PKG-INFO +418 -0
  35. ledge_lang-1.1.0/ledge_lang.egg-info/SOURCES.txt +42 -0
  36. ledge_lang-1.1.0/ledge_lang.egg-info/dependency_links.txt +1 -0
  37. ledge_lang-1.1.0/ledge_lang.egg-info/entry_points.txt +2 -0
  38. ledge_lang-1.1.0/ledge_lang.egg-info/requires.txt +9 -0
  39. ledge_lang-1.1.0/ledge_lang.egg-info/top_level.txt +1 -0
  40. ledge_lang-1.1.0/pyproject.toml +47 -0
  41. ledge_lang-1.1.0/setup.cfg +11 -0
  42. ledge_lang-1.1.0/tests/test_ledge.py +397 -0
  43. ledge_lang-1.1.0/tests/test_security.py +185 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Ledge Language Project
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,418 @@
+ Metadata-Version: 2.4
+ Name: ledge-lang
+ Version: 1.1.0
+ Summary: Ledge is a programming language and governance runtime for auditable, uncertainty-aware AI decisions.
+ Author: Ledge Language Project
+ License: MIT
+ Project-URL: Homepage, https://github.com/ledge-lang/ledge
+ Project-URL: Documentation, https://github.com/ledge-lang/ledge/blob/main/README.md
+ Project-URL: Issue Tracker, https://github.com/ledge-lang/ledge/issues
+ Keywords: programming-language,interpreter,ai,language-design
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Interpreters
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: mypy; extra == "dev"
+ Provides-Extra: studio
+ Requires-Dist: flask>=3.0; extra == "studio"
+ Requires-Dist: flask-socketio>=5.0; extra == "studio"
+ Dynamic: license-file
+
+ # Ledge
+
+ **Ledge is a governance runtime and programming language for AI decisions.**
+
+ It does not trust model confidence blindly.
+ It forces confidence handling, records decisions, captures outcomes,
+ and calibrates thresholds from real-world accuracy.
+
+ > Ledge turns AI uncertainty from an informal engineering convention
+ > into a typed, auditable, empirically calibrated control system.
+
+ ---
+
+ ## What is Ledge?
+
+ **For non-technical readers:**
+ Every AI system — ChatGPT, Claude, any model — sometimes gives answers it is not sure about. The problem is that it does not always tell you when. Ledge is a programming language where, if a developer tries to use an AI answer without first checking how confident the AI was, the program refuses to run. Not as a suggestion. As a hard rule. And every decision the system makes is recorded in a tamper-evident log that can be verified mathematically.
+
+ **For technical readers:**
+ Ledge enforces `Uncertain[T]` as a first-class type. Unsafe use of an uncertain value is caught by a static pre-execution typechecker — the program does not run. The type system, runtime, cryptographic audit trail, and calibration layer form a governance stack for AI decisions. Confidence is not trusted; it is recorded, compared against outcomes, and used to derive statistically grounded thresholds per model and domain.
+
+ ---
+
+ ## Why does this matter?
+
+ Most general-purpose languages allow code like this to run with no error:
+
+ ```python
+ # Python — valid code, no warning
+ result = model.classify(patient_symptoms)
+ send_treatment_recommendation(result["diagnosis"])
+ # What if the model was 30% confident? Nobody checked.
+ ```
+
+ In Ledge, this is caught before the program runs:
+
+ ```ledge
+ define result as classify(symptoms) using ["urgent", "routine", "monitor"]
+ show result
+ # STATIC ANALYSIS ERROR: Unsafe use of Uncertain value
+ # Confidence was never verified.
+ # Fix: check confidence_of(result) before using it.
+ ```
+
+ The only way to use the result is to handle uncertainty explicitly:
+
+ ```ledge
+ define result as classify(symptoms) using ["urgent", "routine", "monitor"]
+ if confidence_of(result) >= 0.85:
+     show value_of(result)
+ else:
+     show "Refer to specialist — confidence too low"
+ ```
+
+ ---
+
+ ## Core position
+
+ Ledge does not claim that model confidence is inherently trustworthy.
+ Confidence quantification in LLMs remains an open research problem, with active challenges in scalability, interpretability, and robustness. Ledge takes a pragmatic stance:
+
+ **"Although confidence may be imperfect, your system cannot ignore it. And Ledge gives you infrastructure to measure empirically whether it is useful."**
+
+ That is the claim. It is defensible.
+
+ ---
+
+ ## Install and run in 2 minutes
+
+ ```bash
+ pip install ledge-lang
+ ```
+
+ Then run a real example — no API key needed:
+
+ ```bash
+ ledge run examples/showcase/medical_triage.ledge
+ ```
+
+ Expected output:
+ ```
+ === MEDICAL TRIAGE SYSTEM ===
+ PATIENT P001: ESCALATE TO HUMAN (confidence=0)
+ PATIENT P002: ESCALATE TO HUMAN (confidence=0)
+ PATIENT P003: ESCALATE TO HUMAN (confidence=0)
+ Decisions logged in audit trail: 3
+ Cryptographic chain intact: true
+ ```
+
+ Without a real AI backend connected, every patient escalates to human review.
+ Connect a real backend and it will classify with backend-provided confidence estimates.
+
+ ---
+
+ ## Four verifiable guarantees
+
+ Run these yourself. No API key. No setup. Under 5 minutes.
+
+ ### G1 — Zero confidence without a backend
+
+ ```bash
+ python demo_guarantee1.py
+ ```
+
+ ```
+ classify without backend -> confidence = 0
+ analyze without backend  -> confidence = 0
+ generate without backend -> confidence = 0
+ Guarantee verified: without backend, confidence = 0
+ ```
+
+ Without a real model connected, confidence is always exactly zero. The system cannot invent certainty.
+
+ ### G2 — Unsafe AI use is caught before execution
+
+ ```bash
+ python demo_guarantee2.py
+ ```
+
+ ```
+ Unsafe code errors detected: 1
+ Message: Unsafe use of Uncertain value 'r' in 'show'
+          confidence was never verified
+ Safe code errors detected: 0
+ Guarantee verified.
+ ```
+
+ The static typechecker catches unsafe code before the program runs — not after something goes wrong in production. If the typechecker itself encounters an internal bug, it raises `TypecheckerInternalError` with a full stack trace — it never silently returns an empty result.
+
+ *Note on terminology: Ledge is currently interpreted, not compiled in the traditional sense. "Compile-time" refers to the pre-execution static analysis phase that runs before any code executes.*
+
+ ### G3 — Cryptographic audit trail
+
+ ```bash
+ python demo_guarantee3.py
+ ```
+
+ ```
+ Entries recorded: 3
+ Chain intact (initial): True
+ After modifying confidence: False
+ After inserting fake entry: False
+ Guarantee verified: any modification breaks the chain.
+ ```
+
+ Every AI decision is recorded with a SHA-256 hash chain. Changing any field — confidence, timestamp, result — breaks the chain and is detected immediately. An external anchor file (`~/.ledge/anchors.jsonl`) records chain state every 10 decisions — if the SQLite database is deleted and regenerated, the anchors detect the inconsistency.
+
+ ```bash
+ ledge audit --verify-anchors   # verify anchor file against current database
+ ```
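+
+ To see why tampering is detectable, here is a minimal sketch of the hash-chain idea. It is illustrative only: the record layout (`data`, `hash`) and the `GENESIS` seed are assumptions made for this example, not the actual schema of `ledge_lang/audit_store.py`.
+
+ ```python
+ import hashlib
+ import json
+
+ def entry_hash(entry: dict, prev_hash: str) -> str:
+     # Hash the previous link together with a canonical serialization of
+     # the entry, so changing any field invalidates this and every later link.
+     payload = json.dumps(entry, sort_keys=True).encode()
+     return hashlib.sha256(prev_hash.encode() + payload).hexdigest()
+
+ def verify_chain(log: list) -> bool:
+     prev = "GENESIS"
+     for record in log:
+         if record["hash"] != entry_hash(record["data"], prev):
+             return False  # tampering detected at this link
+         prev = record["hash"]
+     return True
+
+ # Append-only log: each record stores its payload and the resulting link hash.
+ log, prev = [], "GENESIS"
+ for data in [{"decision": "escalate", "confidence": 0.0},
+              {"decision": "escalate", "confidence": 0.0}]:
+     h = entry_hash(data, prev)
+     log.append({"data": data, "hash": h})
+     prev = h
+
+ assert verify_chain(log)
+ log[0]["data"]["confidence"] = 0.99   # tamper with one recorded field
+ assert not verify_chain(log)          # the chain breaks immediately
+ ```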
+
+ ### G4 — Safe failure by design
+
+ ```bash
+ python demo_guarantee4.py
+ ```
+
+ ```
+ Patients escalated to human: 3
+ Patients classified automatic: 0
+ Guarantee verified: without backend, zero automatic decisions.
+ ```
+
+ Without a backend, the system escalates to human. It does not approve. It does not classify. It waits.
+
+ ---
+
+ ## Five layers of AI governance
+
+ ```
+ Uncertain output → forced handling → logged decision
+   → recorded outcome → calibrated threshold
+   → safer future decision
+ ```
+
+ ### Layer 1 — Pre-execution uncertainty safety
+
+ Unsafe AI use is a static analysis error. The program does not run. *(See G2 above.)*
+
+ ### Layer 2 — Runtime confidence provenance
+
+ Confidence comes from the backend you connect. Without one, it is always exactly 0.0. *(See G1 above.)*
+
+ **How confidence is computed per backend:**
+
+ ```python
+ from ledge_lang.backends import openai_backend, anthropic_backend
+
+ # OpenAI: uses token log-probabilities over classification labels.
+ # This is a token-probability-derived confidence estimate.
+ # It is more grounded than self-reported confidence, but it is
+ # not assumed to be calibrated. The calibration layer (Layer 3)
+ # still measures it against real outcomes per model and domain.
+ backend = openai_backend(api_key="sk-...", model="gpt-4o-mini")
+
+ # Anthropic: uses structured self-assessment — the model is asked
+ # to return a confidence score alongside its answer as part of
+ # the structured output. This is not a native probability score
+ # from the model weights.
+ backend = anthropic_backend(api_key="sk-ant-...", model="claude-3-haiku-20240307")
+ ```
+
+ **Important:** "Confidence from backend" does not mean "the model is 92% sure in an absolute sense." It means the backend returned a score using one of the methods above: a token-probability-derived estimate (OpenAI logprobs) or a self-reported score (Anthropic structured output). Treat both as signals, not ground truth. The calibration layer (Layer 3) exists precisely because these scores may not reflect real accuracy.
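+
+ For intuition, here is a simplified sketch of turning token log-probabilities into a label confidence. It is an illustration of the general technique, not the exact computation in `ledge_lang/backends.py`: it assumes each label is a single token and that the backend exposes the top log-probabilities for the answer position.
+
+ ```python
+ import math
+
+ def label_confidence(top_logprobs: dict, labels: list) -> tuple:
+     # top_logprobs: candidate tokens mapped to log-probabilities for the
+     # answer position (illustrative input shape, not a specific SDK type).
+     # Convert to probabilities, keep only the mass on allowed labels,
+     # and renormalize so labels are compared against each other.
+     probs = {tok: math.exp(lp) for tok, lp in top_logprobs.items()}
+     label_mass = {lab: probs.get(lab, 0.0) for lab in labels}
+     total = sum(label_mass.values())
+     if total == 0.0:
+         return None, 0.0  # no mass on any allowed label: confidence stays 0
+     best = max(label_mass, key=label_mass.get)
+     return best, label_mass[best] / total
+
+ label, conf = label_confidence(
+     {"urgent": -0.12, "routine": -2.4, "monitor": -4.1},
+     ["urgent", "routine", "monitor"],
+ )
+ print(label, round(conf, 3))  # urgent 0.892
+ ```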
+
+ ### Layer 3 — Domain calibration
+
+ Ledge compares declared confidence against real outcomes per model and domain:
+
+ ```bash
+ ledge audit --calibration gpt-4 medical
+ ledge audit --calibration-metrics gpt-4 medical
+ ```
+
+ Real output from the audit system:
+
+ ```
+ Calibration Report: gpt-4 / medical (n=30)
+ RANGE     COUNT  MEAN_CONF  ACCURACY  CAL_ERROR
+ 0.8-0.9      20      0.848     0.850      0.002
+ 0.9-1.0      10      0.924     0.700      0.224  <- overconfident
+ WARNING: only 10 samples in 0.9-1.0 bucket
+          Threshold in this range is provisional (n < 30)
+
+ Brier score         : 0.1711 (lower is better; 0.0 is perfect)
+ ECE                 : 0.0756 (lower is better; <0.10 is a rough heuristic)
+ False accept rate   : 0.1429 (accepted when wrong)
+ False reject rate   : 0.7826 (rejected when right)
+ Calibrated threshold: 0.921  (provisional — see warning above)
+ Well calibrated     : False  (overconfident in 0.9-1.0 range)
+ ```
+
+ The backend reports 0.9+ confidence in the medical domain but achieves only 70% accuracy there. Ledge detects this and raises the threshold. The warning flags that 10 samples is below the recommended minimum of 30 for reliable threshold estimation.
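+
+ Both headline metrics are standard and easy to compute from (confidence, outcome) pairs. Here is a minimal sketch of their definitions, using synthetic records shaped like the report above (20 samples near 0.85, 10 near 0.92); the Brier score reproduces the reported 0.1711, while the exact ECE depends on per-sample confidences:
+
+ ```python
+ def brier_score(records):
+     # records: (confidence, correct) pairs; correct is 1 if the decision
+     # matched the recorded outcome, else 0.
+     return sum((c - ok) ** 2 for c, ok in records) / len(records)
+
+ def expected_calibration_error(records, bins=10):
+     # Bucket by confidence, then take the sample-weighted average gap
+     # between mean confidence and observed accuracy in each bucket.
+     buckets = [[] for _ in range(bins)]
+     for c, ok in records:
+         buckets[min(int(c * bins), bins - 1)].append((c, ok))
+     ece = 0.0
+     for b in buckets:
+         if not b:
+             continue
+         mean_conf = sum(c for c, _ in b) / len(b)
+         accuracy = sum(ok for _, ok in b) / len(b)
+         ece += (len(b) / len(records)) * abs(mean_conf - accuracy)
+     return ece
+
+ records = [(0.85, 1)] * 17 + [(0.85, 0)] * 3 + [(0.92, 1)] * 7 + [(0.92, 0)] * 3
+ print(round(brier_score(records), 4))                 # 0.1711
+ print(round(expected_calibration_error(records), 4))  # 0.0733
+ ```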
+
+ See [CALIBRATION_GUIDE.md](CALIBRATION_GUIDE.md) for minimum sample requirements, limitations of self-reported outcomes, and drift handling.
+
+ ### Layer 4 — Adaptive thresholds
+
+ The default 0.85 is arbitrary. Replace it with a value derived from your actual outcomes:
+
+ ```python
+ from ledge_lang.calibration import DomainCalibrator
+
+ calibrator = DomainCalibrator()  # constructor arguments, if any, omitted here
+ threshold = calibrator.get_calibrated_threshold(
+     'gpt-4', 'medical', desired_accuracy=0.90, min_samples=30
+ )
+ # Returns a calibrated threshold based on observed outcomes
+ # Returns the default 0.85 with a warning if n < min_samples
+ ```
+
+ ### Layer 5 — Compliance-supporting audit trail
+
+ Every decision exports in a structured format designed for EU AI Act Article 12/13 evidence documentation:
+
+ ```bash
+ ledge audit --export-regulatory report.json
+ ledge audit --validate-regulatory report.json
+ ```
+
+ ```
+ VALIDATION PASSED — compliance-supporting Article 12/13 export is structurally valid
+ ```
+
+ *What this means:* The export matches Ledge's evidence schema for Articles 12 (logging and monitoring) and 13 (transparency). Article 12 covers automatic logging capabilities during the system lifecycle. Article 13 covers transparency sufficient for deployers to understand and appropriately use AI outputs. Generating a structurally valid JSON-LD export is a necessary but not sufficient condition for regulatory compliance. The schema structure is inspectable by running `--export-regulatory` and examining the output. Consult legal counsel for your specific use case and jurisdiction.
+
+ ---
+
+ ## Showcase examples
+
+ Each of these runs without an API key:
+
+ ```bash
+ ledge run examples/showcase/financial_analysis.ledge  # credit risk assessment
+ ledge run examples/showcase/legal_contracts.ledge     # contract clause review
+ ledge run examples/showcase/email_scanner.ledge       # phishing detection
+ ledge run examples/showcase/hiring_screen.ledge       # candidate screening
+ ledge run examples/showcase/loan_approval.ledge       # Basel III + EU AI Act Article 14
+ ledge run examples/showcase/medical_record.ledge      # diagnosis with audit trail
+ ```
+
+ All show the same pattern: without a real AI backend, every decision escalates to human review. No automatic decisions without evidence.
+
+ ---
+
+ ## Frequently asked questions
+
+ **Who is this for?**
+ Developers building systems that use AI models in production — especially in regulated industries: healthcare, finance, legal, insurance.
+
+ **Does it replace Python?**
+ No. Ledge is for the layer where AI decisions are made and must be governed. Python and other languages handle everything else.
+
+ **What about the confidence score — is it actually accurate?**
+ Ledge does not claim confidence scores are accurate. It enforces that they are handled, logged, and compared against real outcomes over time. The calibration layer measures how accurate they actually are. See [CALIBRATION_GUIDE.md](CALIBRATION_GUIDE.md).
+
+ **Is the audit trail legally acceptable?**
+ The export meets the structural requirements of Ledge's Article 12/13 evidence schema. Whether it satisfies legal compliance in your specific jurisdiction requires legal counsel.
+
+ **Is this production-ready?**
+ Honestly: it is a working prototype with real, verifiable guarantees.
+
+ What works today:
+ - The four guarantees (verified by 284 conformance tests + 338 unit tests)
+ - Cryptographic audit trail with hash chains and external anchor verification
+ - OpenAI backend using real token log-probabilities for confidence
+ - Domain calibration with Brier score, ECE, and false accept/reject rates
+ - Weighted chain confidence with position decay and weak-step penalization
+ - Compliance-supporting regulatory export
+
+ What does not yet exist:
+ - Distributed audit storage
+ - A mature package ecosystem
+ - Known production deployments
+
+ ---
+
+ ## How Ledge relates to existing work
+
+ **Turn** — Kizito, 2024
+ [arxiv:2603.08755](https://arxiv.org/abs/2603.08755)
+ Introduces typed LLM inference as a language primitive with a confidence operator. Designed for agentic systems where LLMs write code. Ledge targets developers building systems that *use* LLMs, and adds domain calibration, outcome tracking, and cryptographic audit trails not present in Turn.
+
+ **QUASAR** — 2025
+ [arxiv:2506.12202](https://arxiv.org/abs/2506.12202) | [OpenReview](https://openreview.net/forum?id=TvpaeQVTGQ)
+ A language for LLM code actions with uncertainty quantification via conformal prediction. Transpiles from Python written by LLMs. Ledge programs are written by developers, and confidence handling is enforced at analysis time.
+
+ **IMMACULATE** — Guo et al., 2026
+ [arxiv:2602.22700](https://arxiv.org/abs/2602.22700)
+ Audits whether LLM API providers execute the model they claim. Ledge audits whether the *code using* those models handles their output safely. Complementary, not competing.
+
+ **SAUP** — Zhao et al., 2024
+ [arxiv:2412.01033](https://arxiv.org/abs/2412.01033)
+ Uncertainty propagation through multi-step LLM agent reasoning at runtime using situational weights. Ledge implements transitive uncertainty propagation as `chain_confidence()` at the language level — using position-weighted confidence decay and weak-step penalization — and enforces it at analysis time.
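+
+ A minimal sketch of such a combination rule follows; the decay rate, weak-step threshold, and penalty factor are illustrative assumptions, not Ledge's actual constants.
+
+ ```python
+ def chain_confidence(confidences, decay=0.9, weak_threshold=0.5, penalty=0.5):
+     # Transitive uncertainty propagation over a multi-step chain: later
+     # steps are down-weighted by position (decay ** i), and every step
+     # below `weak_threshold` applies a multiplicative penalty, so one
+     # weak link drags the whole chain down.
+     # NOTE: constants and combination rule are illustrative assumptions.
+     if not confidences:
+         return 0.0
+     weights = [decay ** i for i in range(len(confidences))]
+     weighted = sum(w * c for w, c in zip(weights, confidences)) / sum(weights)
+     for c in confidences:
+         if c < weak_threshold:
+             weighted *= penalty
+     return weighted
+
+ print(chain_confidence([0.95, 0.90, 0.88]))  # ~0.912: strong chain
+ print(chain_confidence([0.95, 0.30, 0.88]))  # ~0.357: one weak step penalized
+ ```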
+
+ **On the research gap:**
+ We found no published work combining pre-execution enforcement of AI confidence handling with empirical domain calibration and cryptographic audit trails in a single language. If you know of relevant work we missed, open an issue.
+
+ ---
+
+ ## Security model
+
+ No Python `eval()` or `exec()`. Ledge uses a custom tree-walker interpreter — Python's object introspection escape paths do not apply.
+
+ Python FFI imports are blocked by default:
+
+ ```bash
+ ledge run program.ledge --safe-mode               # blocks all imports
+ ledge run program.ledge --allow-import=math,json  # whitelist specific modules
+ ```
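+
+ As a hypothetical illustration of that gate (not the actual code in `ledge_lang`), the check reduces to a module whitelist consulted before any FFI import:
+
+ ```python
+ # Hypothetical sketch; flag names follow the CLI above.
+ class ImportBlockedError(Exception):
+     pass
+
+ def check_import(module: str, safe_mode: bool, allow_list: set) -> None:
+     # --safe-mode wins outright; otherwise only whitelisted top-level
+     # modules pass. An empty allow_list means blocked by default.
+     if safe_mode:
+         raise ImportBlockedError(f"--safe-mode blocks all imports ({module!r})")
+     if module.split(".")[0] not in allow_list:
+         raise ImportBlockedError(f"{module!r} is not in the import whitelist")
+
+ check_import("math", safe_mode=False, allow_list={"math", "json"})  # passes
+ try:
+     check_import("os", safe_mode=False, allow_list={"math", "json"})
+ except ImportBlockedError as err:
+     print(err)  # 'os' is not in the import whitelist
+ ```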
+
+ For server deployments where users submit Ledge code: run inside Docker.
+ `--safe-mode` is not a replacement for OS-level isolation.
+
+ ---
+
+ ## Tests
+
+ ```bash
+ python tests/conformance.py    # 284/284 passed
+ python -m pytest tests/unit/   # 338 passed, 1 pre-existing Windows encoding failure
+ ```
+
+ ---
+
+ ## What Ledge does not do
+
+ - Does not prove that any individual confidence score is accurate at decision time — it measures and calibrates confidence empirically over time
+ - Does not replace Python for general-purpose programming
+ - No package ecosystem beyond 15 included packages
+ - Native compiler requires `gcc` (experimental)
+ - Ledge Studio requires `pip install "ledge-lang[studio]"`
+ - Known production deployments: zero
+
+ ---
+
+ ## License
+
+ MIT
+
+ ---
+
+ ## Questions and feedback
+
+ If something breaks, a claim does not hold up, or you know existing work that does this better — open an issue.
+
+ If you use Ledge in a real system, even experimentally — we want to hear about it.