mizan 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mizan-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mousa Abumazin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
mizan-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.4
2
+ Name: mizan
3
+ Version: 0.1.0
4
+ Summary: The reliability scale for AI agents: restore, balance, classify, constrain, verify, weigh.
5
+ Author: Mousa Abumazin
6
+ License: MIT
7
+ Keywords: ai,agents,reliability,arabic,tool-calling,preflight
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Provides-Extra: test
12
+ Requires-Dist: pytest>=7; extra == "test"
13
+ Provides-Extra: mcpscan
14
+ Provides-Extra: otel
15
+ Requires-Dist: opentelemetry-api>=1.20; extra == "otel"
16
+ Requires-Dist: opentelemetry-sdk>=1.20; extra == "otel"
17
+ Dynamic: license-file
18
+
19
+ # Mizan ميزان
20
+
21
+ **The reliability scale for AI agents.**
22
+
23
+ Restore the prompt, balance contradictions, classify the case, constrain the arguments, verify the execution, then weigh the evidence.
24
+
25
+ Mizan is built Arabic-first because Arabic exposes failures English often hides: morphology, dialect drift, transliteration, right-to-left text, BiDi safety, and token cost. Those are the same blind spots that hide **tool-poisoning attacks generic English scanners miss** — which is why Mizan ships a multilingual MCP scanner (`mizan.mcpscan`) alongside the reliability pipeline.
26
+
27
+ This repository is the spine for the Mizan stack. It does not replace the existing repos. It makes them read as one system.
28
+
29
+ ## Thesis
30
+
31
+ Agents need a scale before autonomy. Every prompt transformation should be restorable, every contradiction should be balanced or escalated, every tool argument should be constrained, and every execution should leave a receipt that can be weighed against what the agent claims.
32
+
33
+ ## Quickstart — scan an MCP server for poisoning
34
+
35
+ The scanner is dependency-free (detectors are vendored), so it runs from a bare install:
36
+
37
+ ```bash
38
+ pip install "mizan[mcpscan]" # or just: pip install mizan
39
+ python -m mizan.mcpscan examples/mcp_tools_poisoned.json --mode audit
40
+ ```
41
+
42
+ You get a per-tool report — rule ID, severity, evidence, remediation — plus an
43
+ audit/warn/block decision. Try `examples/mcp_tools_clean.json` to see clean tools pass
44
+ (legitimate Arabic, benign "token"/"secret" names, and a `secret_key` param that only
45
+ *warns*, never blocks). The rest of the pipeline (preflight, verify) needs the optional
46
+ git extras; the scanner does not.
47
+
48
+ How well does it work? See the honest, three-tier benchmark (consistency / held-out /
49
+ fresh held-out): [**docs/MCP_POISONING_BENCHMARK.md**](docs/MCP_POISONING_BENCHMARK.md)
50
+ — 0 hard false positives across all tiers, ~63% recall on genuinely novel attacks.
51
+
52
+ ## Use
53
+
54
+ ```python
55
+ from mizan import preflight, PreflightContext
56
+
57
+ r = preflight(
58
+ "send it. cancel it.",
59
+ PreflightContext(contradiction_predicates=[("send", "cancel")]),
60
+ )
61
+ r.ok # False — contradiction is fail-loud, not silently resolved
62
+ r.contradiction # the conflict, surfaced for a clarifying question
63
+ r.receipt.to_dict() # the weighable trail (restore + balance stages)
64
+ ```
65
+
66
+ Scan an MCP tool descriptor for multilingual/Unicode poisoning (the `scan` step):
67
+
68
+ ```python
69
+ from mizan import scan_tool, decide, ScanConfig
70
+
71
+ res = scan_tool({"name": "get_weather", "description": "Weather. \u202e hidden reversed directive"})
72
+ res.ok # False — BiDi control flagged
73
+ [f.rule_id for f in res.findings] # ['R-BIDI-001']
74
+ decide(res, ScanConfig(mode="block")).action # 'block' (audit/warn/block modes)
75
+ ```
76
+
77
+ `mizan.mcpscan` catches BiDi, invisible/TAG, homoglyph, Arabizi, Arabic/English
78
+ code-switch, and (advisory) semantic-exfiltration vectors. Structural findings are
79
+ `high` (block-worthy); semantic-language findings are `medium` (warn — confirm
80
+ intent, since legitimate security tools mention these terms). Also a CLI:
81
+ `python -m mizan.mcpscan tools.json --mode audit`.
82
+
83
+ Export any receipt as OpenTelemetry-compatible spans (interop) with a signed receipt (the tamper-evidence OTel lacks):
84
+
85
+ ```python
86
+ from mizan import receipt_to_spans
87
+ spans = receipt_to_spans(result.receipt, secret="…") # one parent + one span per stage
88
+ spans[0]["attributes"]["mizan.receipt.signature"] # HMAC-SHA256 over the canonical receipt
89
+ # emit_otel(receipt, secret="…") # pushes real spans if `pip install mizan[otel]`
90
+ ```
91
+
92
+ See `examples/otel_trace.py` for a full scan → preflight → gate → constrain → verify trace.
93
+
94
+ Constraint-driven tool gating (the `qadiya` step):
95
+
96
+ ```python
97
+ from mizan import ToolGate, equals_constraint
98
+
99
+ gate = ToolGate(
100
+ [equals_constraint("tool", "tool_name", ["read_file", "search"])],
101
+ allowed_case_ids=["tool=read_file", "tool=search"],
102
+ )
103
+ gate.check({"tool_name": "rm_rf", "args": {}}).allowed # False — escalated, never silently run
104
+ ```
105
+
106
+ The three primitives (`jabr`, `muqabalah`, `qadiya`) are not yet on PyPI. In a dev tree, `mizan` adds local checkouts under `~/Projects` to `sys.path`; to install, run `pip install -e ../jabr -e ../muqabalah -e ../qadiya -e .`.
107
+
108
+ ### End to end — one receipt across all five stages
109
+
110
+ `mizan` folds the back half (`mtg` argument constraint, `toolproof` execution verification) into the same receipt via adapters (`constrain`, `record_from_mtg`, `record_from_toolproof`). [`examples/end_to_end.py`](examples/end_to_end.py) runs a tool call through the whole scale:
111
+
112
+ ```text
113
+ === Clean Arabic request — survives every stage ===
114
+ ok=True blocked_by=[]
115
+ [ok ] restore jabr
116
+ [ok ] balance muqabalah
117
+ [ok ] classify qadiya
118
+ [ok ] constrain mtg
119
+ [ok ] verify toolproof
120
+
121
+ === Failure path — transliteration + hallucinated claim ===
122
+ ok=False blocked_by=['mtg', 'toolproof']
123
+ [ok ] restore jabr
124
+ [ok ] balance muqabalah
125
+ [ok ] classify qadiya
126
+ [BLOCK] constrain mtg # "Riyadh" — Arabic argument transliterated
127
+ [BLOCK] verify toolproof # claimed a tool call that never ran
128
+ ```
129
+
130
+ ## Stack
131
+
132
+ ```mermaid
133
+ flowchart LR
134
+ A[User input] --> B[jabr: restore]
135
+ B --> C[muqabalah: balance]
136
+ C --> D[qadiya: classify + dispatch]
137
+ D --> E[MTG: constrain arguments]
138
+ E --> F[ToolProof: verify execution]
139
+ F --> G[Signed receipts]
140
+
141
+ H[case-eval] -. measures .-> B
142
+ H -. measures .-> C
143
+ H -. measures .-> D
144
+ I[arabic-agent-eval] -. scores .-> E
145
+ J[wasl] -. supplies tools .-> D
146
+ K[hurmoz + khwarizmi-hermes-plugin] -. operates inside Hermes .-> A
147
+ L[artok] -. shows Arabic token cost .-> A
148
+ M[faraid] -. demonstrates exact case method .-> D
149
+ ```
150
+
151
+ ## Repo Map
152
+
153
+ | Stage | Repo | Verb | Current state | Next improvement |
154
+ |---|---|---|---|---|
155
+ | Tool-surface inspection | `mizan.mcpscan` (this repo) | scan | Multilingual MCP poisoning scanner: 6 rule families, audit/warn/block modes, 43 tests, 25/25 corpus recall @ 0 high-FP | Harden against the v2 held-out gaps (ZWNJ/joiner, tab-spacing, semantic vocabulary); author a fresh v3 held-out set; real mcp-scan comparison |
156
+ | Pre-LLM input integrity | [jabr](https://github.com/Moshe-ship/jabr) | restore | Reversible prompt-context restoration, 31 tests | Publish as part of one preflight package |
157
+ | Pre-LLM input integrity | [muqabalah](https://github.com/Moshe-ship/muqabalah) | balance | Reversible cancellation and fail-loud contradiction handling, 19 tests | Share a common receipt format with the rest of the stack |
158
+ | Pre-LLM input integrity | [qadiya](https://github.com/Moshe-ship/qadiya) | classify + dispatch | Constraint-driven case registry, 15 tests | Done — exposed as `mizan.ToolGate` and wired into the Hermes plugin |
159
+ | Proof it works | [case-eval](https://github.com/Moshe-ship/case-eval) | measure | 272 ambiguous prompts, deterministic and LLM-in-the-loop modes, 28 tests | Keep results reproducible and publish the key tables from fresh runs |
160
+ | During tool selection | [mtg](https://github.com/Moshe-ship/mtg) | constrain | Morphological Type Guards for multilingual tool arguments, v0.1 advisory mode. Emits a `mizan` receipt via `mizan.constrain` | Move from advisory diagnostics toward enforceable policy modes |
161
+ | Post execution | [toolproof](https://github.com/Moshe-ship/toolproof) | verify | Pre-execution gating, signed receipts, 95 tests, v0.5.0. Emits a `mizan` receipt via `mizan.record_from_toolproof` | Publish the adversarial dataset and methodology behind headline claims |
162
+ | Benchmark | [arabic-agent-eval](https://github.com/Moshe-ship/arabic-agent-eval) | score | 51 Arabic function-calling items, 6 categories, 5 dialect variants, 22 functions | Keep the open/installable/dialect-split framing (HF dataset and static leaderboard are published); v2: format-instruction adherence, a code-switch split, and outcome/policy-level scoring |
163
+ | Tool layer | [wasl](https://github.com/Moshe-ship/wasl) | connect | Arabic MCP server, 30 tools | Register and demo as the Arabic tool substrate for agents |
164
+ | Agent runtime | [hurmoz](https://github.com/Moshe-ship/hurmoz) | operate | 63 Arabic Hermes skills | Keep as the Arabic skills layer and link the reliability stack from relevant skills |
165
+ | Agent runtime | [khwarizmi-hermes-plugin](https://github.com/Moshe-ship/khwarizmi-hermes-plugin) | operate | Thin Hermes adapter over `mizan`: preflight + qadiya tool gate (all four ops) | Rename to `mizan-hermes-plugin` when stable |
166
+ | Funnel | [artok](https://github.com/Moshe-ship/artok) | reveal | Arabic Token Tax calculator across 18 tokenizers | Publish as a Hugging Face Space and use it as top-of-funnel |
167
+ | Method showcase | [faraid](https://github.com/Moshe-ship/faraid) | demonstrate | Working inheritance calculator plus al-Khwarizmi six-case algebra, 16 tests | Use as a precise public example of the case method |
168
+
169
+ ## Pipeline
170
+
171
+ ```text
172
+ tool surface
173
+ -> scan for multilingual/Unicode poisoning mizan.mcpscan
174
+ user input
175
+ -> restore missing context jabr
176
+ -> balance duplication and contradictions muqabalah
177
+ -> classify + dispatch into explicit cases qadiya
178
+ -> constrain multilingual tool arguments mtg
179
+ -> execute, verify, and sign the receipt toolproof + mizan.Receipt
180
+ -> export OTel-compatible spans mizan.otel
181
+ -> score and publish evidence case-eval + arabic-agent-eval
182
+ ```
183
+
184
+ ## Why It Is Called Mizan
185
+
186
+ A *mizan* is a scale: it brings two sides into balance and it measures. Both meanings are the point.
187
+
188
+ The operations that bring an agent's input into balance are the same operations that gave algebra its name. Al-Khwarizmi named them in the title of his book: `al-jabr` (restoration) and `al-muqabalah` (balancing):
189
+
190
+ - `jabr` restores missing terms instead of letting a model silently guess.
191
+ - `muqabalah` balances duplicates and contradictions instead of letting a model silently choose.
192
+ - `qadiya` turns the remaining request into explicit cases instead of vague intent routing.
193
+ - `mtg` gives multilingual tool arguments stronger types than plain strings.
194
+ - `toolproof` records what actually ran, then verifies claims against signed receipts.
195
+
196
+ Mizan is the scale those operations serve. The brand is useful only if the engineering stays literal: a scale for agents means explicit operations, complete cases, reversible transformations, and auditable, weighable outcomes.
197
+
198
+ ## Honest Boundaries
199
+
200
+ - This repo now ships a small `mizan` package (`preflight`, `ToolGate`, and the `mtg`/`toolproof` receipt adapters); the underlying primitives still live in their own repos.
201
+ - The full pipeline (restore → balance → classify → constrain → verify) chains into one `Receipt`; see `examples/end_to_end.py`. `mtg`/`toolproof` are optional imports — the adapters accept native results, so `mizan` installs without them.
202
+ - The Hermes plugin now runs all four operations: restore (`jabr`) and balance (`muqabalah`) via `mizan.preflight`, and classify + dispatch (`qadiya`) via `mizan.ToolGate`. The tool gate is a tool-name allowlist today; richer constraints (arg scope, target sensitivity) are supported by `ToolGate` but not yet surfaced in config.
203
+ - MTG is advisory in v0.1.0. It logs violations but does not block calls.
204
+ - ToolProof's strongest headline claims need a published dataset and reproducible methodology before they should be used in investor/customer copy.
205
+ - `arabic-agent-eval`, `wasl`, and `hurmoz` should avoid "first" or "largest" claims unless those claims are actively re-verified. Safer framing: open, installable, Arabic-first, dialect-aware.
206
+
207
+ ## Classification Rule
208
+
209
+ Every repo should have one job:
210
+
211
+ | Class | Rule | Examples |
212
+ |---|---|---|
213
+ | Core | Part of the reliability pipeline | `jabr`, `muqabalah`, `qadiya`, `case-eval`, `mtg`, `toolproof`, `arabic-agent-eval`, `wasl`, `hurmoz`, `khwarizmi-hermes-plugin`, `artok` |
214
+ | Proof | Shows credibility or a worked method | `faraid`, `Tarminal`, `Lisan`, `bidi-guard` |
215
+ | Suite | Belongs under an Arabic AI developer toolkit umbrella | `samt`, `mukhtasar`, `sarih`, `safha`, `qalam`, `raqeeb`, `naql`, `majal`, `jadwal`, `khalas` |
216
+ | Port | Valuable but on the older runtime surface | `mkhlab` into Hermes/Hurmoz |
217
+ | Client/cash | Funds the work and tests it in production | `performancemax`, `localbiz`, `yalla-ads`, `pmax-core` |
218
+ | Archive | One-off with no role, no proof value, and no cash value | Decide after audit, not blindly |
219
+
220
+ ## Status & next moves
221
+
222
+ Done: preflight (all four ops) wired into the Hermes plugin · `arabic-agent-eval` published as a HF dataset + static leaderboard · receipts chained across `jabr`/`muqabalah`/`qadiya`/`mtg`/`toolproof` (`examples/end_to_end.py`) · `hurmoz`/plugin/`wasl` submitted to `awesome-hermes-agent` · `mizan.mcpscan` shipped with the labeled corpus eval + Hermes plugin audit mode · `mizan.otel` exports receipts as OTel-compatible spans with HMAC signatures.
223
+
224
+ Next:
225
+ 1. Harden `mcpscan` against v2 held-out gaps (ZWNJ/joiner, tab-spacing, semantic vocabulary), then author a fresh v3 set. Held-out generalization so far: ~63% recall on novel attacks, **0 hard false positives** across two sets (audit/warn-ready, not default-block). Run the real `mcp-scan` for the generic-scanner comparison when a public claim is wanted.
226
+ 2. `arabic-agent-eval` v2: format-instruction adherence, a code-switch split, and outcome/policy-level scoring.
227
+ 3. Eventually: real PyPI versions (or vendoring) for `jabr`/`muqabalah`/`qadiya`/`mtg` instead of git extras; a formal receipt spec once the shape is stable.
228
+
229
+ ## One-Line Pitch
230
+
231
+ **Mizan is an Arabic-first reliability scale for AI agents: restore the prompt, balance contradictions, classify the case, constrain the arguments, verify the execution, and weigh the evidence.**
mizan-0.1.0/README.md ADDED
@@ -0,0 +1,213 @@
1
+ # Mizan ميزان
2
+
3
+ **The reliability scale for AI agents.**
4
+
5
+ Restore the prompt, balance contradictions, classify the case, constrain the arguments, verify the execution, then weigh the evidence.
6
+
7
+ Mizan is built Arabic-first because Arabic exposes failures English often hides: morphology, dialect drift, transliteration, right-to-left text, BiDi safety, and token cost. Those are the same blind spots that hide **tool-poisoning attacks generic English scanners miss** — which is why Mizan ships a multilingual MCP scanner (`mizan.mcpscan`) alongside the reliability pipeline.
8
+
9
+ This repository is the spine for the Mizan stack. It does not replace the existing repos. It makes them read as one system.
10
+
11
+ ## Thesis
12
+
13
+ Agents need a scale before autonomy. Every prompt transformation should be restorable, every contradiction should be balanced or escalated, every tool argument should be constrained, and every execution should leave a receipt that can be weighed against what the agent claims.
14
+
15
+ ## Quickstart — scan an MCP server for poisoning
16
+
17
+ The scanner is dependency-free (detectors are vendored), so it runs from a bare install:
18
+
19
+ ```bash
20
+ pip install "mizan[mcpscan]" # or just: pip install mizan
21
+ python -m mizan.mcpscan examples/mcp_tools_poisoned.json --mode audit
22
+ ```
23
+
24
+ You get a per-tool report — rule ID, severity, evidence, remediation — plus an
25
+ audit/warn/block decision. Try `examples/mcp_tools_clean.json` to see clean tools pass
26
+ (legitimate Arabic, benign "token"/"secret" names, and a `secret_key` param that only
27
+ *warns*, never blocks). The rest of the pipeline (preflight, verify) needs the optional
28
+ git extras; the scanner does not.
29
+
30
+ How well does it work? See the honest, three-tier benchmark (consistency / held-out /
31
+ fresh held-out): [**docs/MCP_POISONING_BENCHMARK.md**](docs/MCP_POISONING_BENCHMARK.md)
32
+ — 0 hard false positives across all tiers, ~63% recall on genuinely novel attacks.
33
+
34
+ ## Use
35
+
36
+ ```python
37
+ from mizan import preflight, PreflightContext
38
+
39
+ r = preflight(
40
+ "send it. cancel it.",
41
+ PreflightContext(contradiction_predicates=[("send", "cancel")]),
42
+ )
43
+ r.ok # False — contradiction is fail-loud, not silently resolved
44
+ r.contradiction # the conflict, surfaced for a clarifying question
45
+ r.receipt.to_dict() # the weighable trail (restore + balance stages)
46
+ ```
47
+
48
+ Scan an MCP tool descriptor for multilingual/Unicode poisoning (the `scan` step):
49
+
50
+ ```python
51
+ from mizan import scan_tool, decide, ScanConfig
52
+
53
+ res = scan_tool({"name": "get_weather", "description": "Weather. \u202e hidden reversed directive"})
54
+ res.ok # False — BiDi control flagged
55
+ [f.rule_id for f in res.findings] # ['R-BIDI-001']
56
+ decide(res, ScanConfig(mode="block")).action # 'block' (audit/warn/block modes)
57
+ ```
58
+
59
+ `mizan.mcpscan` catches BiDi, invisible/TAG, homoglyph, Arabizi, Arabic/English
60
+ code-switch, and (advisory) semantic-exfiltration vectors. Structural findings are
61
+ `high` (block-worthy); semantic-language findings are `medium` (warn — confirm
62
+ intent, since legitimate security tools mention these terms). Also a CLI:
63
+ `python -m mizan.mcpscan tools.json --mode audit`.
64
+
65
+ Export any receipt as OpenTelemetry-compatible spans (interop) with a signed receipt (the tamper-evidence OTel lacks):
66
+
67
+ ```python
68
+ from mizan import receipt_to_spans
69
+ spans = receipt_to_spans(result.receipt, secret="…") # one parent + one span per stage
70
+ spans[0]["attributes"]["mizan.receipt.signature"] # HMAC-SHA256 over the canonical receipt
71
+ # emit_otel(receipt, secret="…") # pushes real spans if `pip install mizan[otel]`
72
+ ```
73
+
74
+ See `examples/otel_trace.py` for a full scan → preflight → gate → constrain → verify trace.
75
+
76
+ Constraint-driven tool gating (the `qadiya` step):
77
+
78
+ ```python
79
+ from mizan import ToolGate, equals_constraint
80
+
81
+ gate = ToolGate(
82
+ [equals_constraint("tool", "tool_name", ["read_file", "search"])],
83
+ allowed_case_ids=["tool=read_file", "tool=search"],
84
+ )
85
+ gate.check({"tool_name": "rm_rf", "args": {}}).allowed # False — escalated, never silently run
86
+ ```
87
+
88
+ The three primitives (`jabr`, `muqabalah`, `qadiya`) are not yet on PyPI. In a dev tree, `mizan` adds local checkouts under `~/Projects` to `sys.path`; to install, run `pip install -e ../jabr -e ../muqabalah -e ../qadiya -e .`.
89
+
90
+ ### End to end — one receipt across all five stages
91
+
92
+ `mizan` folds the back half (`mtg` argument constraint, `toolproof` execution verification) into the same receipt via adapters (`constrain`, `record_from_mtg`, `record_from_toolproof`). [`examples/end_to_end.py`](examples/end_to_end.py) runs a tool call through the whole scale:
93
+
94
+ ```text
95
+ === Clean Arabic request — survives every stage ===
96
+ ok=True blocked_by=[]
97
+ [ok ] restore jabr
98
+ [ok ] balance muqabalah
99
+ [ok ] classify qadiya
100
+ [ok ] constrain mtg
101
+ [ok ] verify toolproof
102
+
103
+ === Failure path — transliteration + hallucinated claim ===
104
+ ok=False blocked_by=['mtg', 'toolproof']
105
+ [ok ] restore jabr
106
+ [ok ] balance muqabalah
107
+ [ok ] classify qadiya
108
+ [BLOCK] constrain mtg # "Riyadh" — Arabic argument transliterated
109
+ [BLOCK] verify toolproof # claimed a tool call that never ran
110
+ ```
111
+
112
+ ## Stack
113
+
114
+ ```mermaid
115
+ flowchart LR
116
+ A[User input] --> B[jabr: restore]
117
+ B --> C[muqabalah: balance]
118
+ C --> D[qadiya: classify + dispatch]
119
+ D --> E[MTG: constrain arguments]
120
+ E --> F[ToolProof: verify execution]
121
+ F --> G[Signed receipts]
122
+
123
+ H[case-eval] -. measures .-> B
124
+ H -. measures .-> C
125
+ H -. measures .-> D
126
+ I[arabic-agent-eval] -. scores .-> E
127
+ J[wasl] -. supplies tools .-> D
128
+ K[hurmoz + khwarizmi-hermes-plugin] -. operates inside Hermes .-> A
129
+ L[artok] -. shows Arabic token cost .-> A
130
+ M[faraid] -. demonstrates exact case method .-> D
131
+ ```
132
+
133
+ ## Repo Map
134
+
135
+ | Stage | Repo | Verb | Current state | Next improvement |
136
+ |---|---|---|---|---|
137
+ | Tool-surface inspection | `mizan.mcpscan` (this repo) | scan | Multilingual MCP poisoning scanner: 6 rule families, audit/warn/block modes, 43 tests, 25/25 corpus recall @ 0 high-FP | Harden against the v2 held-out gaps (ZWNJ/joiner, tab-spacing, semantic vocabulary); author a fresh v3 held-out set; real mcp-scan comparison |
138
+ | Pre-LLM input integrity | [jabr](https://github.com/Moshe-ship/jabr) | restore | Reversible prompt-context restoration, 31 tests | Publish as part of one preflight package |
139
+ | Pre-LLM input integrity | [muqabalah](https://github.com/Moshe-ship/muqabalah) | balance | Reversible cancellation and fail-loud contradiction handling, 19 tests | Share a common receipt format with the rest of the stack |
140
+ | Pre-LLM input integrity | [qadiya](https://github.com/Moshe-ship/qadiya) | classify + dispatch | Constraint-driven case registry, 15 tests | Done — exposed as `mizan.ToolGate` and wired into the Hermes plugin |
141
+ | Proof it works | [case-eval](https://github.com/Moshe-ship/case-eval) | measure | 272 ambiguous prompts, deterministic and LLM-in-the-loop modes, 28 tests | Keep results reproducible and publish the key tables from fresh runs |
142
+ | During tool selection | [mtg](https://github.com/Moshe-ship/mtg) | constrain | Morphological Type Guards for multilingual tool arguments, v0.1 advisory mode. Emits a `mizan` receipt via `mizan.constrain` | Move from advisory diagnostics toward enforceable policy modes |
143
+ | Post execution | [toolproof](https://github.com/Moshe-ship/toolproof) | verify | Pre-execution gating, signed receipts, 95 tests, v0.5.0. Emits a `mizan` receipt via `mizan.record_from_toolproof` | Publish the adversarial dataset and methodology behind headline claims |
144
+ | Benchmark | [arabic-agent-eval](https://github.com/Moshe-ship/arabic-agent-eval) | score | 51 Arabic function-calling items, 6 categories, 5 dialect variants, 22 functions | Keep the open/installable/dialect-split framing (HF dataset and static leaderboard are published); v2: format-instruction adherence, a code-switch split, and outcome/policy-level scoring |
145
+ | Tool layer | [wasl](https://github.com/Moshe-ship/wasl) | connect | Arabic MCP server, 30 tools | Register and demo as the Arabic tool substrate for agents |
146
+ | Agent runtime | [hurmoz](https://github.com/Moshe-ship/hurmoz) | operate | 63 Arabic Hermes skills | Keep as the Arabic skills layer and link the reliability stack from relevant skills |
147
+ | Agent runtime | [khwarizmi-hermes-plugin](https://github.com/Moshe-ship/khwarizmi-hermes-plugin) | operate | Thin Hermes adapter over `mizan`: preflight + qadiya tool gate (all four ops) | Rename to `mizan-hermes-plugin` when stable |
148
+ | Funnel | [artok](https://github.com/Moshe-ship/artok) | reveal | Arabic Token Tax calculator across 18 tokenizers | Publish as a Hugging Face Space and use it as top-of-funnel |
149
+ | Method showcase | [faraid](https://github.com/Moshe-ship/faraid) | demonstrate | Working inheritance calculator plus al-Khwarizmi six-case algebra, 16 tests | Use as a precise public example of the case method |
150
+
151
+ ## Pipeline
152
+
153
+ ```text
154
+ tool surface
155
+ -> scan for multilingual/Unicode poisoning mizan.mcpscan
156
+ user input
157
+ -> restore missing context jabr
158
+ -> balance duplication and contradictions muqabalah
159
+ -> classify + dispatch into explicit cases qadiya
160
+ -> constrain multilingual tool arguments mtg
161
+ -> execute, verify, and sign the receipt toolproof + mizan.Receipt
162
+ -> export OTel-compatible spans mizan.otel
163
+ -> score and publish evidence case-eval + arabic-agent-eval
164
+ ```
165
+
166
+ ## Why It Is Called Mizan
167
+
168
+ A *mizan* is a scale: it brings two sides into balance and it measures. Both meanings are the point.
169
+
170
+ The operations that bring an agent's input into balance are the same operations that gave algebra its name. Al-Khwarizmi named them in the title of his book: `al-jabr` (restoration) and `al-muqabalah` (balancing):
171
+
172
+ - `jabr` restores missing terms instead of letting a model silently guess.
173
+ - `muqabalah` balances duplicates and contradictions instead of letting a model silently choose.
174
+ - `qadiya` turns the remaining request into explicit cases instead of vague intent routing.
175
+ - `mtg` gives multilingual tool arguments stronger types than plain strings.
176
+ - `toolproof` records what actually ran, then verifies claims against signed receipts.
177
+
178
+ Mizan is the scale those operations serve. The brand is useful only if the engineering stays literal: a scale for agents means explicit operations, complete cases, reversible transformations, and auditable, weighable outcomes.
179
+
180
+ ## Honest Boundaries
181
+
182
+ - This repo now ships a small `mizan` package (`preflight`, `ToolGate`, and the `mtg`/`toolproof` receipt adapters); the underlying primitives still live in their own repos.
183
+ - The full pipeline (restore → balance → classify → constrain → verify) chains into one `Receipt`; see `examples/end_to_end.py`. `mtg`/`toolproof` are optional imports — the adapters accept native results, so `mizan` installs without them.
184
+ - The Hermes plugin now runs all four operations: restore (`jabr`) and balance (`muqabalah`) via `mizan.preflight`, and classify + dispatch (`qadiya`) via `mizan.ToolGate`. The tool gate is a tool-name allowlist today; richer constraints (arg scope, target sensitivity) are supported by `ToolGate` but not yet surfaced in config.
185
+ - MTG is advisory in v0.1.0. It logs violations but does not block calls.
186
+ - ToolProof's strongest headline claims need a published dataset and reproducible methodology before they should be used in investor/customer copy.
187
+ - `arabic-agent-eval`, `wasl`, and `hurmoz` should avoid "first" or "largest" claims unless those claims are actively re-verified. Safer framing: open, installable, Arabic-first, dialect-aware.
188
+
189
+ ## Classification Rule
190
+
191
+ Every repo should have one job:
192
+
193
+ | Class | Rule | Examples |
194
+ |---|---|---|
195
+ | Core | Part of the reliability pipeline | `jabr`, `muqabalah`, `qadiya`, `case-eval`, `mtg`, `toolproof`, `arabic-agent-eval`, `wasl`, `hurmoz`, `khwarizmi-hermes-plugin`, `artok` |
196
+ | Proof | Shows credibility or a worked method | `faraid`, `Tarminal`, `Lisan`, `bidi-guard` |
197
+ | Suite | Belongs under an Arabic AI developer toolkit umbrella | `samt`, `mukhtasar`, `sarih`, `safha`, `qalam`, `raqeeb`, `naql`, `majal`, `jadwal`, `khalas` |
198
+ | Port | Valuable but on the older runtime surface | `mkhlab` into Hermes/Hurmoz |
199
+ | Client/cash | Funds the work and tests it in production | `performancemax`, `localbiz`, `yalla-ads`, `pmax-core` |
200
+ | Archive | One-off with no role, no proof value, and no cash value | Decide after audit, not blindly |
201
+
202
+ ## Status & next moves
203
+
204
+ Done: preflight (all four ops) wired into the Hermes plugin · `arabic-agent-eval` published as a HF dataset + static leaderboard · receipts chained across `jabr`/`muqabalah`/`qadiya`/`mtg`/`toolproof` (`examples/end_to_end.py`) · `hurmoz`/plugin/`wasl` submitted to `awesome-hermes-agent` · `mizan.mcpscan` shipped with the labeled corpus eval + Hermes plugin audit mode · `mizan.otel` exports receipts as OTel-compatible spans with HMAC signatures.
205
+
206
+ Next:
207
+ 1. Harden `mcpscan` against v2 held-out gaps (ZWNJ/joiner, tab-spacing, semantic vocabulary), then author a fresh v3 set. Held-out generalization so far: ~63% recall on novel attacks, **0 hard false positives** across two sets (audit/warn-ready, not default-block). Run the real `mcp-scan` for the generic-scanner comparison when a public claim is wanted.
208
+ 2. `arabic-agent-eval` v2: format-instruction adherence, a code-switch split, and outcome/policy-level scoring.
209
+ 3. Eventually: real PyPI versions (or vendoring) for `jabr`/`muqabalah`/`qadiya`/`mtg` instead of git extras; a formal receipt spec once the shape is stable.
210
+
211
+ ## One-Line Pitch
212
+
213
+ **Mizan is an Arabic-first reliability scale for AI agents: restore the prompt, balance contradictions, classify the case, constrain the arguments, verify the execution, and weigh the evidence.**
@@ -0,0 +1,132 @@
1
+ """Mizan ميزان — the reliability scale for AI agents.
2
+
3
+ One front door for the input-integrity half of the stack:
4
+
5
+ >>> from mizan import preflight, PreflightContext
6
+ >>> r = preflight("send it. cancel it.",
7
+ ... PreflightContext(contradiction_predicates=[("send", "cancel")]))
8
+ >>> r.ok
9
+ False
10
+ >>> r.contradiction is not None
11
+ True
12
+
13
+ The qadiya case primitives are re-exported for the dispatch/tool layer, and
14
+ the shared :class:`Receipt` is the one audit format every stage appends to.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from mizan.preflight import (
20
+ PreflightContext,
21
+ PreflightResult,
22
+ preflight,
23
+ strip_tags,
24
+ )
25
+ from mizan.receipt import (
26
+ Receipt,
27
+ StageRecord,
28
+ STAGE_BALANCE,
29
+ STAGE_CLASSIFY,
30
+ STAGE_CONSTRAIN,
31
+ STAGE_RESTORE,
32
+ STAGE_SCAN,
33
+ STAGE_VERIFY,
34
+ )
35
+ from mizan.integrations import (
36
+ record_from_mtg,
37
+ record_from_toolproof,
38
+ constrain,
39
+ )
40
+ from mizan.otel import receipt_to_spans, emit as emit_otel
41
+
42
+ # Re-export the qadiya case layer (classify + dispatch) for tool gating.
43
+ # Imported defensively so `import mizan` still works if qadiya is absent.
44
+ try:
45
+ from qadiya import ( # noqa: F401
46
+ Case,
47
+ Constraint,
48
+ CaseRegistry,
49
+ Classifier,
50
+ Dispatch,
51
+ DispatchResult,
52
+ DispatchOutcome,
53
+ NoMatchingCase,
54
+ AmbiguousCases,
55
+ enumerate_cases,
56
+ )
57
+ from mizan.toolgate import ( # noqa: F401
58
+ ToolGate,
59
+ GateDecision,
60
+ equals_constraint,
61
+ predicate_constraint,
62
+ OTHER,
63
+ )
64
+
65
+ _HAS_QADIYA = True
66
+ except Exception: # noqa: BLE001
67
+ _HAS_QADIYA = False
68
+
69
+ # mcpscan is documented as dependency-free (vendored detectors); the import is still guarded so `import mizan` works even if it fails to load.
70
+ try:
71
+ from mizan.mcpscan import ( # noqa: F401
72
+ scan_tool,
73
+ scan_tools,
74
+ ScanResult,
75
+ Finding,
76
+ ScanConfig,
77
+ Decision,
78
+ decide,
79
+ report,
80
+ )
81
+
82
+ _HAS_MCPSCAN = True
83
+ except Exception: # noqa: BLE001
84
+ _HAS_MCPSCAN = False
85
+
86
+ __version__ = "0.1.0"
87
+
88
+ __all__ = [
89
+ "preflight",
90
+ "PreflightContext",
91
+ "PreflightResult",
92
+ "strip_tags",
93
+ "Receipt",
94
+ "StageRecord",
95
+ "STAGE_RESTORE",
96
+ "STAGE_BALANCE",
97
+ "STAGE_CLASSIFY",
98
+ "STAGE_CONSTRAIN",
99
+ "STAGE_SCAN",
100
+ "STAGE_VERIFY",
101
+ "record_from_mtg",
102
+ "record_from_toolproof",
103
+ "constrain",
104
+ "receipt_to_spans",
105
+ "emit_otel",
106
+ "__version__",
107
+ ]
108
+
109
+ if _HAS_QADIYA:
110
+ __all__ += [
111
+ "Case",
112
+ "Constraint",
113
+ "CaseRegistry",
114
+ "Classifier",
115
+ "Dispatch",
116
+ "DispatchResult",
117
+ "DispatchOutcome",
118
+ "NoMatchingCase",
119
+ "AmbiguousCases",
120
+ "enumerate_cases",
121
+ "ToolGate",
122
+ "GateDecision",
123
+ "equals_constraint",
124
+ "predicate_constraint",
125
+ "OTHER",
126
+ ]
127
+
128
+ if _HAS_MCPSCAN:
129
+ __all__ += [
130
+ "scan_tool", "scan_tools", "ScanResult", "Finding",
131
+ "ScanConfig", "Decision", "decide", "report",
132
+ ]