PyPI - kipimo - Versions diffs - 0.1.0__tar.gz - Mend

kipimo 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

kipimo-0.1.0/LICENSE +21 -0
kipimo-0.1.0/PKG-INFO +47 -0
kipimo-0.1.0/README.md +32 -0
kipimo-0.1.0/pyproject.toml +33 -0
kipimo-0.1.0/setup.cfg +4 -0
kipimo-0.1.0/src/kipimo/__init__.py +6 -0
kipimo-0.1.0/src/kipimo/cli.py +115 -0
kipimo-0.1.0/src/kipimo/data/kipimo_v0.1.jsonl +46 -0
kipimo-0.1.0/src/kipimo.egg-info/PKG-INFO +47 -0
kipimo-0.1.0/src/kipimo.egg-info/SOURCES.txt +13 -0
kipimo-0.1.0/src/kipimo.egg-info/dependency_links.txt +1 -0
kipimo-0.1.0/src/kipimo.egg-info/entry_points.txt +2 -0
kipimo-0.1.0/src/kipimo.egg-info/top_level.txt +1 -0
kipimo-0.1.0/tests/test_kipimo.py +47 -0
kipimo-0.1.0/tests/test_smoke.py +12 -0

kipimo-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 AfriKaziOS maintainers
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

kipimo-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: kipimo
+Version: 0.1.0
+Summary: Swahili agent-task evaluation suite for the East Africa coordination stack — model-agnostic seed benchmark (46 tasks: server routing, term grounding, cascade routing).
+Author: AfriKaziOS maintainers
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/gabrielmahia/kipimo
+Project-URL: Repository, https://github.com/gabrielmahia/kipimo
+Project-URL: Issues, https://github.com/gabrielmahia/kipimo/issues
+Keywords: swahili,benchmark,evaluation,mcp,kenya,ai-agents
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: license-file
+# kipimo
+Over one hundred million people coordinate their lives in Swahili, yet no benchmark measures whether an AI agent can route their requests correctly — send money, check drought status, find a clinic, verify a worker's credentials. Agents targeting East Africa are evaluated on English tasks and deployed on faith.
+`kipimo` (Swahili: *a measure*) is a model-agnostic seed benchmark for exactly that gap: **46 tasks** across three types, with golds machine-derived from authoritative sources — the coordination-stack registry and the live `africa-coord-bus` routing table — never from memory.
+| Type | n | What it measures | Metric |
+|---|---|---|---|
+| `server_routing` | 25 | Swahili request → correct stack server (payments, tax, health, land, labour…) | exact |
+| `term_grounding` | 14 | Swahili domain term → English meaning | exact (case-insensitive) |
+| `cascade_routing` | 7 | Coordination event → which sectors must be notified | set F1 |
+## Use it (any model, no API keys)
+```bash
+pip install kipimo
+kipimo tasks > tasks.jsonl        # feed to your agent however you like
+kipimo template > preds.jsonl     # fill "prediction": [...] per id
+kipimo score preds.jsonl          # per-type + overall report
+```
+The harness never calls a model — you generate predictions with whatever system you're evaluating; kipimo only scores. Any lab can publish comparable numbers.
+## Honesty box
+- **v0.1 is a seed set.** 46 tasks establish the format and scoring; breadth comes from contributions.
+- Swahili phrasing is simple-register and **pending native-speaker review** — that is [issue #1](https://github.com/gabrielmahia/kipimo/issues), and corrections are the most valuable contribution possible.
+- Scores measure stack-routing competence, not general Swahili fluency.
+- Dataset: **CC BY 4.0** (usable by everyone, including commercial labs — that's the point). Harness: **MIT**.
+## IP & Collaboration
+MIT-licensed harness, CC BY 4.0 data. Feedback via GitHub Issues only — pull requests are not accepted; task corrections and additions via Issues are actively wanted. Full policy: [docs/architecture/IP_POLICY.md](docs/architecture/IP_POLICY.md). Security: see [SECURITY.md](SECURITY.md).

kipimo-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,32 @@
+# kipimo
+Over one hundred million people coordinate their lives in Swahili, yet no benchmark measures whether an AI agent can route their requests correctly — send money, check drought status, find a clinic, verify a worker's credentials. Agents targeting East Africa are evaluated on English tasks and deployed on faith.
+`kipimo` (Swahili: *a measure*) is a model-agnostic seed benchmark for exactly that gap: **46 tasks** across three types, with golds machine-derived from authoritative sources — the coordination-stack registry and the live `africa-coord-bus` routing table — never from memory.
+| Type | n | What it measures | Metric |
+|---|---|---|---|
+| `server_routing` | 25 | Swahili request → correct stack server (payments, tax, health, land, labour…) | exact |
+| `term_grounding` | 14 | Swahili domain term → English meaning | exact (case-insensitive) |
+| `cascade_routing` | 7 | Coordination event → which sectors must be notified | set F1 |
+## Use it (any model, no API keys)
+```bash
+pip install kipimo
+kipimo tasks > tasks.jsonl        # feed to your agent however you like
+kipimo template > preds.jsonl     # fill "prediction": [...] per id
+kipimo score preds.jsonl          # per-type + overall report
+```
+The harness never calls a model — you generate predictions with whatever system you're evaluating; kipimo only scores. Any lab can publish comparable numbers.
+## Honesty box
+- **v0.1 is a seed set.** 46 tasks establish the format and scoring; breadth comes from contributions.
+- Swahili phrasing is simple-register and **pending native-speaker review** — that is [issue #1](https://github.com/gabrielmahia/kipimo/issues), and corrections are the most valuable contribution possible.
+- Scores measure stack-routing competence, not general Swahili fluency.
+- Dataset: **CC BY 4.0** (usable by everyone, including commercial labs — that's the point). Harness: **MIT**.
+## IP & Collaboration
+MIT-licensed harness, CC BY 4.0 data. Feedback via GitHub Issues only — pull requests are not accepted; task corrections and additions via Issues are actively wanted. Full policy: [docs/architecture/IP_POLICY.md](docs/architecture/IP_POLICY.md). Security: see [SECURITY.md](SECURITY.md).

kipimo-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools>=77", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "kipimo"
+version = "0.1.0"
+description = "Swahili agent-task evaluation suite for the East Africa coordination stack — model-agnostic seed benchmark (46 tasks: server routing, term grounding, cascade routing)."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+license-files = ["LICENSE"]
+authors = [{name = "AfriKaziOS maintainers"}]
+keywords = ["swahili", "benchmark", "evaluation", "mcp", "kenya", "ai-agents"]
+[project.urls]
+Homepage = "https://github.com/gabrielmahia/kipimo"
+Repository = "https://github.com/gabrielmahia/kipimo"
+Issues = "https://github.com/gabrielmahia/kipimo/issues"
+[project.scripts]
+kipimo = "kipimo.cli:_main"
+[tool.setuptools]
+package-dir = {"" = "src"}
+packages = ["kipimo"]
+[tool.setuptools.package-data]
+kipimo = ["data/*.jsonl"]
+[tool.ruff]
+target-version = "py310"
+line-length = 100

kipimo-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

kipimo-0.1.0/src/kipimo/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""kipimo — Swahili agent-task evaluation for the East Africa coordination stack."""
+from .cli import __version__ as __version__
+from .cli import load_tasks as load_tasks
+from .cli import score_file as score_file
+from .cli import score_one as score_one

kipimo-0.1.0/src/kipimo/cli.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""kipimo — Swahili agent-task evaluation for the East Africa coordination stack.
+Model-agnostic by design: kipimo emits tasks and scores prediction files. It
+never calls a model API, so any lab, student, or vendor can evaluate any agent
+against the same gold set. Golds are machine-derived from authoritative
+sources (the stack registry; the africa-coord-bus routing table), never from
+memory.
+Usage:
+    kipimo tasks                 # emit the task set (JSONL) to stdout
+    kipimo template              # emit an empty predictions file to fill in
+    kipimo score preds.jsonl     # score predictions against gold
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from collections import defaultdict
+from importlib import resources
+# Package version string; re-exported from kipimo/__init__.py.
+__version__ = "0.1.0"
+# Caveat text: used as the argparse epilog and written to stderr after every
+# `kipimo score` run.
+DISCLAIMER = ("kipimo v0.1 is a SEED benchmark (46 tasks). Swahili phrasing is "
+              "simple-register and pending native-speaker review (issue #1). "
+              "Scores indicate stack-routing competence, not general Swahili "
+              "fluency. Do not use as a sole deployment gate.")
+def load_tasks() -> list[dict]:
+    """Load the bundled benchmark tasks.
+
+    Reads ``data/kipimo_v0.1.jsonl`` from the installed ``kipimo`` package as
+    UTF-8 and parses every non-blank line as one JSON document.
+
+    Returns:
+        The parsed tasks, in file order.
+    """
+    data_file = resources.files("kipimo").joinpath("data/kipimo_v0.1.jsonl")
+    tasks = []
+    for raw in data_file.read_text("utf-8").splitlines():
+        if not raw.strip():
+            continue
+        tasks.append(json.loads(raw))
+    return tasks
+def _norm(s: str) -> str:
+    """Normalise a value for comparison: lowercase it and collapse whitespace.
+
+    The argument is coerced with ``str()`` first. Leading and trailing
+    whitespace is dropped and each internal run becomes a single space.
+    """
+    return " ".join(str(s).lower().split())
+def score_one(task: dict, pred: list[str]) -> float:
+    """Score one prediction against a task's gold answers.
+
+    Gold and predicted values are compared after ``_norm`` (lowercased,
+    whitespace-collapsed).
+
+    Args:
+        task: Task record; reads ``task["gold"]`` (the accepted answers) and
+            ``task["metric"]`` (``"exact"``, ``"exact_ci"`` or ``"set_f1"``).
+        pred: Predicted answers. ``None`` or an empty list counts as no
+            prediction. A bare string is treated as a one-item list.
+
+    Returns:
+        For ``exact``/``exact_ci``: 1.0 if the first prediction is one of the
+        gold answers, else 0.0. For ``set_f1``: the F1 between the predicted
+        and gold sets, 0.0 when they share nothing.
+
+    Raises:
+        ValueError: If ``task["metric"]`` is not a known metric.
+    """
+    if isinstance(pred, str):
+        # A row like {"prediction": "mpesa-mcp"} would otherwise be iterated
+        # character by character and scored on its first letter.
+        pred = [pred]
+    gold = [_norm(g) for g in task["gold"]]
+    answers = [_norm(x) for x in (pred or [])]
+    metric = task["metric"]
+    if metric in ("exact", "exact_ci"):
+        # Both names score identically: _norm already lowercases both sides.
+        return 1.0 if answers and answers[0] in gold else 0.0
+    if metric == "set_f1":
+        gold_set, pred_set = set(gold), set(answers)
+        hits = len(gold_set & pred_set)
+        # No overlap (which includes an empty prediction or an empty gold
+        # list) scores 0; returning here keeps both divisions below non-zero.
+        if hits == 0:
+            return 0.0
+        precision, recall = hits / len(pred_set), hits / len(gold_set)
+        return 2 * precision * recall / (precision + recall)
+    raise ValueError(f"unknown metric {task['metric']}")
+def score_file(path: str) -> dict:
+    """Score a JSONL predictions file against the bundled gold set.
+
+    Each non-blank line of the file must be a JSON object with an ``"id"``
+    key and, optionally, a ``"prediction"`` key (default: empty list). If an
+    id appears more than once the last row wins. Rows whose id matches no
+    task are ignored.
+
+    Args:
+        path: Path to the predictions file (UTF-8, with or without a BOM).
+
+    Returns:
+        A dict with the mean score per task type (keyed by type name), plus
+        ``"overall"`` (mean over all tasks), ``"n_tasks"`` and ``"n_missing"``
+        (tasks with no prediction row; each is scored 0). Means are rounded
+        to 4 decimal places.
+    """
+    preds = {}
+    # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading BOM,
+    # which json.loads would otherwise reject on the first line.
+    with open(path, encoding="utf-8-sig") as f:
+        for line in f:
+            if line.strip():
+                row = json.loads(line)
+                preds[row["id"]] = row.get("prediction", [])
+    tasks = load_tasks()
+    by_type: dict[str, list[float]] = defaultdict(list)
+    n_missing = 0
+    for t in tasks:
+        if t["id"] in preds:
+            score = score_one(t, preds[t["id"]])
+        else:
+            # An unanswered task still counts towards its type's mean, as 0.
+            n_missing += 1
+            score = 0.0
+        by_type[t["type"]].append(score)
+    report = {k: round(sum(v) / len(v), 4) for k, v in by_type.items()}
+    allv = [x for v in by_type.values() for x in v]
+    report["overall"] = round(sum(allv) / len(allv), 4)
+    report["n_tasks"] = len(tasks)
+    report["n_missing"] = n_missing
+    return report
+def main(argv: list[str] | None = None) -> int:
+    """Parse the command line and run the requested subcommand.
+
+    Subcommands: ``tasks`` prints every task as one JSON line, ``template``
+    prints an empty prediction row per task id, and ``score <file>`` prints
+    the score report as indented JSON (the disclaimer and any
+    missing-prediction note go to stderr).
+
+    Args:
+        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        0 once the subcommand has run.
+    """
+    # __doc__ is None when docstrings are stripped (python -OO); fall back to
+    # an empty description instead of failing on None.split().
+    summary = (__doc__ or "").split("\n")[0]
+    p = argparse.ArgumentParser(prog="kipimo", description=summary,
+                                epilog=DISCLAIMER)
+    sub = p.add_subparsers(dest="cmd", required=True)
+    sub.add_parser("tasks", help="emit task set JSONL to stdout")
+    sub.add_parser("template", help="emit empty predictions JSONL to stdout")
+    sp = sub.add_parser("score", help="score a predictions file")
+    sp.add_argument("predictions", help="JSONL with {id, prediction:[...]} rows")
+    args = p.parse_args(argv)
+    if args.cmd == "tasks":
+        for t in load_tasks():
+            # ensure_ascii=False emits non-ASCII characters as-is rather than
+            # as \u escapes.
+            print(json.dumps(t, ensure_ascii=False))
+    elif args.cmd == "template":
+        for t in load_tasks():
+            print(json.dumps({"id": t["id"], "prediction": []}))
+    else:
+        rep = score_file(args.predictions)
+        print(json.dumps(rep, indent=2))
+        # Caveats go to stderr so stdout stays a single parseable JSON document.
+        print(f"\n{DISCLAIMER}", file=sys.stderr)
+        if rep["n_missing"]:
+            print(f"Note: {rep['n_missing']} task(s) had no prediction and scored 0. "
+                  f"Run `kipimo template` for the full id list.", file=sys.stderr)
+    return 0
+def _main() -> int:
+    """Console-script entry point: run ``main`` and tolerate a closed pipe.
+
+    Returns:
+        The exit status from ``main``, or 0 if writing to stdout failed with
+        BrokenPipeError (for example ``kipimo tasks | head``).
+    """
+    try:
+        status = main()
+        # Flush while still inside the handler: short output sits in the
+        # buffer, so a broken pipe would otherwise only surface when the
+        # interpreter flushes stdout at exit, outside this try block.
+        sys.stdout.flush()
+        return status
+    except BrokenPipeError:
+        import os
+        # Point stdout at devnull so the flush at interpreter exit cannot
+        # raise BrokenPipeError a second time.
+        devnull = os.open(os.devnull, os.O_WRONLY)
+        os.dup2(devnull, sys.stdout.fileno())
+        return 0
+if __name__ == "__main__":
+    # Use the same wrapper as the installed console script so that
+    # `python -m kipimo.cli ... | head` also exits quietly on a closed pipe.
+    raise SystemExit(_main())

kipimo-0.1.0/src/kipimo/data/kipimo_v0.1.jsonl ADDED Viewed

@@ -0,0 +1,46 @@
+{"id": "rt-01", "type": "server_routing", "lang": "sw", "input": "Nataka kutuma pesa kwa mama yangu kupitia simu.", "options": null, "gold": ["mpesa-mcp"], "metric": "exact"}
+{"id": "rt-02", "type": "server_routing", "lang": "sw", "input": "Ninahitaji kujua deni langu la ushuru KRA.", "options": null, "gold": ["kra-mcp"], "metric": "exact"}
+{"id": "rt-03", "type": "server_routing", "lang": "sw", "input": "Nataka mkopo mdogo kwa biashara yangu ya mboga.", "options": null, "gold": ["mkopo-mcp"], "metric": "exact"}
+{"id": "rt-04", "type": "server_routing", "lang": "sw", "input": "Je, kuna bima ya mazao dhidi ya ukame?", "options": null, "gold": ["bima-mcp"], "metric": "exact"}
+{"id": "rt-05", "type": "server_routing", "lang": "sw", "input": "Bei ya mahindi sokoni Nakuru ni ngapi leo?", "options": null, "gold": ["soko-mcp"], "metric": "exact"}
+{"id": "rt-06", "type": "server_routing", "lang": "sw", "input": "Nataka kutuma pesa kutoka Marekani hadi Kenya kwa gharama nafuu.", "options": null, "gold": ["remit-mcp"], "metric": "exact"}
+{"id": "rt-07", "type": "server_routing", "lang": "sw", "input": "Ni wapi hospitali iliyo karibu inayotoa huduma za upasuaji?", "options": null, "gold": ["kenya-health-mcp"], "metric": "exact"}
+{"id": "rt-08", "type": "server_routing", "lang": "sw", "input": "Mtoto wangu ana homa na kikohozi, nifanye nini?", "options": null, "gold": ["afya-mcp"], "metric": "exact"}
+{"id": "rt-09", "type": "server_routing", "lang": "sw", "input": "Ninahisi huzuni sana na msongo wa mawazo, nipate msaada wapi?", "options": null, "gold": ["afya-ya-akili-mcp"], "metric": "exact"}
+{"id": "rt-10", "type": "server_routing", "lang": "sw", "input": "Nipande mahindi lini msimu huu Uasin Gishu?", "options": null, "gold": ["kilimo-mcp"], "metric": "exact"}
+{"id": "rt-11", "type": "server_routing", "lang": "sw", "input": "Hali ya ukame kaunti ya Baringo ikoje sasa?", "options": null, "gold": ["wapimaji-mcp"], "metric": "exact"}
+{"id": "rt-12", "type": "server_routing", "lang": "sw", "input": "Nahitaji kibali cha mazingira kwa kiwanda changu kidogo.", "options": null, "gold": ["mazingira-mcp"], "metric": "exact"}
+{"id": "rt-13", "type": "server_routing", "lang": "sw", "input": "Huduma za kaunti ya Machakos ni zipi na bajeti yake?", "options": null, "gold": ["county-mcp"], "metric": "exact"}
+{"id": "rt-14", "type": "server_routing", "lang": "sw", "input": "Nifanye nini kupata cheti cha kuzaliwa? Fomu gani inahitajika?", "options": null, "gold": ["fomu-mcp"], "metric": "exact"}
+{"id": "rt-15", "type": "server_routing", "lang": "sw", "input": "Mwajiri wangu amenifukuza bila notisi. Haki zangu ni zipi?", "options": null, "gold": ["haki-ya-kazi-mcp"], "metric": "exact"}
+{"id": "rt-16", "type": "server_routing", "lang": "sw", "input": "Natafuta kazi ya udereva Nairobi.", "options": null, "gold": ["kazi-mcp"], "metric": "exact"}
+{"id": "rt-17", "type": "server_routing", "lang": "sw", "input": "Shule za sekondari Kiambu na matokeo ya KCSE.", "options": null, "gold": ["elimu-mcp"], "metric": "exact"}
+{"id": "rt-18", "type": "server_routing", "lang": "sw", "input": "Tafsiri sentensi hii kwa Kiingereza tafadhali.", "options": null, "gold": ["tafsiri-mcp"], "metric": "exact"}
+{"id": "rt-19", "type": "server_routing", "lang": "sw", "input": "Nataka kuhakiki ujuzi na vyeti vya mfanyakazi ninayemwajiri.", "options": null, "gold": ["sifa-mcp"], "metric": "exact"}
+{"id": "rt-20", "type": "server_routing", "lang": "sw", "input": "Nyaraka za shamba langu Kiambu — nithibitishe umiliki.", "options": null, "gold": ["ardhi-mcp"], "metric": "exact"}
+{"id": "rt-21", "type": "server_routing", "lang": "sw", "input": "Nyumba ya kupanga Ruaka bei gani kwa mwezi?", "options": null, "gold": ["nyumba-mcp"], "metric": "exact"}
+{"id": "rt-22", "type": "server_routing", "lang": "sw", "input": "Njia za matatu kutoka CBD hadi Rongai?", "options": null, "gold": ["usafiri-mcp"], "metric": "exact"}
+{"id": "rt-23", "type": "server_routing", "lang": "sw", "input": "Niko diaspora — nasimamia vipi mali yangu Kenya nikiwa nje?", "options": null, "gold": ["diaspora-mcp"], "metric": "exact"}
+{"id": "rt-24", "type": "server_routing", "lang": "sw", "input": "Tunataka kuanzisha chama cha kuweka akiba mtaani.", "options": null, "gold": ["jumuia-mcp"], "metric": "exact"}
+{"id": "rt-25", "type": "server_routing", "lang": "sw", "input": "Umeme haujafika kijiji chetu — kuna mpango gani wa nishati?", "options": null, "gold": ["nishati-mcp"], "metric": "exact"}
+{"id": "tm-01", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'mkopo' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["credit/loan"], "metric": "exact_ci"}
+{"id": "tm-02", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'bima' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["insurance"], "metric": "exact_ci"}
+{"id": "tm-03", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'faida' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["profit/returns"], "metric": "exact_ci"}
+{"id": "tm-04", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'soko' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["market"], "metric": "exact_ci"}
+{"id": "tm-05", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'afya' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["health"], "metric": "exact_ci"}
+{"id": "tm-06", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'afya ya akili' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["mental health"], "metric": "exact_ci"}
+{"id": "tm-07", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'kilimo' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["agriculture"], "metric": "exact_ci"}
+{"id": "tm-08", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'wapimaji' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["water"], "metric": "exact_ci"}
+{"id": "tm-09", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'mazingira' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["environment"], "metric": "exact_ci"}
+{"id": "tm-10", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'fomu' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["forms"], "metric": "exact_ci"}
+{"id": "tm-11", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'haki ya kazi' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["labour rights"], "metric": "exact_ci"}
+{"id": "tm-12", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'kazi' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["work"], "metric": "exact_ci"}
+{"id": "tm-13", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'habari' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["news"], "metric": "exact_ci"}
+{"id": "tm-14", "type": "term_grounding", "lang": "sw", "input": "Neno la Kiswahili 'elimu' linamaanisha nini kwa Kiingereza (neno moja au mawili)?", "gold": ["education"], "metric": "exact_ci"}
+{"id": "cas-01", "type": "cascade_routing", "lang": "sw", "input": "Ukame umefikia hatua ya tahadhari Baringo. Taarifa hii inapaswa kufika kwenye sekta zipi?", "event": {"domain": "water", "event_type": "drought_alert", "severity": "alert"}, "gold": ["agriculture", "finance", "health"], "metric": "set_f1"}
+{"id": "cas-02", "type": "cascade_routing", "lang": "sw", "input": "Ukame ni hatari sana Turkana. Sekta zipi zinapaswa kuarifiwa?", "event": {"domain": "water", "event_type": "drought_alert", "severity": "critical"}, "gold": ["agriculture", "finance", "health"], "metric": "set_f1"}
+{"id": "cas-03", "type": "cascade_routing", "lang": "sw", "input": "Mlipuko wa kipindupindu Kisumu. Taarifa ipelekwe kwenye sekta zipi?", "event": {"domain": "health", "event_type": "disease_outbreak", "severity": "alert"}, "gold": ["civic", "procurement"], "metric": "set_f1"}
+{"id": "cas-04", "type": "cascade_routing", "lang": "sw", "input": "Onyo la mafuriko bonde la Tana. Sekta zipi zihusishwe?", "event": {"domain": "water", "event_type": "flood_alert", "severity": "warning"}, "gold": ["civic", "health"], "metric": "set_f1"}
+{"id": "cas-05", "type": "cascade_routing", "lang": "sw", "input": "Mafuriko makubwa Tana River. Nani apokee taarifa?", "event": {"domain": "water", "event_type": "flood_alert", "severity": "critical"}, "gold": ["civic", "health"], "metric": "set_f1"}
+{"id": "cas-06", "type": "cascade_routing", "lang": "sw", "input": "Mlipuko mkubwa wa ugonjwa. Sekta zipi zinahusika?", "event": {"domain": "health", "event_type": "disease_outbreak", "severity": "critical"}, "gold": ["civic", "procurement"], "metric": "set_f1"}
+{"id": "cas-07", "type": "cascade_routing", "lang": "sw", "input": "Dalili za ukame zimeanza Kajiado. Wapi taarifa ipelekwe?", "event": {"domain": "water", "event_type": "drought_alert", "severity": "warning"}, "gold": ["agriculture", "finance"], "metric": "set_f1"}

kipimo-0.1.0/src/kipimo.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: kipimo
+Version: 0.1.0
+Summary: Swahili agent-task evaluation suite for the East Africa coordination stack — model-agnostic seed benchmark (46 tasks: server routing, term grounding, cascade routing).
+Author: AfriKaziOS maintainers
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/gabrielmahia/kipimo
+Project-URL: Repository, https://github.com/gabrielmahia/kipimo
+Project-URL: Issues, https://github.com/gabrielmahia/kipimo/issues
+Keywords: swahili,benchmark,evaluation,mcp,kenya,ai-agents
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: license-file
+# kipimo
+Over one hundred million people coordinate their lives in Swahili, yet no benchmark measures whether an AI agent can route their requests correctly — send money, check drought status, find a clinic, verify a worker's credentials. Agents targeting East Africa are evaluated on English tasks and deployed on faith.
+`kipimo` (Swahili: *a measure*) is a model-agnostic seed benchmark for exactly that gap: **46 tasks** across three types, with golds machine-derived from authoritative sources — the coordination-stack registry and the live `africa-coord-bus` routing table — never from memory.
+| Type | n | What it measures | Metric |
+|---|---|---|---|
+| `server_routing` | 25 | Swahili request → correct stack server (payments, tax, health, land, labour…) | exact |
+| `term_grounding` | 14 | Swahili domain term → English meaning | exact (case-insensitive) |
+| `cascade_routing` | 7 | Coordination event → which sectors must be notified | set F1 |
+## Use it (any model, no API keys)
+```bash
+pip install kipimo
+kipimo tasks > tasks.jsonl        # feed to your agent however you like
+kipimo template > preds.jsonl     # fill "prediction": [...] per id
+kipimo score preds.jsonl          # per-type + overall report
+```
+The harness never calls a model — you generate predictions with whatever system you're evaluating; kipimo only scores. Any lab can publish comparable numbers.
+## Honesty box
+- **v0.1 is a seed set.** 46 tasks establish the format and scoring; breadth comes from contributions.
+- Swahili phrasing is simple-register and **pending native-speaker review** — that is [issue #1](https://github.com/gabrielmahia/kipimo/issues), and corrections are the most valuable contribution possible.
+- Scores measure stack-routing competence, not general Swahili fluency.
+- Dataset: **CC BY 4.0** (usable by everyone, including commercial labs — that's the point). Harness: **MIT**.
+## IP & Collaboration
+MIT-licensed harness, CC BY 4.0 data. Feedback via GitHub Issues only — pull requests are not accepted; task corrections and additions via Issues are actively wanted. Full policy: [docs/architecture/IP_POLICY.md](docs/architecture/IP_POLICY.md). Security: see [SECURITY.md](SECURITY.md).

kipimo-0.1.0/src/kipimo.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+LICENSE
+README.md
+pyproject.toml
+src/kipimo/__init__.py
+src/kipimo/cli.py
+src/kipimo.egg-info/PKG-INFO
+src/kipimo.egg-info/SOURCES.txt
+src/kipimo.egg-info/dependency_links.txt
+src/kipimo.egg-info/entry_points.txt
+src/kipimo.egg-info/top_level.txt
+src/kipimo/data/kipimo_v0.1.jsonl
+tests/test_kipimo.py
+tests/test_smoke.py

kipimo-0.1.0/src/kipimo.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

kipimo-0.1.0/src/kipimo.egg-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+kipimo = kipimo.cli:_main

kipimo-0.1.0/src/kipimo.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+kipimo

kipimo-0.1.0/tests/test_kipimo.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Integrity + scoring tests for the kipimo seed benchmark."""
+import json
+import subprocess
+import sys
+from kipimo import load_tasks, score_one
+def test_dataset_integrity():
+    """The bundled dataset has 46 well-formed tasks with unique ids."""
+    known_types = ("server_routing", "term_grounding", "cascade_routing")
+    known_metrics = ("exact", "exact_ci", "set_f1")
+    tasks = load_tasks()
+    assert len(tasks) == 46
+    ids = [task["id"] for task in tasks]
+    assert len(set(ids)) == len(ids), "duplicate ids"
+    for task in tasks:
+        assert task["type"] in known_types
+        assert task["gold"], task["id"]
+        assert task["metric"] in known_metrics
+        assert task["input"].strip()
+def test_exact_scoring():
+    """Exact-match scoring compares the first prediction after normalisation."""
+    task = {"gold": ["mpesa-mcp"], "metric": "exact"}
+    cases = [
+        (["mpesa-mcp"], 1.0),
+        (["MPESA-MCP"], 1.0),  # case is normalised away
+        (["kra-mcp"], 0.0),
+        ([], 0.0),
+    ]
+    for pred, expected in cases:
+        assert score_one(task, pred) == expected
+def test_set_f1_scoring():
+    """Set-F1: 1 for a full match, partial credit for a subset, 0 for no overlap."""
+    task = {"gold": ["health", "finance"], "metric": "set_f1"}
+    full = score_one(task, ["health", "finance"])
+    partial = score_one(task, ["health"])
+    disjoint = score_one(task, ["water"])
+    assert full == 1.0
+    assert 0 < partial < 1.0
+    assert disjoint == 0.0
+def test_cli_end_to_end(tmp_path):
+    """Run the CLI as a subprocess: `template` lists every id, gold scores 1.0."""
+    tasks = load_tasks()
+    r = subprocess.run([sys.executable, "-m", "kipimo.cli", "template"],
+                       capture_output=True, text=True)
+    assert r.returncode == 0, r.stderr
+    # The template must offer exactly one row per task id, in task order.
+    template_ids = [json.loads(line)["id"] for line in r.stdout.splitlines() if line.strip()]
+    assert template_ids == [t["id"] for t in tasks]
+    preds = tmp_path / "p.jsonl"
+    # perfect predictions = gold copied in
+    lines = [json.dumps({"id": t["id"], "prediction": t["gold"]}) for t in tasks]
+    preds.write_text("\n".join(lines), encoding="utf-8")
+    r = subprocess.run([sys.executable, "-m", "kipimo.cli", "score", str(preds)],
+                       capture_output=True, text=True)
+    # Check the exit status first so a crash shows the CLI's stderr rather
+    # than a JSON decode error on empty stdout.
+    assert r.returncode == 0, r.stderr
+    rep = json.loads(r.stdout)
+    assert rep["overall"] == 1.0 and rep["n_missing"] == 0

kipimo-0.1.0/tests/test_smoke.py ADDED Viewed

@@ -0,0 +1,12 @@
+import ast
+import pathlib
+def test_all_sources_parse():
+    """Every ``.py`` file under the sibling ``src`` directory parses as Python."""
+    src_dir = pathlib.Path(__file__).parent.parent / "src"
+    for source in src_dir.rglob("*.py"):
+        code = source.read_text(encoding="utf-8")
+        ast.parse(code)
+def test_package_importable():
+    """Importing the top-level ``kipimo`` package must not raise."""
+    __import__("kipimo")