quizforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quizforge/__init__.py ADDED
@@ -0,0 +1,104 @@
1
+ """quizforge — generate a deep, mixed-format question bank from source material
2
+ and grade it. Deterministic where it can, LLM where it must. Bring your own
3
+ chat model.
4
+
5
+ Public API
6
+ ----------
7
+ Generate:
8
+ generate_bank(material, llm, *, targets=..., existing=...) -> list[dict]
9
+ Sample:
10
+ sample_test(questions, blueprint=..., seen_ids=...) -> list[dict]
11
+ pick_spread(pool, want, seen_ids) -> list[dict]
12
+ DEFAULT_BLUEPRINT
13
+ Grade:
14
+ grade_fill_blank(question, user_answer) -> dict
15
+ grade_match(question, selections) -> dict
16
+ grade_open_answer(question, user_answer, llm) -> QuizGrade | None
17
+ QuizGrade
18
+ Integrity:
19
+ assess_speed(*, elapsed_seconds, question_types, passed) -> IntegrityFlag
20
+ expected_min_seconds(question_types) -> float
21
+ within_time_limit(elapsed_seconds, limit_seconds=...) -> bool
22
+ IntegrityFlag
23
+ Certificate:
24
+ make_certificate(*, learner_id, ..., score_pct, awarded_on) -> Certificate
25
+ verification_code(...) -> str | verify(certificate) -> bool
26
+ Certificate
27
+ Utilities:
28
+ normalize(text) -> str
29
+ structured_output(llm, schema) -> chain
30
+ """
31
+
32
+ from .bank import Bank
33
+ from .certificate import (
34
+ Certificate,
35
+ is_eligible,
36
+ level_for,
37
+ make_certificate,
38
+ verification_code,
39
+ verify,
40
+ )
41
+ from .generate import (
42
+ DEFAULT_COVERAGE,
43
+ DEFAULT_DIFF_SPLIT,
44
+ DEFAULT_SYSTEM,
45
+ DEFAULT_TARGETS,
46
+ generate_bank,
47
+ )
48
+ from .grade import (
49
+ DEFAULT_GRADE_SYSTEM,
50
+ QuizGrade,
51
+ grade_fill_blank,
52
+ grade_match,
53
+ grade_open_answer,
54
+ )
55
+ from .integrity import (
56
+ DEFAULT_MIN_SECONDS_PER_TYPE,
57
+ DEFAULT_TIME_LIMIT_SECONDS,
58
+ IntegrityFlag,
59
+ assess_speed,
60
+ expected_min_seconds,
61
+ within_time_limit,
62
+ )
63
+ from .llm import structured_output
64
+ from .sample import DEFAULT_BLUEPRINT, DIFFICULTY_ORDER, pick_spread, sample_test
65
+ from .schemas import DIFFICULTIES, FORMAT_KEYS, FORMATS
66
+ from .text import normalize
67
+
68
+ __version__ = "0.2.0"
69
+
70
+ __all__ = [
71
+ "Bank",
72
+ "generate_bank",
73
+ "sample_test",
74
+ "pick_spread",
75
+ "grade_fill_blank",
76
+ "grade_match",
77
+ "grade_open_answer",
78
+ "QuizGrade",
79
+ "assess_speed",
80
+ "expected_min_seconds",
81
+ "within_time_limit",
82
+ "IntegrityFlag",
83
+ "DEFAULT_MIN_SECONDS_PER_TYPE",
84
+ "DEFAULT_TIME_LIMIT_SECONDS",
85
+ "Certificate",
86
+ "make_certificate",
87
+ "verification_code",
88
+ "verify",
89
+ "is_eligible",
90
+ "level_for",
91
+ "normalize",
92
+ "structured_output",
93
+ "DEFAULT_BLUEPRINT",
94
+ "DEFAULT_TARGETS",
95
+ "DEFAULT_DIFF_SPLIT",
96
+ "DEFAULT_SYSTEM",
97
+ "DEFAULT_COVERAGE",
98
+ "DEFAULT_GRADE_SYSTEM",
99
+ "DIFFICULTY_ORDER",
100
+ "DIFFICULTIES",
101
+ "FORMAT_KEYS",
102
+ "FORMATS",
103
+ "__version__",
104
+ ]
quizforge/bank.py ADDED
@@ -0,0 +1,89 @@
1
+ """Bank — a YAML-backed question bank with grow/sample/save convenience.
2
+
3
+ A bank file is YAML with (at least) a ``questions:`` list; any other top-level
4
+ keys are metadata, carried through untouched on save. An optional ``material:``
5
+ key holds the source text used to ground generation (or pass it explicitly).
6
+
7
+ Comments are NOT preserved on save (clean round-trip via PyYAML). If you keep a
8
+ heavily-commented bank file, generate into memory and write where you control
9
+ the formatting instead of calling ``save``.
10
+ """
11
+
12
+ from collections import Counter
13
+ from pathlib import Path
14
+ from typing import List, Optional, Union
15
+
16
+ import yaml
17
+
18
+ from .generate import generate_bank
19
+ from .sample import sample_test
20
+
21
+ PathLike = Union[str, Path]
22
+
23
+
24
class Bank:
    """Load, grow, sample, and save a YAML-backed question bank.

    Attributes:
        questions: the bank's questions as plain dicts (a shallow copy of the
            list handed to the constructor).
        meta: every other top-level key of the bank file; written back
            unchanged by :meth:`save`.
    """

    def __init__(self, questions: Optional[List[dict]] = None, meta: Optional[dict] = None):
        # Copy both containers so later mutation never aliases the caller's objects.
        self.questions: List[dict] = list(questions or [])
        self.meta: dict = dict(meta or {})

    # ----- construction -----------------------------------------------------
    @classmethod
    def load(cls, path: "PathLike") -> "Bank":
        """Read a bank from the YAML file at ``path`` (decoded as UTF-8).

        Raises:
            ValueError: if the document's top level is not a mapping.
        """
        # Explicit UTF-8: ``save`` writes raw unicode (allow_unicode=True), so
        # relying on the locale's default encoding would break the round-trip
        # on platforms whose default is not UTF-8.
        data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
        if not isinstance(data, dict):
            raise ValueError(f"{path}: expected a YAML mapping, got {type(data).__name__}")
        questions = data.pop("questions", []) or []
        return cls(questions=questions, meta=data)

    @classmethod
    def from_dict(cls, data: dict) -> "Bank":
        """Build a bank from an already-parsed mapping without mutating it."""
        data = dict(data or {})
        return cls(questions=data.pop("questions", []) or [], meta=data)

    # ----- inspection -------------------------------------------------------
    @property
    def material(self) -> str:
        """The ``material`` metadata value, or ``""`` when absent/empty."""
        return self.meta.get("material", "") or ""

    def counts_by_type(self) -> dict:
        """Question count per ``type`` (a missing type counts as ``"mc"``)."""
        return dict(Counter(q.get("type", "mc") for q in self.questions))

    def counts_by_difficulty(self) -> dict:
        """Question count per ``difficulty`` (missing counts as ``"medium"``)."""
        return dict(Counter(q.get("difficulty", "medium") for q in self.questions))

    def __len__(self) -> int:
        return len(self.questions)

    # ----- mutation ---------------------------------------------------------
    def add(self, new_questions: List[dict]) -> "Bank":
        """Append ``new_questions`` in place and return ``self`` for chaining."""
        self.questions.extend(new_questions)
        return self

    def grow(self, llm, *, targets: Optional[dict] = None,
             material: Optional[str] = None, **kwargs) -> List[dict]:
        """Generate the shortfall to reach ``targets`` and append it in place.

        Grounds on ``material`` if given, else the bank's ``material`` metadata.
        Extra keyword arguments are forwarded to ``generate_bank``.

        Returns:
            Just the newly added questions.

        Raises:
            ValueError: if there is no non-blank material to ground on.
        """
        mat = material if material is not None else self.material
        if not (mat or "").strip():
            raise ValueError("no material to ground generation — pass material=... "
                             "or set a 'material:' key in the bank file")
        new = generate_bank(mat, llm, targets=targets, existing=self.questions, **kwargs)
        self.add(new)
        return new

    def sample(self, blueprint: Optional[dict] = None, seen_ids=(), rng=None) -> List[dict]:
        """Draw one test from the bank via ``sample_test``."""
        return sample_test(self.questions, blueprint=blueprint, seen_ids=seen_ids, rng=rng)

    # ----- persistence ------------------------------------------------------
    def to_dict(self) -> dict:
        """Metadata keys first, then ``questions`` (a ``questions`` key in meta is overridden)."""
        return {**self.meta, "questions": self.questions}

    def save(self, path: "PathLike") -> None:
        """Write the bank to ``path`` as UTF-8 YAML (comments are not preserved)."""
        # allow_unicode=True emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 or the write can fail under other locales.
        with open(path, "w", encoding="utf-8") as f:
            yaml.safe_dump(self.to_dict(), f, sort_keys=False, allow_unicode=True,
                           width=100, default_flow_style=False)
@@ -0,0 +1,84 @@
1
+ """Completion certificates — the data + a tamper-evident verification code.
2
+
3
+ quizforge owns the *facts* of a certificate (who, which topic, what level, when)
4
+ and a deterministic verification code derived from them, so any consumer can
5
+ re-derive the code to confirm a certificate wasn't altered. Rendering — PDF,
6
+ HTML, image — is deliberately left to the consumer, where the branding lives, so
7
+ this module stays dependency-light (pydantic only) and clockless (the caller
8
+ supplies ``awarded_on``, keeping generation reproducible).
9
+ """
10
+
11
+ import hashlib
12
+ from typing import Optional
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+ PASS = "passed"
17
+ DISTINCTION = "distinction"
18
+ _SEP = "\x1f" # unit separator — unlikely to appear in any field
19
+
20
+
21
class Certificate(BaseModel):
    """An earned completion certificate's facts + its verification code.

    Plain data only. The module-level ``verification_code`` function derives
    the code from ``learner_id``, ``topic_id``, ``awarded_on``, ``level`` and
    ``score_pct`` (plus an optional secret); ``learner_name`` and
    ``topic_title`` are display fields and are not inputs to that code.
    """

    learner_id: str = Field(description="Stable learner identity (e.g. email).")
    learner_name: str = Field(description="Display name to print on the certificate.")
    topic_id: str = Field(description="Topic/lesson identifier.")
    topic_title: str = Field(description="Human-readable topic title.")
    score_pct: int = Field(description="Best score as a 0-100 percentage.")
    level: str = Field(description="'passed' or 'distinction'.")
    awarded_on: str = Field(description="Award date as the caller's display string.")
    verification_code: str = Field(description="Deterministic, tamper-evident code.")
32
+
33
+
34
def verification_code(*, learner_id: str, topic_id: str, awarded_on: str, level: str,
                      score_pct: int, secret: str = "", prefix: str = "QF") -> str:
    """Derive a short, stable verification code from a certificate's fields.

    The fields (with ``learner_id`` lower-cased and stripped) and ``secret``
    are joined with the module separator, hashed with SHA-256, and the first
    eight upper-case hex digits are formatted as ``<prefix>-XXXX-XXXX``.
    Identical inputs always give the same code; a change to any input changes
    the digest the code is cut from.
    """
    fields = (
        learner_id.lower().strip(),
        topic_id,
        awarded_on,
        level,
        str(score_pct),
        secret,
    )
    payload = _SEP.join(fields).encode("utf-8")
    hex_digest = hashlib.sha256(payload).hexdigest().upper()
    first_group, second_group = hex_digest[:4], hex_digest[4:8]
    return f"{prefix}-{first_group}-{second_group}"
46
+
47
+
48
def is_eligible(score_pct: int, pass_threshold: float = 0.6) -> bool:
    """Return True when ``score_pct`` (0-100) meets ``pass_threshold`` (0-1)."""
    fraction = score_pct / 100.0
    return fraction >= pass_threshold
51
+
52
+
53
def level_for(score_pct: int, distinction_threshold: float = 0.8) -> str:
    """Map a 0-100 score to DISTINCTION (at/above the threshold) or PASS."""
    if score_pct / 100.0 >= distinction_threshold:
        return DISTINCTION
    return PASS
56
+
57
+
58
def make_certificate(*, learner_id: str, learner_name: str, topic_id: str, topic_title: str,
                     score_pct: int, awarded_on: str, pass_threshold: float = 0.6,
                     distinction_threshold: float = 0.8, secret: str = "",
                     prefix: str = "QF") -> Certificate:
    """Assemble the :class:`Certificate` for a passing score.

    The level comes from ``level_for`` and the code from
    ``verification_code`` using the same ``secret``/``prefix``.

    Raises:
        ValueError: when ``score_pct`` does not reach ``pass_threshold``.
    """
    eligible = is_eligible(score_pct, pass_threshold)
    if not eligible:
        raise ValueError(f"score {score_pct}% is below the pass threshold "
                         f"{round(pass_threshold * 100)}% — no certificate")

    earned_level = level_for(score_pct, distinction_threshold)
    derived_code = verification_code(
        learner_id=learner_id,
        topic_id=topic_id,
        awarded_on=awarded_on,
        level=earned_level,
        score_pct=score_pct,
        secret=secret,
        prefix=prefix,
    )
    return Certificate(
        learner_id=learner_id,
        learner_name=learner_name,
        topic_id=topic_id,
        topic_title=topic_title,
        score_pct=score_pct,
        level=earned_level,
        awarded_on=awarded_on,
        verification_code=derived_code,
    )
76
+
77
+
78
def verify(certificate: Certificate, secret: str = "", prefix: str = "QF") -> bool:
    """Recompute the certificate's code from its own fields and compare.

    Returns True only when the recomputed code equals the stored
    ``verification_code``; ``secret`` and ``prefix`` must match the values
    used when the certificate was issued.
    """
    recomputed = verification_code(
        learner_id=certificate.learner_id,
        topic_id=certificate.topic_id,
        awarded_on=certificate.awarded_on,
        level=certificate.level,
        score_pct=certificate.score_pct,
        secret=secret,
        prefix=prefix,
    )
    stored = certificate.verification_code
    return recomputed == stored
quizforge/cli.py ADDED
@@ -0,0 +1,134 @@
1
+ """quizforge command line.
2
+
3
+ quizforge grow BANK [--material-file F] [--mc N ...] [--dry-run] --llm mod:factory
4
+ quizforge sample BANK [--seed N] [--json]
5
+ quizforge stats BANK
6
+
7
+ `grow` needs a chat model. Because quizforge is provider-neutral, you point it at
8
+ one you construct:
9
+
10
+ # a dotted path to a zero-arg callable returning a LangChain-style model
11
+ quizforge grow bank.yaml --llm myproject.models:make_llm
12
+
13
+ # or, if langchain-openai is installed and OPENAI_API_KEY is set
14
+ quizforge grow bank.yaml --openai-model gpt-4.1
15
+ """
16
+
17
+ import argparse
18
+ import importlib
19
+ import json
20
+ import logging
21
+ import random
22
+ import sys
23
+ from collections import Counter
24
+ from typing import Optional
25
+
26
+ from .bank import Bank
27
+ from .generate import DEFAULT_TARGETS
28
+ from .schemas import FORMAT_KEYS
29
+
30
+
31
+ def _resolve_llm(args):
32
+ """Build the chat model from --llm dotted path or --openai-model."""
33
+ if args.llm:
34
+ module_path, _, attr = args.llm.partition(":")
35
+ if not module_path or not attr:
36
+ raise SystemExit("--llm must be 'module.path:callable' (a zero-arg factory)")
37
+ factory = getattr(importlib.import_module(module_path), attr)
38
+ return factory()
39
+ if args.openai_model:
40
+ try:
41
+ from langchain_openai import ChatOpenAI
42
+ except ImportError:
43
+ raise SystemExit("--openai-model needs 'pip install langchain-openai'")
44
+ return ChatOpenAI(model=args.openai_model, temperature=args.temperature)
45
+ raise SystemExit("grow needs a model: pass --llm module:factory or --openai-model NAME")
46
+
47
+
48
def _cmd_grow(args) -> int:
    """Handle ``quizforge grow``: generate the shortfall and (unless dry-run) save.

    Only formats whose flag was actually given are passed as targets; the
    generator's defaults apply to the rest. Returns the process exit code (0).
    """
    bank = Bank.load(args.bank)
    targets = {f: getattr(args, f) for f in FORMAT_KEYS if getattr(args, f) is not None}
    material = None
    if args.material_file:
        # Context manager closes the handle promptly; explicit UTF-8 avoids
        # depending on the platform's default text encoding.
        with open(args.material_file, encoding="utf-8") as fh:
            material = fh.read()

    llm = _resolve_llm(args)
    before = len(bank)
    new = bank.grow(llm, targets=targets or None, material=material,
                    batch_size=args.batch, coverage=args.coverage or None)
    print(f"Generated {len(new)} new questions ({dict(Counter(q['type'] for q in new))}).")
    if args.dry_run:
        # grow() already appended to the in-memory bank, so report from `before`.
        print(f"--dry-run: not writing. Bank would grow {before} -> {before + len(new)}.")
        for q in new[:3]:
            print(f" [{q['type']}/{q['difficulty']}] {q['prompt'][:90]}")
        return 0
    bank.save(args.bank)
    print(f"Wrote {args.bank}. Bank now {len(bank)} (was {before}).")
    return 0
68
+
69
+
70
def _cmd_sample(args) -> int:
    """Handle ``quizforge sample``: draw one test and print it.

    With ``--seed`` the draw uses a seeded ``random.Random``; with ``--json``
    the questions are dumped as JSON, otherwise as a numbered summary.
    Returns the process exit code (0).
    """
    bank = Bank.load(args.bank)
    rng = None
    if args.seed is not None:
        rng = random.Random(args.seed)
    drawn = bank.sample(rng=rng)

    if not args.json:
        by_type = Counter(item.get("type", "mc") for item in drawn)
        print(f"Sampled {len(drawn)} questions: {dict(by_type)}")
        for number, item in enumerate(drawn, start=1):
            print(f" {number:2d}. [{item.get('type')}/{item.get('difficulty')}] {item.get('prompt', '')[:90]}")
        return 0

    json.dump(drawn, sys.stdout, indent=2, ensure_ascii=False)
    print()  # json.dump writes no trailing newline
    return 0
83
+
84
+
85
def _cmd_stats(args) -> int:
    """Handle ``quizforge stats``: print size and format/difficulty mix.

    The material length line is printed only when the bank has material.
    Returns the process exit code (0).
    """
    loaded = Bank.load(args.bank)
    print(f"Bank: {args.bank}")
    total = len(loaded)
    print(f" total questions : {total}")
    per_format = loaded.counts_by_type()
    print(f" by format : {per_format}")
    per_difficulty = loaded.counts_by_difficulty()
    print(f" by difficulty : {per_difficulty}")
    if not loaded.material:
        return 0
    print(f" material : {len(loaded.material)} chars")
    return 0
94
+
95
+
96
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``quizforge`` argument parser.

    Three required subcommands are registered — ``grow``, ``sample`` and
    ``stats`` — each storing its handler on the namespace as ``func``.
    ``grow`` additionally gets one ``--<format>`` integer option per entry in
    ``FORMAT_KEYS`` (underscores shown as dashes).
    """
    parser = argparse.ArgumentParser(
        prog="quizforge",
        description="Generate, sample, and grade mixed-format question banks.",
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    # --- grow ---------------------------------------------------------------
    grow = commands.add_parser("grow", help="generate the shortfall to reach per-format targets")
    grow.add_argument("bank", help="path to the bank YAML file")
    grow.add_argument("--material-file",
                      help="text file to ground generation (overrides bank 'material:')")
    grow.add_argument("--batch", type=int, default=8,
                      help="questions per LLM call (default 8)")
    grow.add_argument("--coverage", default=None,
                      help="coverage steering hint for the generator")
    grow.add_argument("--dry-run", action="store_true",
                      help="generate and summarize, but don't write")
    grow.add_argument("--llm",
                      help="dotted path 'module:callable' to a zero-arg model factory")
    grow.add_argument("--openai-model",
                      help="convenience: build ChatOpenAI with this model name")
    grow.add_argument("--temperature", type=float, default=0.4,
                      help="temperature for --openai-model")
    for fmt_key in FORMAT_KEYS:
        flag = f"--{fmt_key.replace('_', '-')}"
        grow.add_argument(
            flag, dest=fmt_key, type=int, default=None,
            help=f"target TOTAL {fmt_key} questions (default {DEFAULT_TARGETS[fmt_key]})",
        )
    grow.set_defaults(func=_cmd_grow)

    # --- sample -------------------------------------------------------------
    sample = commands.add_parser("sample", help="draw one mixed-format test from the bank")
    sample.add_argument("bank")
    sample.add_argument("--seed", type=int, default=None,
                        help="rng seed for a reproducible draw")
    sample.add_argument("--json", action="store_true",
                        help="emit the sampled questions as JSON")
    sample.set_defaults(func=_cmd_sample)

    # --- stats --------------------------------------------------------------
    stats = commands.add_parser("stats", help="show bank size and format/difficulty mix")
    stats.add_argument("bank")
    stats.set_defaults(func=_cmd_stats)

    return parser
125
+
126
+
127
def main(argv: Optional[list] = None) -> int:
    """CLI entry point: configure logging, parse ``argv``, run the subcommand.

    ``argv=None`` lets argparse read ``sys.argv``. Returns whatever the
    selected subcommand handler returns.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = build_parser()
    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
131
+
132
+
133
+ if __name__ == "__main__":
134
+ raise SystemExit(main())
quizforge/generate.py ADDED
@@ -0,0 +1,193 @@
1
+ """Generation — grow a question bank from source material with an injected LLM.
2
+
3
+ ``generate_bank`` drafts questions strictly from the material you provide,
4
+ validates each one, dedupes against what's already in the bank (and within the
5
+ run), assigns sequential ids, and returns the NEW questions as plain dicts. It
6
+ never writes files and never duplicates an existing prompt, so it's safe to
7
+ re-run to "top up" a bank to target sizes.
8
+ """
9
+
10
+ import logging
11
+ from collections import Counter
12
+ from typing import List, Optional
13
+
14
+ from .llm import structured_output
15
+ from .schemas import DIFFICULTIES, FORMATS
16
+ from .text import normalize
17
+
18
+ logger = logging.getLogger("quizforge")
19
+
20
+ # Default TOTAL bank size per format — comfortably deep for a 20-question test
21
+ # (default blueprint draws mc8/fill4/match2/short4/freetext2) so unseen
22
+ # questions last across many retakes.
23
+ DEFAULT_TARGETS = {"mc": 40, "fill_blank": 20, "match": 12, "short": 16, "freetext": 12}
24
+ DEFAULT_BATCH = 8
25
+ DEFAULT_DIFF_SPLIT = {"easy": 0.30, "medium": 0.45, "hard": 0.25}
26
+
27
+ DEFAULT_SYSTEM = (
28
+ "You are an expert quiz-content author. Write questions STRICTLY grounded in the provided "
29
+ "material — do not invent facts beyond it. Questions must be accurate, unambiguous, and test "
30
+ "real understanding, not trivia. For multiple choice, exactly one option is correct and the "
31
+ "distractors are plausible. Vary difficulty as requested. Return strict JSON for the "
32
+ "requested schema."
33
+ )
34
+
35
+ # Domain-neutral default. Callers with a specific emphasis (e.g. attack-scenario
36
+ # coverage for security training) pass their own ``coverage`` string.
37
+ DEFAULT_COVERAGE = (
38
+ "Cover the breadth of the material. Frame at least some questions as realistic applied "
39
+ "scenarios rather than abstract recall."
40
+ )
41
+
42
+
43
+ def _diff_counts(n: int, split: dict) -> dict:
44
+ """Split a batch of n into easy/medium/hard per ``split`` (sums to n)."""
45
+ easy = round(n * split.get("easy", 0))
46
+ hard = round(n * split.get("hard", 0))
47
+ medium = max(0, n - easy - hard)
48
+ return {"easy": easy, "medium": medium, "hard": hard}
49
+
50
+
51
def _valid_difficulty(d: str) -> str:
    """Normalize a difficulty label; anything not in DIFFICULTIES becomes "medium"."""
    cleaned = (d or "").strip().lower()
    if cleaned in DIFFICULTIES:
        return cleaned
    return "medium"
54
+
55
+
56
def _gen_batch(llm, schema, fmt_desc, material, n, existing_prompts, *,
               system, coverage, diff_split):
    """Ask the model for one batch of ``n`` questions and return them.

    The human message is the material followed by the request: how many
    questions of which format, the easy/medium/hard mix from ``_diff_counts``,
    an optional COVERAGE line, and the trailing (up to 40) entries of
    ``existing_prompts`` as a do-not-duplicate list. The reply is parsed
    through ``structured_output(llm, schema)`` and its ``questions`` attribute
    is returned.
    """
    mix = _diff_counts(n, diff_split)

    recent = list(existing_prompts)[-40:]
    avoid_block = "\n".join(f" - {text}" for text in recent)
    if not avoid_block:
        avoid_block = " (none yet)"

    if coverage:
        coverage_part = f"COVERAGE: {coverage}\n"
    else:
        coverage_part = ""

    pieces = [
        f"{material}\n\n",
        f"Write exactly {n} {fmt_desc}.\n",
        f"Difficulty mix: {mix['easy']} easy, {mix['medium']} medium, {mix['hard']} hard ",
        "(set each question's difficulty field accordingly).\n",
        coverage_part,
        "Ground every fact in the material above; do not invent details.\n",
        f"Do NOT duplicate or lightly reword any of these existing questions:\n{avoid_block}\n",
    ]
    human_message = "".join(pieces)

    runner = structured_output(llm, schema)
    reply = runner.invoke([("system", system), ("human", human_message)])
    return reply.questions
73
+
74
+
75
def _clean_text(value) -> str:
    """Return ``value`` stripped when it is a string, else ``""`` (None-safe)."""
    return value.strip() if isinstance(value, str) else ""


def _validate(qtype: str, q, seen_norms: set) -> Optional[dict]:
    """Coerce a generated question to a clean dict, or drop it (return None).

    A question is dropped when its prompt is blank or already in
    ``seen_norms`` (compared after ``normalize``), or when the format-specific
    fields are unusable:

    * ``mc``: fewer than 3 non-blank choices, an answer index that does not
      point at a non-blank choice, or a blank explanation.
    * ``fill_blank``: no non-blank accepted answer.
    * ``match``: not 2-6 complete pairs, or right-hand sides that are not
      distinct (case-insensitively).
    * anything else (short / freetext): blank model answer or empty rubric.

    Missing or ``None`` fields are treated as blank rather than raising, so
    one malformed item cannot abort a whole generation run. On success the
    prompt's normalized form is added to ``seen_norms``.
    """
    prompt = _clean_text(getattr(q, "prompt", ""))
    if not prompt:
        return None
    norm = normalize(prompt)
    if not norm or norm in seen_norms:
        return None

    out = {"type": qtype, "difficulty": _valid_difficulty(getattr(q, "difficulty", None)),
           "prompt": prompt}
    if qtype == "mc":
        raw_idx = getattr(q, "answer_idx", None)
        choices: list = []
        answer_idx = None
        for i, choice in enumerate(getattr(q, "choices", None) or []):
            cleaned = _clean_text(choice)
            if not cleaned:
                continue
            if i == raw_idx:
                # Blank choices before the answer shift its position: record
                # the index in the *filtered* list so it still points at the
                # same option.
                answer_idx = len(choices)
            choices.append(cleaned)
        explanation = _clean_text(getattr(q, "explanation", None))
        if len(choices) < 3 or answer_idx is None or not explanation:
            return None
        out.update(choices=choices, answer_idx=answer_idx, explanation=explanation)
    elif qtype == "fill_blank":
        accepted = [_clean_text(a) for a in (getattr(q, "accepted_answers", None) or [])]
        accepted = [a for a in accepted if a]
        if not accepted:
            return None
        out.update(accepted_answers=accepted,
                   explanation=_clean_text(getattr(q, "explanation", None)))
    elif qtype == "match":
        pairs = []
        for p in getattr(q, "pairs", None) or []:
            left = _clean_text(getattr(p, "left", None))
            right = _clean_text(getattr(p, "right", None))
            if left and right:
                pairs.append({"left": left, "right": right})
        rights = [p["right"].lower() for p in pairs]
        if not (2 <= len(pairs) <= 6) or len(set(rights)) != len(rights):  # distinct rights
            return None
        out.update(pairs=pairs, explanation=_clean_text(getattr(q, "explanation", None)))
    else:  # short / freetext
        rubric = [_clean_text(r) for r in (getattr(q, "rubric", None) or [])]
        rubric = [r for r in rubric if r]
        model_answer = _clean_text(getattr(q, "model_answer", None))
        if not model_answer or not rubric:
            return None
        out.update(model_answer=model_answer, rubric=rubric)

    seen_norms.add(norm)
    return out
110
+
111
+
112
def _next_id_start(existing: List[dict], prefix: str = "q") -> int:
    """Highest ``<prefix><N>`` id in the existing bank, plus one (1 if none).

    Ids that do not start with ``prefix`` or whose remainder is not a plain
    decimal number are ignored.
    """
    max_idx = 0
    for q in existing or []:
        qid = str(q.get("id", ""))
        suffix = qid[len(prefix):]
        # isdecimal (not isdigit): every character it accepts is parseable by
        # int(), whereas isdigit also accepts e.g. superscript digits.
        if qid.startswith(prefix) and suffix.isdecimal():
            max_idx = max(max_idx, int(suffix))
    return max_idx + 1


def generate_bank(material: str, llm, *, targets: Optional[dict] = None,
                  existing: Optional[List[dict]] = None, batch_size: int = DEFAULT_BATCH,
                  diff_split: Optional[dict] = None, system: str = DEFAULT_SYSTEM,
                  coverage: str = DEFAULT_COVERAGE, max_batches_per_format: int = 6,
                  id_prefix: str = "q") -> List[dict]:
    """Generate the shortfall needed to bring a bank up to ``targets``.

    Args:
        material: the source text to ground questions in.
        llm: any LangChain-style chat model (sync ``with_structured_output``).
        targets: desired TOTAL count per format (merged over ``DEFAULT_TARGETS``).
        existing: the current bank (counted by type; prompts are not reused).
        batch_size: questions requested per LLM call.
        diff_split: easy/medium/hard proportions per batch.
        system / coverage: prompt steering (override ``coverage`` for a domain).
        max_batches_per_format: cap on LLM calls per format, including calls
            that fail or whose items are rejected by validation.
        id_prefix: id scheme; ids continue after the highest existing ``<prefix><N>``.

    Returns:
        The NEW questions as dicts (with ``id``/``type``/``difficulty`` + fields).
        Does not mutate ``existing`` and does not include it in the result.
    """
    targets = {**DEFAULT_TARGETS, **(targets or {})}
    diff_split = diff_split or DEFAULT_DIFF_SPLIT
    existing = existing or []

    existing_by_type: dict = {}
    seen_norms: set = set()
    for q in existing:
        existing_by_type.setdefault(q.get("type", "mc"), []).append(q)
        seen_norms.add(normalize(q.get("prompt", "")))

    all_new: List[dict] = []
    for fmt, target in targets.items():
        if fmt not in FORMATS:
            logger.warning("Unknown format %r in targets — skipping", fmt)
            continue
        have = len(existing_by_type.get(fmt, []))
        need = max(0, target - have)
        if need == 0:
            continue
        schema, fmt_desc = FORMATS[fmt]
        # Ordered list (bank order, then generation order) so the "avoid"
        # window sent to the model really is the most recent prompts; entries
        # without a prompt are skipped instead of raising KeyError.
        prompts_seen = [q["prompt"] for q in existing_by_type.get(fmt, []) if q.get("prompt")]
        made = batches = 0
        while made < need and batches < max_batches_per_format:
            batches += 1
            batch_n = min(batch_size, need - made)
            try:
                raw = _gen_batch(llm, schema, fmt_desc, material, batch_n, prompts_seen,
                                 system=system, coverage=coverage, diff_split=diff_split)
            except Exception as exc:  # noqa: BLE001 — skip a failed batch, keep going
                logger.warning("%s batch failed (%s): %s", fmt, type(exc).__name__, str(exc)[:160])
                continue
            for q in raw or []:  # a reply with no question list counts as an empty batch
                v = _validate(fmt, q, seen_norms)
                if v:
                    all_new.append(v)
                    prompts_seen.append(v["prompt"])
                    made += 1
                    if made >= need:
                        break
        logger.info("%-11s have %2d / target %2d -> added %2d", fmt, have, target, made)

    # Assign sequential ids continuing after the existing bank (id first),
    # scanning for the SAME prefix that is being assigned so a custom
    # ``id_prefix`` does not restart at 1 on every top-up.
    start = _next_id_start(existing, id_prefix)
    all_new = [{"id": f"{id_prefix}{i}", **q} for i, q in enumerate(all_new, start=start)]

    if all_new:
        logger.info("Generated %d new questions: %s | difficulty %s", len(all_new),
                    dict(Counter(q["type"] for q in all_new)),
                    dict(Counter(q["difficulty"] for q in all_new)))
    return all_new
quizforge/grade.py ADDED
@@ -0,0 +1,119 @@
1
+ """Grading — deterministic where we can, LLM where we must.
2
+
3
+ Multiple-choice is graded by the caller (it's a trivial index compare).
4
+ Fill-in-the-blank and match-the-following are graded deterministically here
5
+ (no LLM, fast and free). Open-ended answers are scored 0..1 with feedback by an
6
+ injected chat model against the question's model answer + rubric.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Optional
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+ from .llm import structured_output
15
+ from .text import normalize
16
+
17
+ logger = logging.getLogger("quizforge")
18
+
19
+
20
def grade_fill_blank(question: dict, user_answer: str) -> dict:
    """Grade a fill-in-the-blank answer against ``accepted_answers``.

    All-or-nothing: the answer earns 1.0 when its normalized form is
    non-empty and equals the normalized form of any accepted answer,
    otherwise 0.0. The result carries ``score``, ``correct`` and the
    ``accepted`` list as stored on the question.
    """
    accepted = question.get("accepted_answers") or []
    given = normalize(user_answer)
    if given:
        correct = given in {normalize(option) for option in accepted}
    else:
        correct = False
    score = 1.0 if correct else 0.0
    return {"score": score, "correct": correct, "accepted": accepted}
29
+
30
+
31
def grade_match(question: dict, selections: dict) -> dict:
    """Deterministic grade for match-the-following with per-pair partial credit.

    Args:
        question: the match question; its ``pairs`` list holds
            ``{"left": ..., "right": ...}`` rows.
        selections: maps the left-item index to the right-side label chosen.
            Keys are normally the stringified index (``"0"``, ``"1"``, ...);
            plain ``int`` keys are accepted as a fallback. ``None`` is treated
            as "nothing selected".

    Returns:
        ``score`` (fraction of rows matched), ``correct_count``, ``total`` and
        a per-row breakdown in ``rows``. A row with no selection is never
        counted as correct.
    """
    pairs = question.get("pairs") or []
    total = len(pairs)
    if not total:
        return {"score": 0.0, "correct_count": 0, "total": 0, "rows": []}
    selections = selections or {}
    rows = []
    correct_count = 0
    for i, pair in enumerate(pairs):
        # Prefer the documented string key; fall back to an int key so callers
        # that did not round-trip through JSON are not silently scored 0.
        chosen = selections.get(str(i), selections.get(i)) or ""
        chosen_norm = normalize(chosen)
        # Require a non-empty answer, mirroring grade_fill_blank: a blank row
        # must not match a right-hand side that normalizes to nothing.
        is_ok = bool(chosen_norm) and chosen_norm == normalize(pair.get("right", ""))
        if is_ok:
            correct_count += 1
        rows.append({
            "left": pair.get("left", ""),
            "right": pair.get("right", ""),
            "chosen": chosen,
            "correct": is_ok,
        })
    return {"score": correct_count / total, "correct_count": correct_count,
            "total": total, "rows": rows}
56
+
57
+
58
class QuizGrade(BaseModel):
    """The model's read on an open-ended answer.

    Used as the structured-output schema for open-answer grading, so the
    field descriptions below double as instructions to the grading model.
    """

    score: float = Field(description="Credit awarded, 0.0 to 1.0. 1.0 = covers the key points "
                         "correctly; ~0.5 = partially correct or missing a key idea; 0.0 = wrong, "
                         "empty, or just restates the question.")
    verdict: str = Field(description="One of: 'correct', 'partial', 'incorrect'")
    feedback: str = Field(description="2-3 sentences, encouraging coach tone. Say what was right, "
                          "then the single most important thing to add or fix.")
    covered: List[str] = Field(description="Key points the answer got right (short phrases). Empty if none.")
    missed: List[str] = Field(description="Key points the answer missed or got wrong (short phrases). Empty if none.")
69
+
70
+
71
+ DEFAULT_GRADE_SYSTEM = (
72
+ "You are a training assessor grading a learner's open-ended quiz answer. Grade on "
73
+ "CONCEPTS, not phrasing or grammar — credit the learner when they demonstrate the right "
74
+ "understanding in their own words. Be fair but rigorous: award partial credit when an "
75
+ "answer is on the right track but misses a key point, and award 0 when it is wrong, empty, "
76
+ "evasive ('I don't know'), or merely restates the question. Do not be fooled by "
77
+ "confident-sounding but incorrect answers. Return strict JSON for the QuizGrade schema."
78
+ )
79
+
80
+
81
+ def _open_prompt(question: dict, user_answer: str) -> str:
82
+ parts = [
83
+ f"QUESTION:\n{question['prompt']}",
84
+ "",
85
+ f"MODEL ANSWER (what a strong response covers):\n"
86
+ f"{(question.get('model_answer') or '').strip() or '(none provided)'}",
87
+ ]
88
+ rubric = question.get("rubric") or []
89
+ if rubric:
90
+ parts += ["", "RUBRIC — key points to look for:"]
91
+ parts += [f" - {point}" for point in rubric]
92
+ parts += ["", f"LEARNER'S ANSWER:\n{(user_answer or '').strip()}",
93
+ "", "Grade the learner's answer. score is fractional 0..1."]
94
+ return "\n".join(parts)
95
+
96
+
97
def grade_open_answer(question: dict, user_answer: str, llm,
                      system: str = DEFAULT_GRADE_SYSTEM) -> Optional[QuizGrade]:
    """Score an open-ended answer 0..1 with feedback using the injected ``llm``.

    A blank answer is graded locally as 0.0 without calling the model.
    Otherwise the model is asked for a :class:`QuizGrade` and its score is
    clamped into ``[0.0, 1.0]``.

    Returns:
        A :class:`QuizGrade`, or ``None`` if automated grading was unavailable
        — the call failed, the reply carried no usable numeric score, or the
        score was NaN. Callers should exclude the question from the attempt's
        max score rather than penalize the learner for an outage.
    """
    import math  # local import: only needed for the NaN guard below

    if not (user_answer or "").strip():
        return QuizGrade(score=0.0, verdict="incorrect", feedback="No answer was provided.",
                         covered=[], missed=list(question.get("rubric") or []))
    try:
        chain = structured_output(llm, QuizGrade)
        grade = chain.invoke([
            ("system", system),
            ("human", _open_prompt(question, user_answer)),
        ])
        # Read the score inside the try: a reply that is None or lacks a
        # numeric ``score`` is a grading failure, not a reason to crash.
        score = float(grade.score)
    except Exception as exc:  # noqa: BLE001 — degrade gracefully, never crash the quiz
        logger.warning("Open-answer grading failed for %s: %s: %s",
                       question.get("id", "?"), type(exc).__name__, str(exc)[:200])
        return None
    if math.isnan(score):
        # min()/max() comparisons with NaN are all False, so clamping a NaN
        # would silently yield 1.0 (full credit); treat it as unavailable.
        logger.warning("Open-answer grading returned a NaN score for %s",
                       question.get("id", "?"))
        return None
    grade.score = max(0.0, min(1.0, score))
    return grade
quizforge/integrity.py ADDED
@@ -0,0 +1,123 @@
1
+ """Integrity signals — flag attempts that are too fast to be genuine.
2
+
3
+ A deep, unseen-first bank (see :mod:`quizforge.sample`) already makes
4
+ answer-sharing hard. This adds a cheap, deterministic backstop: an attempt that
5
+ *passed* in less time than it physically takes to read and answer the questions
6
+ is suspicious — it points at a leaked answer key, a shared screenshot, or
7
+ automation, not knowledge.
8
+
9
+ This is intentionally a dumb arithmetic heuristic, not an LLM call: "is 40s
10
+ plausible for 20 questions including two written answers" is a timing question,
11
+ and an LLM adds nothing to it. The per-format floors and thresholds are tunable
12
+ so a caller can dial sensitivity to their own population.
13
+ """
14
+
15
+ from typing import Iterable, List, Optional
16
+
17
+ from pydantic import BaseModel, Field
18
+
19
# Per-format floor, in seconds, for genuinely reading and answering ONE
# question. These are "realistic" minimums rather than physical limits: below
# them, real engagement is implausible. Written formats get the largest floors
# because composing a passing answer takes time. The values err on the generous
# side so that honest fast learners are rarely flagged.
DEFAULT_MIN_SECONDS_PER_TYPE = dict(
    mc=5.0,
    fill_blank=7.0,
    match=10.0,
    short=20.0,
    freetext=40.0,
)
# Floor applied to any format missing from the table (unknown / custom).
DEFAULT_FALLBACK_SECONDS = 6.0
32
+
33
# Wall-clock budget for one timed attempt: 30 minutes unless the caller says
# otherwise. Where :func:`assess_speed` looks for attempts that are too *fast*,
# a time limit guards the other end — too *slow*. Enforcing it (for instance a
# countdown that auto-submits) belongs to the caller's UI; this module only
# holds the policy value and the check.
DEFAULT_TIME_LIMIT_SECONDS = 30 * 60


def within_time_limit(elapsed_seconds: int,
                      limit_seconds: int = DEFAULT_TIME_LIMIT_SECONDS) -> bool:
    """Report whether an attempt finished inside its time budget.

    An attempt with no recorded timing (``elapsed_seconds <= 0``, e.g. legacy
    rows or a disabled timer) counts as within limit, because missing timing is
    not a violation. A non-positive ``limit_seconds`` means "no limit" and
    always yields True.
    """
    if elapsed_seconds <= 0:
        return True
    if limit_seconds <= 0:
        return True
    return elapsed_seconds <= limit_seconds
51
+
52
+
53
class IntegrityFlag(BaseModel):
    """The verdict on a single attempt's timing."""

    # True only when assess_speed settled on a severity other than 'none'.
    suspicious: bool = Field(description="True if the pass was implausibly fast.")
    # Chosen by comparing speed_ratio against the caller's high/low thresholds.
    severity: str = Field(description="One of: 'none', 'low', 'high'.")
    # assess_speed stores this after truncating with int().
    elapsed_seconds: int = Field(description="Seconds the learner actually took.")
    # Result of expected_min_seconds, rounded to one decimal by assess_speed.
    expected_seconds: float = Field(description="Realistic minimum for these questions.")
    # Rounded to two decimals; assess_speed reports 0.0 when no timing was recorded.
    speed_ratio: float = Field(description="elapsed / expected; <1 is faster than realistic.")
    # Left empty by assess_speed for a passing attempt that was not flagged.
    reasons: List[str] = Field(default_factory=list, description="Human-readable explanation.")
62
+
63
+
64
def expected_min_seconds(question_types: Iterable[str], per_type: Optional[dict] = None,
                         fallback: float = DEFAULT_FALLBACK_SECONDS) -> float:
    """Add up the realistic-minimum seconds for every question in an attempt.

    Each entry of ``question_types`` is looked up in the default per-format
    table, with ``per_type`` entries taking precedence; formats found in
    neither contribute ``fallback`` seconds.
    """
    floors = dict(DEFAULT_MIN_SECONDS_PER_TYPE)
    if per_type:
        floors.update(per_type)

    total = 0
    for qtype in question_types:
        total += floors.get(qtype, fallback)
    return total
69
+
70
+
71
def assess_speed(*, elapsed_seconds: int, question_types: Iterable[str], passed: bool,
                 per_type: Optional[dict] = None, fallback: float = DEFAULT_FALLBACK_SECONDS,
                 low_ratio: float = 0.5, high_ratio: float = 0.25) -> IntegrityFlag:
    """Flag a *passing* attempt that finished implausibly fast.

    Failed attempts are never flagged, because a quick fail is someone giving
    up, not cheating. Attempts without recorded timing
    (``elapsed_seconds <= 0``, e.g. legacy rows) are never flagged either:
    a missing signal is not evidence.

    Args:
        elapsed_seconds: wall-clock seconds the attempt took.
        question_types: the format of each question in the attempt (e.g.
            ``["mc", "mc", "freetext", ...]``).
        passed: whether the attempt passed (only passes are flagged).
        per_type: override realistic-minimum seconds per format.
        fallback: seconds assumed for a format missing from the table.
        low_ratio / high_ratio: speed_ratio thresholds for 'low' / 'high'
            severity (lower ratio = faster = more suspicious).

    Returns:
        An :class:`IntegrityFlag`. ``severity`` is 'high' when
        ``speed_ratio < high_ratio``, 'low' when ``< low_ratio``, else 'none'.
    """
    formats = list(question_types)
    baseline = expected_min_seconds(formats, per_type, fallback)

    def _not_flagged(ratio_value: float, explanation: str) -> IntegrityFlag:
        # Shared shape of the two "timing not assessed" verdicts.
        return IntegrityFlag(
            suspicious=False,
            severity="none",
            elapsed_seconds=int(elapsed_seconds),
            expected_seconds=round(baseline, 1),
            speed_ratio=ratio_value,
            reasons=[explanation],
        )

    if elapsed_seconds <= 0:
        return _not_flagged(0.0, "No timing was recorded for this attempt.")
    if not passed:
        observed = round(elapsed_seconds / baseline, 2) if baseline else 0.0
        return _not_flagged(observed, "Attempt did not pass — timing not assessed.")

    # With nothing to compare against, fall back to a neutral ratio of 1.0.
    if baseline > 0:
        ratio = elapsed_seconds / baseline
    else:
        ratio = 1.0

    # Check the stricter threshold first; the first one undercut wins.
    severity = "none"
    for label, threshold in (("high", high_ratio), ("low", low_ratio)):
        if ratio < threshold:
            severity = label
            break

    flagged = severity != "none"
    explanations: List[str] = []
    if flagged:
        explanations.append(
            f"Passed in {int(elapsed_seconds)}s — {round(ratio * 100)}% of the "
            f"~{round(baseline)}s it realistically takes to read and answer "
            f"{len(formats)} questions."
        )
    return IntegrityFlag(
        suspicious=flagged,
        severity=severity,
        elapsed_seconds=int(elapsed_seconds),
        expected_seconds=round(baseline, 1),
        speed_ratio=round(ratio, 2),
        reasons=explanations,
    )
quizforge/llm.py ADDED
@@ -0,0 +1,42 @@
1
+ """Provider-neutral structured-output helper.
2
+
3
+ quizforge never imports a specific LLM SDK. You pass in any object that quacks
4
+ like a LangChain chat model — i.e. it exposes ``with_structured_output(schema)``
5
+ returning something with an ``.invoke(messages)`` method, where ``messages`` is a
6
+ list of ``(role, content)`` tuples. That covers ``langchain-openai``,
7
+ ``langchain-anthropic``, community wrappers, and your own shim.
8
+ """
9
+
10
+ import logging
11
+ from typing import Any
12
+
13
+ logger = logging.getLogger("quizforge")
14
+
15
+
16
+ class _RetryingStructuredChain:
17
+ """Binds a schema to the model and retries invoke on transient failures."""
18
+
19
+ def __init__(self, llm: Any, schema: type, max_retries: int):
20
+ self._bound = llm.with_structured_output(schema)
21
+ self._max_retries = max_retries
22
+
23
+ def invoke(self, messages):
24
+ last_exc = None
25
+ for attempt in range(self._max_retries + 1):
26
+ try:
27
+ return self._bound.invoke(messages)
28
+ except Exception as exc: # noqa: BLE001 — surface the last error after retries
29
+ last_exc = exc
30
+ if attempt < self._max_retries:
31
+ logger.warning("structured_output attempt %d failed (%s); retrying",
32
+ attempt + 1, type(exc).__name__)
33
+ raise last_exc
34
+
35
+
36
def structured_output(llm: Any, schema: type, max_retries: int = 1):
    """Wrap ``llm`` in a chain whose ``.invoke(messages)`` uses ``schema``.

    The schema is bound through the model's own ``with_structured_output``.
    A failing ``invoke`` is retried ``max_retries`` times (once by default), so
    a single flaky response doesn't fail the whole call.
    """
    chain = _RetryingStructuredChain(llm=llm, schema=schema, max_retries=max_retries)
    return chain
quizforge/sample.py ADDED
@@ -0,0 +1,73 @@
1
+ """Sampling — draw a fresh, mixed-format test from a deep bank.
2
+
3
+ A deep bank (far more questions than any one test shows) plus unseen-first
4
+ sampling is what makes two learners rarely see the same test, which defeats
5
+ answer-sharing. No LLM here — pure, deterministic given an rng.
6
+ """
7
+
8
+ import random
9
+ from collections import defaultdict
10
+ from typing import Iterable, List, Optional
11
+
12
# How many of each format a single test draws. Sums to 20 by default.
DEFAULT_BLUEPRINT = {"mc": 8, "fill_blank": 4, "match": 2, "short": 4, "freetext": 2}
DIFFICULTY_ORDER = ("easy", "medium", "hard")


def _spread_by_difficulty(questions: List[dict], limit: int) -> List[dict]:
    """Take up to ``limit`` questions, cycling easy → medium → hard."""
    buckets: dict = defaultdict(list)
    for q in questions:
        buckets[q.get("difficulty", "medium")].append(q)

    target = min(limit, len(questions))
    picked: List[dict] = []
    while len(picked) < target:
        progressed = False
        for level in DIFFICULTY_ORDER:
            if buckets[level]:
                picked.append(buckets[level].pop(0))
                progressed = True
                if len(picked) >= target:
                    break
        if not progressed:  # difficulties outside the canonical order — drain them
            leftovers = [q for qs in buckets.values() for q in qs]
            picked.extend(leftovers[: target - len(picked)])
            break
    return picked


def pick_spread(pool: List[dict], want: int, seen_ids: Iterable[str],
                rng: Optional[random.Random] = None) -> List[dict]:
    """Pick ``want`` questions from one format's pool, unseen-first + difficulty-spread.

    Unseen questions are exhausted before any seen one is reused. Within each
    of those two groups the draw round-robins easy/medium/hard, so it doesn't
    come back all-hard or all-easy. Questions whose difficulty is outside
    ``DIFFICULTY_ORDER`` are taken only once the canonical levels of their
    group run dry.

    Args:
        pool: candidate questions of a single format.
        want: how many to draw; fewer are returned if the pool is smaller.
        seen_ids: ids the learner has already been shown.
        rng: source of randomness; the ``random`` module when omitted.

    Returns:
        Up to ``want`` questions, unseen ones first.
    """
    rng = random if rng is None else rng
    if want <= 0 or not pool:
        return []
    seen = set(seen_ids)
    unseen = [q for q in pool if q.get("id") not in seen]
    reused = [q for q in pool if q.get("id") in seen]
    rng.shuffle(unseen)
    rng.shuffle(reused)

    # Spread each group separately. Bucketing the two groups together lets a
    # seen "hard" question be drawn while unseen "easy" ones are still
    # available, which breaks the unseen-first guarantee.
    picked = _spread_by_difficulty(unseen, want)
    if len(picked) < want:
        picked.extend(_spread_by_difficulty(reused, want - len(picked)))
    return picked
53
+
54
+
55
def sample_test(questions: List[dict], blueprint: Optional[dict] = None,
                seen_ids: Iterable[str] = (), rng: Optional[random.Random] = None) -> List[dict]:
    """Draw a mixed-format test per ``blueprint``, unseen-first within each format.

    Falls short of the blueprint only when the bank lacks enough of a format; the
    test still assembles with whatever it can. Final order is shuffled so formats
    interleave instead of arriving grouped by type.

    Args:
        questions: the full bank; a question without a ``type`` counts as ``mc``.
        blueprint: mapping of format -> how many to draw; ``DEFAULT_BLUEPRINT``
            when omitted or empty.
        seen_ids: ids the learner has already been shown (any iterable,
            including a one-shot iterator).
        rng: source of randomness; the ``random`` module when omitted.

    Returns:
        The drawn questions in shuffled order.
    """
    rng = rng or random
    blueprint = blueprint or DEFAULT_BLUEPRINT
    # Materialise once: ``seen_ids`` is consulted for every format, and a
    # one-shot iterator would be empty after the first format's draw.
    seen = frozenset(seen_ids)

    by_type: dict = defaultdict(list)
    for q in questions:
        by_type[q.get("type", "mc")].append(q)

    picked: List[dict] = []
    for qtype, want in blueprint.items():
        picked.extend(pick_spread(by_type.get(qtype, []), want, seen, rng=rng))
    rng.shuffle(picked)
    return picked
quizforge/schemas.py ADDED
@@ -0,0 +1,85 @@
1
+ """Generation schemas — the per-format shapes the LLM must return.
2
+
3
+ These describe what a freshly *generated* question looks like coming back from
4
+ the model. After validation, questions are stored as plain dicts (YAML-friendly,
5
+ template-friendly) with an added ``id`` and ``type``; see ``generate`` and the
6
+ format-field reference in the README.
7
+ """
8
+
9
+ from typing import List
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ # The five question formats and the difficulty levels the kernel understands.
14
+ FORMAT_KEYS = ("mc", "fill_blank", "match", "short", "freetext")
15
+ DIFFICULTIES = ("easy", "medium", "hard")
16
+
17
+
18
class GenMC(BaseModel):
    """A generated multiple-choice question."""

    # Shape of one multiple-choice item; MCBatch wraps a list of these.
    # The class docstring and Field descriptions are part of the model's schema
    # metadata, so treat them as runtime text rather than as comments.
    prompt: str
    choices: List[str] = Field(description="3-4 options; exactly one correct, others plausible-but-wrong")
    # Index into ``choices``; range is not validated here.
    answer_idx: int = Field(description="0-based index of the correct choice")
    explanation: str = Field(description="Why the answer is right — a teaching sentence")
    # Free-form string here; the description lists the expected values.
    difficulty: str = Field(description="easy | medium | hard")
26
+
27
+
28
class GenFill(BaseModel):
    """A generated fill-in-the-blank question."""

    # Shape of one fill-in-the-blank item; FillBatch wraps a list of these.
    prompt: str = Field(description="A sentence with a ______ blank where the answer belongs")
    # All answer variants that should count as correct.
    accepted_answers: List[str] = Field(description="Every acceptable spelling/synonym/abbreviation of the answer")
    explanation: str
    # Plain string; no validation against DIFFICULTIES happens in this class.
    difficulty: str
35
+
36
+
37
class GenPair(BaseModel):
    # One left/right pairing; GenMatch holds a list of these in ``pairs``.
    left: str
    right: str
40
+
41
+
42
class GenMatch(BaseModel):
    """A generated match-the-following question."""

    # Shape of one matching item; MatchBatch wraps a list of these.
    prompt: str
    # The pair count and uniqueness are requested via the description only;
    # nothing in this class enforces them.
    pairs: List[GenPair] = Field(description="3-4 unambiguous one-to-one pairs; distinct right-side labels")
    explanation: str
    # Plain string; no validation against DIFFICULTIES happens in this class.
    difficulty: str
49
+
50
+
51
class GenOpen(BaseModel):
    """A generated open-ended (short / free-response) question."""

    # Shared by the ``short`` and ``freetext`` formats; OpenBatch wraps a list
    # of these.
    prompt: str
    model_answer: str = Field(description="A strong, concise reference answer")
    # Key points for grading; the count is requested via the description only.
    rubric: List[str] = Field(description="3-5 key points a correct answer must cover")
    # Plain string; no validation against DIFFICULTIES happens in this class.
    difficulty: str
58
+
59
+
60
class MCBatch(BaseModel):
    # Envelope for a batch of multiple-choice questions (the "mc" entry in FORMATS).
    questions: List[GenMC]
62
+
63
+
64
class FillBatch(BaseModel):
    # Envelope for a batch of fill-in-the-blank questions (the "fill_blank" entry in FORMATS).
    questions: List[GenFill]
66
+
67
+
68
class MatchBatch(BaseModel):
    # Envelope for a batch of match-the-following questions (the "match" entry in FORMATS).
    questions: List[GenMatch]
70
+
71
+
72
class OpenBatch(BaseModel):
    # Envelope for a batch of open-ended questions; used by both the "short"
    # and "freetext" entries in FORMATS.
    questions: List[GenOpen]
74
+
75
+
76
# Format key -> (batch schema the model fills, human-readable description that
# gets spliced into the generation prompt). ``short`` and ``freetext`` use the
# same open schema and differ only in framing: one-liners versus
# walk-me-through scenarios.
FORMATS = dict(
    mc=(
        MCBatch,
        "multiple-choice questions with one best answer and 3 plausible distractors",
    ),
    fill_blank=(
        FillBatch,
        "fill-in-the-blank questions (a sentence with a ______ blank)",
    ),
    match=(
        MatchBatch,
        "match-the-following questions with 3-4 unambiguous pairs",
    ),
    short=(
        OpenBatch,
        "short-answer questions answerable in 1-2 sentences",
    ),
    freetext=(
        OpenBatch,
        "scenario / free-response questions ('walk me through how you'd…')",
    ),
)
quizforge/text.py ADDED
@@ -0,0 +1,11 @@
1
+ """Shared text normalization used by the deterministic graders and the
2
+ generator's dedup pass."""
3
+
4
+
5
def normalize(text: str) -> str:
    """Casefold + collapse whitespace + strip surrounding punctuation.

    So 'ICA.', ' ica ', and 'ICA' all compare equal without an LLM call.
    ``None`` or an empty string normalizes to ``""``. Punctuation is removed
    only from the two ends, never from the middle of the text.
    """
    # casefold(), not lower(): it also equates caseless variants such as
    # 'Straße' and 'STRASSE', which lower() leaves different.
    cleaned = " ".join((text or "").casefold().split())
    return cleaned.strip(" \t\n.,;:!?\"'`()[]")
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: quizforge
3
+ Version: 0.2.0
4
+ Summary: Generate a deep, mixed-format question bank from source material and grade it — deterministic where it can, LLM where it must. Bring your own chat model.
5
+ Project-URL: Homepage, https://github.com/vinayvobbili/quizforge
6
+ Project-URL: Source, https://github.com/vinayvobbili/quizforge
7
+ Author: Vinay Vobbilichetty
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: assessment,education,grading,llm,question-bank,quiz,training
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Education :: Testing
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: pydantic>=2
18
+ Requires-Dist: pyyaml>=6
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=7; extra == 'dev'
21
+ Provides-Extra: openai
22
+ Requires-Dist: langchain-openai>=0.1; extra == 'openai'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # quizforge
26
+
27
+ Generate a deep, **mixed-format** question bank from any source material, then grade it — **deterministic where it can, LLM where it must**. Bring your own chat model.
28
+
29
+ quizforge is the engine behind a training/readiness feature: it drafts far more questions than any single test shows (multiple choice, fill-in-the-blank, match-the-following, short answer, and free-response scenarios), samples a fresh shuffled test on each attempt — so two learners rarely see the same one — and grades every format. MC/fill/match are graded instantly with no model call; open-ended answers are scored 0–1 with coaching feedback by an LLM you provide.
30
+
31
+ - **Model-agnostic** — pass any LangChain-style chat model (`with_structured_output`). No SDK is bundled.
32
+ - **Deep bank, anti-sharing sampling** — unseen-first, difficulty-spread draws per a configurable blueprint.
33
+ - **Cheap grading** — only open-ended answers cost a model call; everything else is local and free.
34
+ - **Plain dicts in, plain dicts out** — YAML/JSON-friendly, easy to store and template.
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ pip install quizforge
40
+ ```
41
+
42
+ Bring a chat model from whichever provider you use, e.g.:
43
+
44
+ ```bash
45
+ pip install langchain-openai # or langchain-anthropic, etc.
46
+ ```
47
+
48
+ ## Quickstart
49
+
50
+ ### Generate a bank
51
+
52
+ ```python
53
+ from quizforge import generate_bank
54
+ from langchain_openai import ChatOpenAI
55
+
56
+ llm = ChatOpenAI(model="gpt-4.1", temperature=0.4)
57
+
58
+ material = open("citrix_lesson.md").read()
59
+ new_questions = generate_bank(
60
+ material, llm,
61
+ targets={"mc": 40, "fill_blank": 20, "match": 12, "short": 16, "freetext": 12},
62
+ existing=[], # pass your current bank to top it up
63
+ coverage="At least half should be applied incident-response scenarios.",
64
+ )
65
+ # -> list of dicts with id/type/difficulty + per-format fields. Store as you like.
66
+ ```
67
+
68
+ `generate_bank` only produces the *shortfall* to reach `targets`, validates each
69
+ question, and never duplicates an existing prompt — safe to re-run to grow a bank.
70
+
71
+ ### Sample a test
72
+
73
+ ```python
74
+ from quizforge import sample_test, DEFAULT_BLUEPRINT
75
+
76
+ test = sample_test(bank, blueprint=DEFAULT_BLUEPRINT, seen_ids=already_seen)
77
+ # DEFAULT_BLUEPRINT draws 8 mc / 4 fill_blank / 2 match / 4 short / 2 freetext = 20 questions, then shuffles them.
78
+ ```
79
+
80
+ ### Grade
81
+
82
+ ```python
83
+ from quizforge import grade_fill_blank, grade_match, grade_open_answer
84
+
85
+ grade_fill_blank(q, "ICA") # {"score": 1.0, "correct": True, ...}
86
+ grade_match(q, {"0": "RDP", "1": "ICA"}) # per-pair partial credit
87
+ grade_open_answer(q, learner_text, llm) # QuizGrade(score, verdict, feedback, ...) or None
88
+ ```
89
+
90
+ `grade_open_answer` returns `None` if the model was unavailable — exclude that
91
+ question from the attempt's max score rather than penalizing the learner.
92
+
93
+ ## Question shapes
94
+
95
+ Each question is a dict with `id`, `type`, `difficulty`, `prompt`, plus:
96
+
97
+ - `mc` — `choices: [str]`, `answer_idx: int`, `explanation: str`
98
+ - `fill_blank` — `accepted_answers: [str]`, `explanation: str`
99
+ - `match` — `pairs: [{left, right}]`, `explanation: str`
100
+ - `short` / `freetext` — `model_answer: str`, `rubric: [str]`
101
+
102
+ ## License
103
+
104
+ MIT
@@ -0,0 +1,16 @@
1
+ quizforge/__init__.py,sha256=fgrMRAFDYItRLsSydsFZE2grIpyHnLf5ZfCCkoIGAGU,2666
2
+ quizforge/bank.py,sha256=8IpsET4x4Ulodxam_sGv4Ot65ECLwGsP2VqzAFQs-LM,3612
3
+ quizforge/certificate.py,sha256=iOOtXQGpACd1Us76XIXSBST-Z19FHOIWmYMJSCH-iUA,4247
4
+ quizforge/cli.py,sha256=NgC3ciNc8pdD0PdwwoKBoXoAEujbv8ZNdIXiYDiAW8o,5420
5
+ quizforge/generate.py,sha256=UCqEVvbwRFhrXqveXW21H3_qvUUfzDC4WY5txckXEFo,8638
6
+ quizforge/grade.py,sha256=hZ90Nnuqf1u7WPWJG9fHS9d-kLhzIkP5GxSLcIPSpos,5221
7
+ quizforge/integrity.py,sha256=1_DbL9W8jJkKE7ncRs-ddxrsSllaiLVModyXjwmSQQM,5928
8
+ quizforge/llm.py,sha256=Ycd6RT2yCkZy_4GvUsNJKWRrWwDUooBrKuoJ1Dl1Hxg,1684
9
+ quizforge/sample.py,sha256=2qTaGM2mjVO6yzKRstXZrO59ercWuakD5jbet06lKTk,2888
10
+ quizforge/schemas.py,sha256=4zkq9bl-AmiszF3i0r9iRA7N77Gz_a64j53XOP5buJA,2917
11
+ quizforge/text.py,sha256=jzFeT9DleOUhmRZpCFQIoZNIzqxSI1wc6fBJvl6VBNU,387
12
+ quizforge-0.2.0.dist-info/METADATA,sha256=NohUiaIHbU0Bzkv47Bj55X2iBys69ZdnQNbFDBpA1F4,4096
13
+ quizforge-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
14
+ quizforge-0.2.0.dist-info/entry_points.txt,sha256=HL6bvElX7W1gotbUwLyKBVtMkDi_tJlkQXvVzbaX6-4,49
15
+ quizforge-0.2.0.dist-info/licenses/LICENSE,sha256=Q7CXTchzC9hqR2Dr-9cRh3bM2kXTgXGwk-dO0rGvQsE,1076
16
+ quizforge-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ quizforge = quizforge.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vinay Vobbilichetty
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.