html2mcq 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
html2mcq/generator.py ADDED
@@ -0,0 +1,581 @@
1
+ """
2
+ MCQGenerator: The main public API for html2mcq.
3
+ Ties together ContentExtractor + AI backend to produce MCQSets.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import os
9
+ import re
10
+ import time
11
+ from pathlib import Path
12
+ from typing import List, Optional, Tuple, Union
13
+
14
+ from .extractor import ContentExtractor
15
+ from .models import ContentBlock, MCQQuestion, MCQSet
16
+ from .prompts import build_system_prompt, build_user_prompt
17
+ from .video import VideoTranscriptExtractor, extract_video_id, is_youtube_url
18
+ from .pdf import PDFExtractor
19
+
20
+
21
+ # ── AI backend registry ───────────────────────────────────────────────────────
22
+
23
+ class _AnthropicBackend:
24
+ """Uses the official anthropic SDK."""
25
+
26
+ DEFAULT_MODEL = "claude-opus-4-6"
27
+
28
+ def __init__(self, api_key: str, model: str):
29
+ try:
30
+ import anthropic
31
+ except ImportError:
32
+ raise ImportError("pip install anthropic")
33
+ self.client = anthropic.Anthropic(api_key=api_key)
34
+ self.model = model or self.DEFAULT_MODEL
35
+
36
+ def complete(self, system: str, user: str, max_tokens: int) -> str:
37
+ msg = self.client.messages.create(
38
+ model=self.model,
39
+ max_tokens=max_tokens,
40
+ system=system,
41
+ messages=[{"role": "user", "content": user}],
42
+ )
43
+ return msg.content[0].text
44
+
45
+
46
+ class _OpenAIBackend:
47
+ """Uses the official openai SDK."""
48
+
49
+ DEFAULT_MODEL = "gpt-4o"
50
+
51
+ def __init__(self, api_key: str, model: str):
52
+ try:
53
+ import openai
54
+ except ImportError:
55
+ raise ImportError("pip install openai")
56
+ self.client = openai.OpenAI(api_key=api_key)
57
+ self.model = model or self.DEFAULT_MODEL
58
+
59
+ def complete(self, system: str, user: str, max_tokens: int) -> str:
60
+ resp = self.client.chat.completions.create(
61
+ model=self.model,
62
+ max_tokens=max_tokens,
63
+ messages=[
64
+ {"role": "system", "content": system},
65
+ {"role": "user", "content": user},
66
+ ],
67
+ )
68
+ return resp.choices[0].message.content
69
+
70
+
71
+ class _OpenRouterBackend:
72
+ """
73
+ Uses OpenRouter (https://openrouter.ai) — drop-in for any model
74
+ including Llama, Mistral, Gemini via the OpenAI-compatible API.
75
+ """
76
+
77
+ DEFAULT_MODEL = "meta-llama/llama-3.3-70b-instruct"
78
+
79
+ def __init__(self, api_key: str, model: str, site_url: str = "", site_name: str = "html2mcq"):
80
+ try:
81
+ import openai
82
+ except ImportError:
83
+ raise ImportError("pip install openai")
84
+ self.client = openai.OpenAI(
85
+ api_key=api_key,
86
+ base_url="https://openrouter.ai/api/v1",
87
+ default_headers={
88
+ "HTTP-Referer": site_url,
89
+ "X-Title": site_name,
90
+ },
91
+ )
92
+ self.model = model or self.DEFAULT_MODEL
93
+
94
+ def complete(self, system: str, user: str, max_tokens: int) -> str:
95
+ resp = self.client.chat.completions.create(
96
+ model=self.model,
97
+ max_tokens=max_tokens,
98
+ messages=[
99
+ {"role": "system", "content": system},
100
+ {"role": "user", "content": user},
101
+ ],
102
+ )
103
+ return resp.choices[0].message.content
104
+
105
+
106
+ def _make_backend(provider: str, api_key: str, model: str, **kwargs):
107
+ provider = provider.lower()
108
+ if provider == "anthropic":
109
+ return _AnthropicBackend(api_key, model)
110
+ if provider == "openai":
111
+ return _OpenAIBackend(api_key, model)
112
+ if provider == "openrouter":
113
+ return _OpenRouterBackend(api_key, model, **kwargs)
114
+ raise ValueError(f"Unknown provider '{provider}'. Choose: anthropic | openai | openrouter")
115
+
116
+
117
+ # ── MCQGenerator ──────────────────────────────────────────────────────────────
118
+
119
class MCQGenerator:
    """
    Generate N MCQ questions from an HTML page, a YouTube video, or a PDF.

    Quick start
    -----------
    >>> from html2mcq import MCQGenerator
    >>> gen = MCQGenerator(api_key="sk-ant-...", provider="anthropic")
    >>> mcq_set = gen.from_url("https://docs.python.org/3/tutorial/", n=10)
    >>> print(mcq_set.to_pretty_str())

    Parameters
    ----------
    api_key : str
        Your AI provider API key. Falls back to environment variables:
        ANTHROPIC_API_KEY, OPENAI_API_KEY, OPENROUTER_API_KEY.
    provider : str
        "anthropic" (default) | "openai" | "openrouter"
    model : str
        Override the default model for the provider.
    batch_size : int
        Number of questions to request per API call (default 10, minimum 1).
        Large `n` values are split into batches to stay within token limits.
    max_tokens : int
        Max tokens for each API response (default 4096).
    extractor_kwargs : dict
        Keyword args forwarded to ContentExtractor.
    transcript_languages : list[str]
        Forwarded to VideoTranscriptExtractor as ``languages`` (default ["en"]).
    transcript_chunk_size : int
        Forwarded to VideoTranscriptExtractor as ``chunk_size``.
    pdf_backend, docling_api_url, docling_ocr, pdf_chunk_size
        Forwarded to PDFExtractor as ``backend``, ``docling_api_url``,
        ``docling_ocr`` and ``chunk_size``.
    custom_instructions : str
        Instance-level instructions prepended to any per-call instructions.
    **backend_kwargs
        Extra args forwarded to the backend (e.g. site_url for OpenRouter).

    Raises
    ------
    ValueError
        If the provider is unknown or no API key can be found.
    """

    ENV_KEYS = {
        "anthropic": "ANTHROPIC_API_KEY",
        "openai": "OPENAI_API_KEY",
        "openrouter": "OPENROUTER_API_KEY",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        provider: str = "anthropic",
        model: str = "",
        batch_size: int = 10,
        max_tokens: int = 4096,
        extractor_kwargs: Optional[dict] = None,
        transcript_languages: Optional[List[str]] = None,
        transcript_chunk_size: int = 800,
        pdf_backend: str = "pymupdf",
        docling_api_url: str = "",
        docling_ocr: bool = True,
        pdf_chunk_size: int = 1500,
        custom_instructions: Optional[str] = None,
        **backend_kwargs,
    ):
        self.provider = provider.lower()
        env_var = self.ENV_KEYS.get(self.provider)
        if env_var is None:
            # Report the real problem first: without this check an unknown
            # provider with no key surfaces as a misleading "No API key" error.
            raise ValueError(
                f"Unknown provider '{self.provider}'. Choose: {' | '.join(self.ENV_KEYS)}"
            )
        _key = api_key or os.environ.get(env_var, "")
        if not _key:
            raise ValueError(
                f"No API key supplied. Pass api_key= or set {env_var} env var."
            )
        self.backend = _make_backend(self.provider, _key, model, **backend_kwargs)
        self.batch_size = max(1, batch_size)
        self.max_tokens = max_tokens
        self.extractor = ContentExtractor(**(extractor_kwargs or {}))
        self.transcript_extractor = VideoTranscriptExtractor(
            languages=transcript_languages or ["en"],
            chunk_size=transcript_chunk_size,
        )
        self.pdf_extractor = PDFExtractor(
            backend=pdf_backend,
            docling_api_url=docling_api_url,
            docling_ocr=docling_ocr,
            chunk_size=pdf_chunk_size,
        )
        self.custom_instructions = custom_instructions or ""

    # ── Public API ────────────────────────────────────────────────────────────

    def from_html(
        self,
        html: str,
        n: int = 10,
        base_url: str = "",
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        enrich_videos: bool = True,
        enrich_pdfs: bool = True,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Generate *n* MCQs from raw HTML.

        Parameters
        ----------
        html : str
            Raw HTML content.
        n : int
            Number of questions.
        base_url : str
            Used to resolve relative links inside the HTML; also recorded as
            the set's source URL when non-empty.
        difficulty_mix : str, optional
            E.g. "30% easy, 40% medium, 30% hard".
        focus_topics : list[str], optional
            Topics to focus on.
        enrich_videos : bool
            Auto-fetch YouTube transcripts found in page (default True).
        enrich_pdfs : bool
            Auto-download and extract PDF links found in page (default True).
        custom_instructions : str, optional
            Per-call instructions appended after the instance-level ones.
        """
        title, blocks = self.extractor.from_html(html, base_url=base_url)
        blocks = self._enrich(blocks, enrich_videos, enrich_pdfs)
        return self._generate(
            blocks=blocks,
            n=n,
            page_title=title,
            source_url=base_url or None,
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    def from_blocks(
        self,
        blocks: List[ContentBlock],
        n: int = 10,
        page_title: str = "Custom Content",
        source_url: Optional[str] = None,
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Generate MCQs from pre-extracted ContentBlocks.
        Useful when you've already parsed the page yourself; no enrichment
        (video transcripts / PDFs) is applied to the blocks.
        """
        return self._generate(
            blocks=blocks,
            n=n,
            page_title=page_title,
            source_url=source_url,
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    def from_video_url(
        self,
        url: str,
        n: int = 10,
        video_title: str = "",
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Generate MCQs directly from a YouTube video URL.
        Fetches the transcript automatically — no API key needed.

        Parameters
        ----------
        url : str
            YouTube video URL (any format: watch, youtu.be, embed, shorts).
        n : int
            Number of questions to generate.
        video_title : str, optional
            Title of the video. If empty, ``"Video: <url>"`` is used.
        difficulty_mix : str, optional
            E.g. "30% easy, 40% medium, 30% hard".
        focus_topics : list[str], optional
            Topics to focus on.
        custom_instructions : str, optional
            Per-call instructions appended after the instance-level ones.

        Raises
        ------
        ValueError
            If no transcript blocks are returned for *url*.

        Example
        -------
        >>> gen = MCQGenerator(provider="anthropic")
        >>> mcq = gen.from_video_url(
        ...     "https://www.youtube.com/watch?v=VXU4LSAQDSc",
        ...     n=10,
        ...     video_title="Grammarly AI Tutorial"
        ... )
        """
        print(f" [html2mcq] Fetching transcript for: {url}")
        blocks = self.transcript_extractor.from_url(url)
        if not blocks:
            raise ValueError(f"No transcript found for: {url}")
        print(f" [html2mcq] Got {len(blocks)} transcript chunks → generating {n} MCQs...")
        title = video_title or f"Video: {url}"
        return self._generate(
            blocks=blocks,
            n=n,
            page_title=title,
            source_url=url,
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    def from_url(
        self,
        url: str,
        n: int = 10,
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        enrich_videos: bool = True,
        enrich_pdfs: bool = True,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Fetch the page at *url*, extract content, and generate *n* MCQs.
        If the URL is a YouTube link, fetches the transcript automatically.

        Parameters
        ----------
        url : str
            Tutorial page URL, or a direct YouTube video URL.
        n : int
            Number of MCQ questions to generate.
        difficulty_mix : str, optional
            E.g. "30% easy, 40% medium, 30% hard".
        focus_topics : list[str], optional
            Topics to focus on.
        enrich_videos : bool
            If True (default), automatically fetch transcripts for any YouTube
            video links found in the page.
        enrich_pdfs : bool
            If True (default), automatically download and extract any PDF links
            found in the page.
        custom_instructions : str, optional
            Per-call instructions appended after the instance-level ones.
        """
        # Direct YouTube URL — go straight to transcript
        if is_youtube_url(url):
            return self.from_video_url(
                url,
                n=n,
                difficulty_mix=difficulty_mix,
                focus_topics=focus_topics,
                custom_instructions=custom_instructions,
            )

        title, blocks = self.extractor.from_url(url)
        blocks = self._enrich(blocks, enrich_videos, enrich_pdfs)

        return self._generate(
            blocks=blocks,
            n=n,
            page_title=title,
            source_url=url,
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    def from_pdf_url(
        self,
        url: str,
        n: int = 10,
        pdf_title: str = "",
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Generate MCQs directly from a PDF URL.

        Backend priority: PyMuPDF → auto-fallback to Docling if text is sparse.

        Parameters
        ----------
        url : str
            Direct URL to a PDF file.
        n : int
            Number of questions to generate.
        pdf_title : str, optional
            Title for the MCQSet. Defaults to the filename from the URL path
            (query string and fragment are ignored).
        difficulty_mix : str, optional
            E.g. "30% easy, 40% medium, 30% hard".
        focus_topics : list[str], optional
            Topics to focus on.
        custom_instructions : str, optional
            Per-call instructions appended after the instance-level ones.

        Raises
        ------
        ValueError
            If the extractor returns no blocks for *url*.

        Example
        -------
        >>> gen = MCQGenerator(provider="anthropic")
        >>> mcq = gen.from_pdf_url(
        ...     "https://example.com/python-tutorial.pdf",
        ...     n=10,
        ...     pdf_title="Python Tutorial"
        ... )
        """
        blocks = self.pdf_extractor.from_url(url)
        if not blocks:
            raise ValueError(f"No text could be extracted from PDF: {url}")
        title = pdf_title or self._title_from_pdf_url(url)
        return self._generate(
            blocks=blocks,
            n=n,
            page_title=title,
            source_url=url,
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    def from_pdf_path(
        self,
        path: str,
        n: int = 10,
        pdf_title: str = "",
        difficulty_mix: Optional[str] = None,
        focus_topics: Optional[List[str]] = None,
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """
        Generate MCQs from a local PDF file.

        Parameters
        ----------
        path : str
            Local file path to a PDF.
        n : int
            Number of questions to generate.
        pdf_title : str, optional
            Title for the MCQSet. Defaults to the file stem, title-cased.
        difficulty_mix : str, optional
            E.g. "30% easy, 40% medium, 30% hard".
        focus_topics : list[str], optional
            Topics to focus on.
        custom_instructions : str, optional
            Per-call instructions appended after the instance-level ones.

        Raises
        ------
        ValueError
            If the extractor returns no blocks for *path*.
        """
        blocks = self.pdf_extractor.from_path(path)
        if not blocks:
            raise ValueError(f"No text could be extracted from PDF: {path}")
        title = pdf_title or Path(path).stem.replace("-", " ").replace("_", " ").title()
        return self._generate(
            blocks=blocks,
            n=n,
            page_title=title,
            source_url=f"file://{path}",
            difficulty_mix=difficulty_mix,
            focus_topics=focus_topics,
            custom_instructions=custom_instructions,
        )

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _enrich(self, blocks: List[ContentBlock], enrich_videos: bool, enrich_pdfs: bool) -> List[ContentBlock]:
        """Run the optional video-transcript and PDF enrichment passes, in that order."""
        if enrich_videos:
            blocks = self.transcript_extractor.enrich_blocks(blocks)
        if enrich_pdfs:
            blocks = self.pdf_extractor.enrich_blocks(blocks)
        return blocks

    @staticmethod
    def _title_from_pdf_url(url: str) -> str:
        """Derive a human-readable title from the filename in a PDF URL.

        Only the URL path is considered, so query strings and fragments do not
        leak into the title; percent-escapes are decoded. Falls back to *url*
        itself when the path has no usable filename.
        """
        from urllib.parse import unquote, urlsplit

        filename = unquote(urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1])
        if filename.lower().endswith(".pdf"):
            filename = filename[:-4]
        title = filename.replace("-", " ").replace("_", " ").strip().title()
        return title or url

    def _resolve_instructions(self, per_call: Optional[str]) -> str:
        """
        Merge instance-level and per-call custom instructions.
        Instance-level comes first, per-call is appended after, separated by
        a newline. Blank or missing parts are dropped.
        """
        parts = []
        if self.custom_instructions and self.custom_instructions.strip():
            parts.append(self.custom_instructions.strip())
        if per_call and per_call.strip():
            parts.append(per_call.strip())
        return "\n".join(parts)

    # ── Internal generation pipeline ─────────────────────────────────────────

    def _generate(
        self,
        blocks: List[ContentBlock],
        n: int,
        page_title: str,
        source_url: Optional[str],
        difficulty_mix: Optional[str],
        focus_topics: Optional[List[str]],
        custom_instructions: Optional[str] = None,
    ) -> MCQSet:
        """Request questions from the backend in batches and assemble an MCQSet.

        Batching stops as soon as a batch comes back short, so the result may
        contain fewer than *n* questions; it never contains more.

        Raises
        ------
        ValueError
            If *blocks* is empty, or the backend reply cannot be parsed.
        """
        if not blocks:
            raise ValueError("No content extracted from the page. Check the URL or HTML.")

        all_questions: List[MCQQuestion] = []
        system_prompt = build_system_prompt()
        # Loop-invariant: the merged instructions are the same for every batch.
        instructions = self._resolve_instructions(custom_instructions)
        remaining = n

        while remaining > 0:
            batch_n = min(remaining, self.batch_size)
            user_prompt = build_user_prompt(
                blocks=blocks,
                n=batch_n,
                difficulty_mix=difficulty_mix,
                focus_topics=focus_topics,
                page_title=page_title,
                custom_instructions=instructions,
            )
            raw = self.backend.complete(system_prompt, user_prompt, self.max_tokens)
            batch_questions = self._parse_response(raw)
            all_questions.extend(batch_questions)
            remaining -= len(batch_questions)

            # Safety: a short (or empty) batch means the model is not
            # delivering what was asked — stop instead of looping forever.
            if len(batch_questions) < batch_n:
                break

        # Trim to exactly n (a batch may over-deliver)
        all_questions = all_questions[:n]

        summary = self._build_summary(blocks)
        exam_time = max(1, len(all_questions) * 2)  # 2 minutes per question

        return MCQSet(
            source_url=source_url,
            page_title=page_title,
            questions=all_questions,
            total_questions=len(all_questions),
            content_summary=summary,
            total_exam_time=exam_time,
            metadata={
                "provider": self.provider,
                "model": getattr(self.backend, "model", "unknown"),
                "requested_n": n,
                "content_blocks": len(blocks),
                # Sorted so the metadata is deterministic across runs.
                "content_types": sorted({b.type for b in blocks}),
            },
        )

    def _parse_response(self, raw: str) -> List[MCQQuestion]:
        """Parse AI JSON response into MCQQuestion objects.

        Accepts a JSON array of question objects, a single question object,
        or an object wrapping the array under ``"questions"``. Markdown code
        fences around the payload are stripped. Malformed items are skipped.

        Raises
        ------
        ValueError
            If no JSON can be recovered from *raw*, or the JSON is neither a
            list nor an object.
        """
        raw = raw or ""
        # Strip any accidental markdown fences
        text = raw.strip()
        if text.startswith("```"):
            text = re.sub(r"^```[a-z]*\n?", "", text)
            text = re.sub(r"\n?```$", "", text)

        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Fall back to the outermost [...] span embedded in chatter.
            match = re.search(r"\[.*\]", text, re.DOTALL)
            if not match:
                raise ValueError(f"AI returned non-JSON response:\n{raw[:500]}") from None
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError as exc:
                raise ValueError(f"AI returned non-JSON response:\n{raw[:500]}") from exc

        if isinstance(data, dict):
            # Some models wrap the array; a bare object is a single question.
            wrapped = data.get("questions")
            data = wrapped if isinstance(wrapped, list) else [data]
        if not isinstance(data, list):
            raise ValueError(
                f"AI returned unexpected JSON (expected a list of questions):\n{raw[:500]}"
            )

        questions: List[MCQQuestion] = []
        for item in data:
            if not isinstance(item, dict):
                continue  # Skip malformed items
            try:
                questions.append(self._question_from_item(item))
            except (KeyError, TypeError, ValueError, AttributeError):
                continue  # Skip malformed items
        return questions

    @staticmethod
    def _question_from_item(item: dict) -> MCQQuestion:
        """Build one MCQQuestion from a decoded JSON object.

        Raises KeyError / TypeError / ValueError when the item is malformed
        (missing options, non-numeric answers, answer index out of range).
        """
        # Support both old single int and new list format for answers
        raw_answers = item.get("answers", item.get("correct_answer", 0))
        if isinstance(raw_answers, int):
            answers = [raw_answers]
        else:
            answers = [int(a) for a in raw_answers]

        options = item["options"]
        if not isinstance(options, list):
            raise TypeError("options must be a list")
        options = options[:4]
        # Every answer must point at an option that survived the truncation.
        if not answers or any(not 0 <= a < len(options) for a in answers):
            raise ValueError("answer index out of range")

        multi = item.get("multi", len(answers) > 1)
        marks = float(item.get("marks", 1))
        negative_marks = float(item.get("negative_marks", 0.0 if multi else 0.25))

        return MCQQuestion(
            question_html=item.get("question_html", item.get("question", "")),
            options=options,
            answers=answers,
            multi=bool(multi),
            marks=marks,
            negative_marks=negative_marks,
            # A null difficulty falls back to "medium" rather than crashing.
            difficulty=str(item.get("difficulty") or "medium").lower(),
            explaination=item.get("explaination", item.get("explanation", "")),
        )

    @staticmethod
    def _build_summary(blocks: List[ContentBlock]) -> str:
        """Return a one-line count of blocks per type, e.g. ``Content: 1 code, 2 texts``."""
        counts = {}
        for b in blocks:
            counts[b.type] = counts.get(b.type, 0) + 1
        parts = [f"{v} {k}{'s' if v > 1 else ''}" for k, v in sorted(counts.items())]
        return "Content: " + ", ".join(parts)
579
+
580
+
581
+
html2mcq/models.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ Data models for html2mcq.
3
+ """
4
+ from dataclasses import dataclass, field, asdict
5
+ from typing import List, Optional, Dict, Any
6
+ import json
7
+
8
+
9
@dataclass
class ContentBlock:
    """One unit of content extracted from an HTML page.

    Attributes
    ----------
    type
        Block category: "text" | "image" | "video" | "pdf" | "code" | "table".
    content
        The text itself, or a URL for media blocks.
    alt_text
        Alternative text (images).
    caption
        Caption (media).
    metadata
        Free-form extra data; each instance gets its own dict.
    """

    type: str
    content: str
    alt_text: Optional[str] = None
    caption: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the block as a plain dict (recursive copy via ``asdict``)."""
        as_plain = asdict(self)
        return as_plain
20
+
21
+
22
@dataclass
class MCQQuestion:
    """One multiple-choice question together with its marking scheme."""

    question_html: str      # question text; inline HTML allowed
    options: List[str]      # the answer choices (always exactly 4)
    answers: List[int]      # 0-based indices of every correct option
    multi: bool             # True when more than one option is correct
    marks: float            # awarded for a correct answer
    negative_marks: float   # deducted for a wrong answer
    difficulty: str         # "easy" | "medium" | "hard"
    explaination: str       # explanation text (spelling kept per spec)

    def to_dict(self) -> dict:
        """Return the question as a plain dict keyed by field name."""
        keys = (
            "question_html",
            "options",
            "answers",
            "multi",
            "marks",
            "negative_marks",
            "difficulty",
            "explaination",
        )
        return {key: getattr(self, key) for key in keys}

    def to_pretty_str(self, number: int = 1) -> str:
        """Render the question as text, ticking the correct options.

        *number* is the ordinal shown in the ``Q<number>.`` prefix.
        """
        heading = f"Q{number}. [{self.difficulty.upper()}]"
        if self.multi:
            heading += " [MULTI]"
        heading += f" {self.question_html}"

        rendered = [heading, f" Marks: +{self.marks} / -{self.negative_marks}", ""]
        for idx, choice in enumerate(self.options):
            tick = "✓" if idx in self.answers else " "
            letter = chr(ord("A") + idx)
            rendered.append(f" {tick} {letter}) {choice}")
        if self.explaination:
            rendered.extend(["", f" Explanation: {self.explaination}"])
        return "\n".join(rendered)
59
+
60
+
61
@dataclass
class MCQSet:
    """A complete set of MCQ questions generated from one source.

    ``total_questions`` is stored as given by the creator; it is not
    recomputed from ``questions``.
    """

    source_url: Optional[str]
    page_title: str
    questions: List[MCQQuestion]
    total_questions: int
    content_summary: str
    total_exam_time: int = 30  # Minutes; 2 min per question by default
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Returns the clean exam-ready JSON structure.

        Only the exam time and the questions are included; title, source,
        summary and metadata are intentionally left out.
        """
        return {
            "total_exam_time": self.total_exam_time,
            "questions": [q.to_dict() for q in self.questions],
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize ``to_dict()`` as JSON, keeping non-ASCII characters as-is."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    def to_pretty_str(self) -> str:
        """Render a banner followed by every question, numbered from 1."""
        lines = [
            f"{'='*60}",
            f"MCQ Set : {self.page_title}",
            f"Source : {self.source_url or 'N/A'}",
            f"Questions: {self.total_questions} | Exam time: {self.total_exam_time} min",
            f"Summary : {self.content_summary}",
            f"{'='*60}",
            "",
        ]
        for i, q in enumerate(self.questions, 1):
            lines.append(q.to_pretty_str(i))
            lines.append("")
        return "\n".join(lines)

    def filter_by_difficulty(self, difficulty: str) -> "MCQSet":
        """Return a new set holding only questions of *difficulty* (case-insensitive).

        The question count and exam time (2 minutes per question) are
        recomputed for the filtered set. The metadata is shallow-copied so
        that editing the filtered set's metadata does not alter this set's.
        """
        wanted = difficulty.lower()
        filtered = [q for q in self.questions if q.difficulty.lower() == wanted]
        return MCQSet(
            source_url=self.source_url,
            page_title=self.page_title,
            questions=filtered,
            total_questions=len(filtered),
            content_summary=self.content_summary,
            total_exam_time=len(filtered) * 2,
            metadata=dict(self.metadata),
        )