html2mcq 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
html2mcq/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ html2mcq - Convert any HTML tutorial page, YouTube video, or PDF into MCQ questions using AI.
3
+ """
4
+
5
from .generator import MCQGenerator
from .extractor import ContentExtractor
from .video import VideoTranscriptExtractor
from .pdf import PDFExtractor
from .models import MCQQuestion, MCQSet, ContentBlock

# Must match the version of the published distribution; this file ships in
# the 1.3.0 wheel, so the previous "1.2.0" value was stale.
__version__ = "1.3.0"
__author__ = "html2mcq"

# Names re-exported as the package's public API.
__all__ = [
    "MCQGenerator",
    "ContentExtractor",
    "VideoTranscriptExtractor",
    "PDFExtractor",
    "MCQQuestion",
    "MCQSet",
    "ContentBlock",
]
html2mcq/cli.py ADDED
@@ -0,0 +1,101 @@
1
+ """
2
+ html2mcq CLI
3
+ ============
4
+
5
+ Usage examples
6
+ --------------
7
+ html2mcq https://docs.python.org/3/tutorial/ --n 15
8
+ html2mcq https://example.com/tutorial --n 10 --provider openai --output quiz.json
9
+ html2mcq --html page.html --n 5 --difficulty "50% easy, 50% medium"
10
+ """
11
+ import argparse
12
+ import json
13
+ import os
14
+ import sys
15
+
16
+
17
def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1, for use as an argparse ``type=``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is below 1.
            argparse reports this as a usage error and exits with status 2.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main():
    """Command-line entry point: parse arguments, generate MCQs, emit them.

    Reads its arguments from ``sys.argv``. Exactly one input is required:
    a positional URL or ``--html FILE``. The generated question set is
    printed to stdout, or written to ``--output`` when given.

    Exits with status 1 when html2mcq cannot be imported, the generator
    rejects its configuration, generation fails, or the output file cannot
    be written. Invalid command-line arguments exit with status 2 (argparse).
    """
    parser = argparse.ArgumentParser(
        prog="html2mcq",
        description="Convert any HTML tutorial page to MCQ questions using AI.",
    )

    # Input: a URL or a local HTML file, never both.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("url", nargs="?", help="URL of the tutorial page")
    input_group.add_argument("--html", metavar="FILE", help="Path to a local HTML file")

    # Generation options. Counts are validated at parse time so that a zero
    # or negative value is a usage error instead of reaching the generator.
    parser.add_argument("-n", "--n", type=_positive_int, default=10,
                        help="Number of questions (default: 10)")
    parser.add_argument("--difficulty", default=None,
                        help='E.g. "30%% easy, 40%% medium, 30%% hard"')
    parser.add_argument("--topics", nargs="*", help="Focus topics")
    parser.add_argument("--instructions", "-i", default="",
                        help='Custom instructions e.g. "Make answers very close and confusing"')
    parser.add_argument("--batch-size", type=_positive_int, default=10,
                        help="Questions per API call (default: 10)")

    # AI provider
    parser.add_argument("--provider", default="anthropic",
                        choices=["anthropic", "openai", "openrouter"],
                        help="AI provider (default: anthropic)")
    parser.add_argument("--model", default="", help="Override model name")
    parser.add_argument("--api-key", default="", help="API key (or set env var)")

    # Output
    parser.add_argument("--output", "-o", default="",
                        help="Output file (.json or .txt). Default: stdout")
    parser.add_argument("--format", choices=["json", "pretty"], default="pretty",
                        help="Output format (default: pretty)")

    args = parser.parse_args()

    # Lazy import to keep startup fast (and --help usable without the
    # optional dependencies installed).
    try:
        from html2mcq import MCQGenerator
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        gen = MCQGenerator(
            # An empty --api-key is passed as None rather than "".
            api_key=args.api_key or None,
            provider=args.provider,
            model=args.model,
            batch_size=args.batch_size,
        )
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        sys.exit(1)

    # Options shared by both input modes.
    generation_options = {
        "n": args.n,
        "difficulty_mix": args.difficulty,
        "focus_topics": args.topics,
        "custom_instructions": args.instructions or None,
    }

    try:
        if args.html:
            with open(args.html, encoding="utf-8") as f:
                html = f.read()
            mcq_set = gen.from_html(html, **generation_options)
        else:
            print(f"Fetching and analysing: {args.url}", file=sys.stderr)
            mcq_set = gen.from_url(args.url, **generation_options)
    except Exception as e:
        # Top-level boundary: report any failure as a one-line message.
        print(f"Generation failed: {e}", file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == "json":
        output = mcq_set.to_json()
    else:
        output = mcq_set.to_pretty_str()

    if args.output:
        # A bad path or permission problem is reported like every other
        # failure above, not as an unhandled traceback.
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
        except OSError as e:
            print(f"Could not write {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Saved {mcq_set.total_questions} questions to {args.output}", file=sys.stderr)
    else:
        print(output)


if __name__ == "__main__":
    main()
html2mcq/extractor.py ADDED
@@ -0,0 +1,311 @@
1
+ """
2
+ ContentExtractor: Parses HTML tutorial pages and extracts text, images,
3
+ video links, PDF links, code blocks, and tables into structured ContentBlocks.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ import urllib.request
9
+ import urllib.error
10
+ from typing import List, Optional, Tuple
11
+ from urllib.parse import urljoin, urlparse
12
+
13
+ from .models import ContentBlock
14
+
15
+ try:
16
+ from bs4 import BeautifulSoup, Tag
17
+ BS4_AVAILABLE = True
18
+ except ImportError:
19
+ BS4_AVAILABLE = False
20
+
21
+
22
+ # ── URL helpers ───────────────────────────────────────────────────────────────
23
+
24
# Regular expressions (searched case-insensitively) that mark a URL as a video.
VIDEO_PATTERNS = [
    # YouTube watch pages plus the /embed/ form used by <iframe> players,
    # including the privacy-enhanced youtube-nocookie.com host.
    r"youtube\.com/(?:watch|embed/)",
    r"youtube-nocookie\.com/embed/",
    r"youtu\.be/",
    r"vimeo\.com/",
    r"dailymotion\.com/",
    r"twitch\.tv/",
    # Direct media files. A query string or fragment may follow the
    # extension (e.g. signed CDN links such as clip.mp4?token=...).
    r"\.mp4(?:$|[?#])",
    r"\.webm(?:$|[?#])",
    r"\.ogg(?:$|[?#])",
]

# Regular expressions (searched case-insensitively) that mark a URL as a PDF
# or slide deck.
PDF_PATTERNS = [
    # Allow "?download=1" style queries and "#page=3" style fragments.
    r"\.pdf(?:$|[?#])",
    r"drive\.google\.com.*pdf",
    r"docs\.google\.com/.*presentation",
]

# Lower-case file extensions treated as images.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}
42
+
43
+
44
def _is_video_url(url: str) -> bool:
    """Return True if *url* matches any VIDEO_PATTERNS entry, ignoring case."""
    for pattern in VIDEO_PATTERNS:
        if re.search(pattern, url, re.IGNORECASE):
            return True
    return False
46
+
47
+
48
def _is_pdf_url(url: str) -> bool:
    """Return True if *url* matches any PDF_PATTERNS entry, ignoring case."""
    for pattern in PDF_PATTERNS:
        if re.search(pattern, url, re.IGNORECASE) is not None:
            return True
    return False
50
+
51
+
52
def _is_image_url(url: str) -> bool:
    """Return True if the path component of *url* ends with an image extension.

    Only the path is inspected, so a query string or fragment after the
    file name does not affect the result. The comparison is case-insensitive.
    """
    lowered_path = urlparse(url).path.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return lowered_path.endswith(tuple(IMAGE_EXTENSIONS))
55
+
56
+
57
def _absolute_url(base: str, href: str) -> str:
    """Resolve *href* against *base*; an empty *href* yields an empty string."""
    return urljoin(base, href) if href else ""
61
+
62
+
63
+ # ── Extractor ─────────────────────────────────────────────────────────────────
64
+
65
class ContentExtractor:
    """
    Extracts structured content from an HTML string or URL.

    Usage
    -----
        extractor = ContentExtractor()
        title, blocks = extractor.from_url("https://example.com/tutorial")
        # or
        title, blocks = extractor.from_html(html_string, base_url="https://example.com/tutorial")
    """

    # Tags whose text content we skip entirely (navigation, boilerplate, etc.)
    SKIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript"}
    # Tags that typically contain the main article body
    MAIN_TAGS = {"article", "main", "section", "div"}
    # Elements whose full text is captured as one "text" block
    _TEXT_TAGS = frozenset({
        "h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote",
        "figcaption", "td", "th", "dt", "dd", "summary", "details",
    })

    def __init__(
        self,
        min_text_length: int = 40,
        include_images: bool = True,
        include_videos: bool = True,
        include_pdfs: bool = True,
        include_code: bool = True,
        include_tables: bool = True,
        user_agent: str = "html2mcq/1.0 (content extractor)",
        timeout: int = 15,
    ):
        """
        Configure which kinds of content are extracted.

        min_text_length -- text elements shorter than this are not emitted
        include_*       -- switch extraction of each block type on or off
        user_agent      -- User-Agent header sent when fetching a URL
        timeout         -- timeout passed to urlopen when fetching a URL

        Raises ImportError when BeautifulSoup4 is not installed.
        """
        if not BS4_AVAILABLE:
            raise ImportError(
                "BeautifulSoup4 is required: pip install html2mcq[bs4] "
                "or pip install beautifulsoup4 lxml"
            )
        self.min_text_length = min_text_length
        self.include_images = include_images
        self.include_videos = include_videos
        self.include_pdfs = include_pdfs
        self.include_code = include_code
        self.include_tables = include_tables
        self.user_agent = user_agent
        self.timeout = timeout

    # ── Public API ────────────────────────────────────────────────────────────

    def from_url(self, url: str) -> Tuple[str, List["ContentBlock"]]:
        """
        Fetch *url*, parse the HTML, and return (page_title, blocks).
        """
        html = self._fetch(url)
        return self.from_html(html, base_url=url)

    def from_html(
        self, html: str, base_url: str = ""
    ) -> Tuple[str, List["ContentBlock"]]:
        """
        Parse *html* and return (page_title, blocks).
        *base_url* is used to resolve relative links.
        """
        parser = "lxml" if self._lxml_available() else "html.parser"
        soup = BeautifulSoup(html, parser)

        # Remove boilerplate tags
        for tag in soup(list(self.SKIP_TAGS)):
            tag.decompose()

        # The title is read after the boilerplate is removed, so an <h1>
        # inside <header> or <nav> is not used as the page title.
        title = self._extract_title(soup)
        blocks: List["ContentBlock"] = []
        seen_urls: set = set()

        # Walk the DOM in document order
        body = soup.find("body") or soup
        self._walk(body, base_url, blocks, seen_urls)

        return title, blocks

    # ── Private helpers ───────────────────────────────────────────────────────

    def _fetch(self, url: str) -> str:
        """Download *url* and return the response body decoded to text."""
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": self.user_agent,
                "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
            },
        )
        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            raw = resp.read()
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python has no codec for.
            return raw.decode("utf-8", errors="replace")

    @staticmethod
    def _lxml_available() -> bool:
        """Return True when the optional lxml parser can be imported."""
        try:
            import lxml  # noqa: F401
            return True
        except ImportError:
            return False

    @staticmethod
    def _extract_title(soup: "BeautifulSoup") -> str:
        """
        Return the text of the first non-empty <h1>, else of <title>,
        else "Untitled Page".
        """
        for tag_name in ("h1", "title"):
            element = soup.find(tag_name)
            if element is None:
                continue
            text = element.get_text(strip=True)
            # An empty heading (e.g. an image-only <h1>) falls through to
            # the next candidate instead of producing an empty title.
            if text:
                return text
        return "Untitled Page"

    @staticmethod
    def _add_media(blocks, seen_urls: set, kind: str, url: str, **fields) -> None:
        """Append a *kind* block for *url* unless that URL was already recorded."""
        if url in seen_urls:
            return
        seen_urls.add(url)
        blocks.append(ContentBlock(type=kind, content=url, **fields))

    def _classify_link(self, url: str) -> str:
        """Return "pdf", "video" or "image" for an enabled media link, else ""."""
        if self.include_pdfs and _is_pdf_url(url):
            return "pdf"
        if self.include_videos and _is_video_url(url):
            return "video"
        if self.include_images and _is_image_url(url):
            return "image"
        return ""

    @staticmethod
    def _code_language(tag) -> str:
        """
        Return the language named by a "language-xxx" class on *tag* or,
        failing that, on its first nested <code> element.
        """
        for element in (tag, tag.find("code")):
            if element is None:
                continue
            lang = ""
            for cls in element.get("class", []) or []:
                if "language-" in cls:
                    lang = cls.replace("language-", "")
            if lang:
                return lang
        return ""

    def _walk(
        self,
        node,
        base_url: str,
        blocks: List["ContentBlock"],
        seen_urls: set,
        capture_text: bool = True,
    ):
        """
        Append blocks for the children of *node*, in document order.

        With ``capture_text=False`` only media (images, videos, embeds and
        links) is collected. That mode is used for the inside of an element
        whose text was already captured as a whole, so media nested in
        paragraphs, list items and table cells is kept without emitting the
        same text twice.
        """
        for child in node.children:
            raw_name = getattr(child, "name", None)
            if raw_name is None:
                # NavigableString – skip, text is captured at element level
                continue

            tag = child
            name = raw_name.lower()

            if name in self.SKIP_TAGS:
                continue

            # ── Images ──
            if name == "img" and self.include_images:
                src = tag.get("src", "")
                if src:
                    self._add_media(
                        blocks, seen_urls, "image", _absolute_url(base_url, src),
                        alt_text=tag.get("alt", ""),
                        caption=tag.get("title", ""),
                    )
                continue

            # ── <video> ──
            if name == "video" and self.include_videos:
                src = tag.get("src", "")
                if not src:
                    # Fall back to the first <source> child.
                    source_tag = tag.find("source")
                    if source_tag:
                        src = source_tag.get("src", "")
                if src:
                    self._add_media(
                        blocks, seen_urls, "video", _absolute_url(base_url, src),
                        caption=tag.get("title", ""),
                    )
                continue

            # ── <iframe> (YouTube / Vimeo embeds) ──
            if name == "iframe" and self.include_videos:
                src = tag.get("src", "")
                if src and _is_video_url(src):
                    self._add_media(
                        blocks, seen_urls, "video", _absolute_url(base_url, src),
                        caption=tag.get("title", ""),
                        metadata={"embed": True},
                    )
                continue

            # ── Anchors ──
            if name == "a":
                href = tag.get("href", "")
                if href:
                    abs_href = _absolute_url(base_url, href)
                    kind = self._classify_link(abs_href)
                    if kind:
                        self._add_media(
                            blocks, seen_urls, kind, abs_href,
                            alt_text=tag.get_text(strip=True),
                        )
                # Still recurse into <a> for nested text / images
                self._walk(tag, base_url, blocks, seen_urls, capture_text)
                continue

            if capture_text:
                # ── Code blocks ──
                if name in ("pre", "code") and self.include_code:
                    code_text = tag.get_text()
                    if len(code_text.strip()) >= 10:
                        blocks.append(ContentBlock(
                            type="code",
                            content=code_text,
                            metadata={"language": self._code_language(tag)},
                        ))
                    continue

                # ── Tables ──
                if name == "table" and self.include_tables:
                    rows = []
                    for tr in tag.find_all("tr"):
                        cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
                        if cells:
                            rows.append(" | ".join(cells))
                    if rows:
                        blocks.append(ContentBlock(
                            type="table",
                            content="\n".join(rows),
                            metadata={"rows": len(rows)},
                        ))
                    # Cell text is already captured; collect only media.
                    self._walk(tag, base_url, blocks, seen_urls, capture_text=False)
                    continue

                # ── Headings & text-bearing elements ──
                if name in self._TEXT_TAGS:
                    text = tag.get_text(" ", strip=True)
                    if len(text) >= self.min_text_length:
                        blocks.append(ContentBlock(
                            type="text",
                            content=text,
                            metadata={"tag": name},
                        ))
                    # The text is captured above, so descend only for the
                    # images, videos and links nested inside it.
                    self._walk(tag, base_url, blocks, seen_urls, capture_text=False)
                    continue

            # ── Recurse into containers ──
            self._walk(tag, base_url, blocks, seen_urls, capture_text)