html2mcq 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html2mcq/__init__.py +21 -0
- html2mcq/cli.py +101 -0
- html2mcq/extractor.py +311 -0
- html2mcq/generator.py +581 -0
- html2mcq/models.py +107 -0
- html2mcq/pdf.py +553 -0
- html2mcq/prompts.py +167 -0
- html2mcq/video.py +331 -0
- html2mcq-1.3.0.dist-info/METADATA +344 -0
- html2mcq-1.3.0.dist-info/RECORD +14 -0
- html2mcq-1.3.0.dist-info/WHEEL +5 -0
- html2mcq-1.3.0.dist-info/entry_points.txt +2 -0
- html2mcq-1.3.0.dist-info/licenses/LICENSE +25 -0
- html2mcq-1.3.0.dist-info/top_level.txt +1 -0
html2mcq/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
html2mcq - Convert any HTML tutorial page, YouTube video, or PDF into MCQ questions using AI.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .generator import MCQGenerator
|
|
6
|
+
from .extractor import ContentExtractor
|
|
7
|
+
from .video import VideoTranscriptExtractor
|
|
8
|
+
from .pdf import PDFExtractor
|
|
9
|
+
from .models import MCQQuestion, MCQSet, ContentBlock
|
|
10
|
+
|
|
11
|
+
# Keep in sync with the distribution metadata: this wheel is published as 1.3.0.
__version__ = "1.3.0"
__author__ = "html2mcq"
|
|
13
|
+
# Public API of the package: every name below is imported at the top of this
# module, and this list is what ``from html2mcq import *`` exposes.
__all__ = [
    "MCQGenerator",
    "ContentExtractor",
    "VideoTranscriptExtractor",
    "PDFExtractor",
    "MCQQuestion",
    "MCQSet",
    "ContentBlock",
]
|
html2mcq/cli.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""
|
|
2
|
+
html2mcq CLI
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
Usage examples
|
|
6
|
+
--------------
|
|
7
|
+
html2mcq https://docs.python.org/3/tutorial/ --n 15
|
|
8
|
+
html2mcq https://example.com/tutorial --n 10 --provider openai --output quiz.json
|
|
9
|
+
html2mcq --html page.html --n 5 --difficulty "50% easy, 50% medium"
|
|
10
|
+
"""
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _positive_int(value):
    """argparse ``type=`` callable: parse *value* as an integer >= 1.

    Raises ``argparse.ArgumentTypeError`` (which argparse turns into a usage
    error, exit status 2) for non-integers and for zero/negative numbers.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _build_parser():
    """Create the argument parser for the ``html2mcq`` command."""
    parser = argparse.ArgumentParser(
        prog="html2mcq",
        description="Convert any HTML tutorial page to MCQ questions using AI.",
    )

    # Input: exactly one of a URL or a local HTML file must be given.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("url", nargs="?", help="URL of the tutorial page")
    input_group.add_argument("--html", metavar="FILE", help="Path to a local HTML file")

    # Generation options
    parser.add_argument("-n", "--n", type=_positive_int, default=10,
                        help="Number of questions (default: 10)")
    parser.add_argument("--difficulty", default=None,
                        help='E.g. "30%% easy, 40%% medium, 30%% hard"')
    parser.add_argument("--topics", nargs="*", help="Focus topics")
    parser.add_argument("--instructions", "-i", default="",
                        help='Custom instructions e.g. "Make answers very close and confusing"')
    parser.add_argument("--batch-size", type=_positive_int, default=10,
                        help="Questions per API call (default: 10)")

    # AI provider
    parser.add_argument("--provider", default="anthropic",
                        choices=["anthropic", "openai", "openrouter"],
                        help="AI provider (default: anthropic)")
    parser.add_argument("--model", default="", help="Override model name")
    parser.add_argument("--api-key", default="", help="API key (or set env var)")

    # Output
    parser.add_argument("--output", "-o", default="",
                        help="Output file (.json or .txt). Default: stdout")
    parser.add_argument("--format", choices=["json", "pretty"], default="pretty",
                        help="Output format (default: pretty)")
    return parser


def main():
    """Entry point of the ``html2mcq`` command-line tool.

    Parses ``sys.argv``, builds an ``MCQGenerator``, generates questions from
    either a URL or a local HTML file, and writes the result to stdout or to
    the file given with ``--output``.

    Exits with status 2 on invalid arguments (argparse) and with status 1
    when the package cannot be imported, the generator is misconfigured,
    generation fails, or the output file cannot be written.  Diagnostics go
    to stderr so that stdout only ever carries the generated questions.
    """
    args = _build_parser().parse_args()

    # Lazy import to keep startup fast (argument errors and --help do not
    # pay for importing the generator).
    try:
        from html2mcq import MCQGenerator
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        gen = MCQGenerator(
            # An empty --api-key is passed as None.
            api_key=args.api_key or None,
            provider=args.provider,
            model=args.model,
            batch_size=args.batch_size,
        )
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        sys.exit(1)

    # Options shared by both input modes.
    options = {
        "n": args.n,
        "difficulty_mix": args.difficulty,
        "focus_topics": args.topics,
        "custom_instructions": args.instructions or None,
    }

    try:
        if args.html:
            with open(args.html, encoding="utf-8") as f:
                html = f.read()
            mcq_set = gen.from_html(html, **options)
        else:
            print(f"Fetching and analysing: {args.url}", file=sys.stderr)
            mcq_set = gen.from_url(args.url, **options)
    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"Generation failed: {e}", file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == "json":
        output = mcq_set.to_json()
    else:
        output = mcq_set.to_pretty_str()

    if not args.output:
        print(output)
        return

    # A bad path or missing permission should end in a clean error message,
    # like every other failure above, not in a traceback.
    try:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
    except OSError as e:
        print(f"Could not write {args.output}: {e}", file=sys.stderr)
        sys.exit(1)
    print(f"Saved {mcq_set.total_questions} questions to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
html2mcq/extractor.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ContentExtractor: Parses HTML tutorial pages and extracts text, images,
|
|
3
|
+
video links, PDF links, code blocks, and tables into structured ContentBlocks.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import urllib.request
|
|
9
|
+
import urllib.error
|
|
10
|
+
from typing import List, Optional, Tuple
|
|
11
|
+
from urllib.parse import urljoin, urlparse
|
|
12
|
+
|
|
13
|
+
from .models import ContentBlock
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from bs4 import BeautifulSoup, Tag
|
|
17
|
+
BS4_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
BS4_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ── URL helpers ───────────────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
# Regular expressions (searched case-insensitively by _is_video_url) that mark
# a URL as pointing at a video.
VIDEO_PATTERNS = [
    r"youtube\.com/watch",
    # Embedded players use /embed/<id> rather than /watch?v=<id>; without
    # these, <iframe> embeds are never recognised as videos.
    r"youtube\.com/(embed|shorts|live)/",
    r"youtube-nocookie\.com/embed/",
    r"youtu\.be/",
    r"vimeo\.com/",
    r"dailymotion\.com/",
    r"twitch\.tv/",
    # Direct media files; a query string or fragment may follow the extension.
    r"\.mp4($|[?#])",
    r"\.webm($|[?#])",
    r"\.ogg($|[?#])",
]

# Regular expressions (searched case-insensitively by _is_pdf_url) that mark a
# URL as pointing at a PDF or slide document.
PDF_PATTERNS = [
    r"\.pdf($|[?#])",
    r"drive\.google\.com.*pdf",
    r"docs\.google\.com/.*presentation",
]

# Lower-case file extensions that _is_image_url treats as images.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _is_video_url(url: str) -> bool:
    """Return True when *url* matches any VIDEO_PATTERNS regex (case-insensitive)."""
    for pattern in VIDEO_PATTERNS:
        if re.search(pattern, url, re.IGNORECASE):
            return True
    return False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_pdf_url(url: str) -> bool:
    """Return True when *url* matches any PDF_PATTERNS regex (case-insensitive)."""
    for pattern in PDF_PATTERNS:
        if re.search(pattern, url, re.IGNORECASE):
            return True
    return False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _is_image_url(url: str) -> bool:
    """Return True when the path component of *url* ends with an image extension.

    Only the URL path is inspected (lower-cased), so query strings and
    fragments do not affect the result.
    """
    lowered_path = urlparse(url).path.lower()
    # str.endswith accepts a tuple of suffixes and is False for an empty tuple.
    return lowered_path.endswith(tuple(IMAGE_EXTENSIONS))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _absolute_url(base: str, href: str) -> str:
    """Resolve *href* against *base*; an empty *href* yields an empty string."""
    return urljoin(base, href) if href else ""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ── Extractor ─────────────────────────────────────────────────────────────────
|
|
64
|
+
|
|
65
|
+
class ContentExtractor:
    """
    Extracts structured content from an HTML string or URL.

    The document is walked in order and turned into a flat list of
    ContentBlock objects of type "text", "image", "video", "pdf", "code" or
    "table".  Media URLs are resolved against the base URL and reported once
    per page.

    Usage
    -----
        extractor = ContentExtractor()
        title, blocks = extractor.from_url("https://example.com/tutorial")
        # or
        title, blocks = extractor.from_html(html_string, base_url="https://example.com/tutorial")
    """

    # Tags whose text content we skip entirely (navigation, boilerplate, etc.)
    SKIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript"}
    # Tags that typically contain the main article body
    # (not referenced by the methods of this class).
    MAIN_TAGS = {"article", "main", "section", "div"}
    # Elements whose whole text is captured as a single "text" block.
    TEXT_TAGS = frozenset({
        "h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote",
        "figcaption", "td", "th", "dt", "dd", "summary", "details",
    })
    # Elements that can reference media; searched inside text/table elements.
    MEDIA_TAGS = ("img", "video", "iframe", "a")

    def __init__(
        self,
        min_text_length: int = 40,
        include_images: bool = True,
        include_videos: bool = True,
        include_pdfs: bool = True,
        include_code: bool = True,
        include_tables: bool = True,
        user_agent: str = "html2mcq/1.0 (content extractor)",
        timeout: int = 15,
    ):
        """
        Configure which kinds of content are extracted.

        *min_text_length* is the minimum number of characters a text element
        must have to be reported; the ``include_*`` flags switch individual
        block types on or off; *user_agent* and *timeout* (seconds) are used
        when fetching pages by URL.

        Raises ImportError when BeautifulSoup4 is not installed.
        """
        if not BS4_AVAILABLE:
            raise ImportError(
                "BeautifulSoup4 is required: pip install html2mcq[bs4] "
                "or pip install beautifulsoup4 lxml"
            )
        self.min_text_length = min_text_length
        self.include_images = include_images
        self.include_videos = include_videos
        self.include_pdfs = include_pdfs
        self.include_code = include_code
        self.include_tables = include_tables
        self.user_agent = user_agent
        self.timeout = timeout

    # ── Public API ────────────────────────────────────────────────────────────

    def from_url(self, url: str) -> Tuple[str, List[ContentBlock]]:
        """
        Fetch *url*, parse the HTML, and return (page_title, blocks).
        """
        html = self._fetch(url)
        return self.from_html(html, base_url=url)

    def from_html(
        self, html: str, base_url: str = ""
    ) -> Tuple[str, List[ContentBlock]]:
        """
        Parse *html* and return (page_title, blocks).
        *base_url* is used to resolve relative links.
        """
        # Prefer lxml when it is importable, else the built-in parser.
        parser = "lxml" if self._lxml_available() else "html.parser"
        soup = BeautifulSoup(html, parser)

        # Remove boilerplate tags
        for tag in soup.find_all(list(self.SKIP_TAGS)):
            tag.decompose()

        title = self._extract_title(soup)
        blocks: List[ContentBlock] = []
        seen_urls: set = set()

        # Walk the DOM in document order
        body = soup.find("body") or soup
        self._walk(body, base_url, blocks, seen_urls)

        return title, blocks

    # ── Private helpers ───────────────────────────────────────────────────────

    def _fetch(self, url: str) -> str:
        """Download *url* and return the response body decoded to text."""
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": self.user_agent,
                "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
            },
        )
        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            raw = resp.read()
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python does not know; fall back
            # to UTF-8 instead of failing the whole extraction.
            return raw.decode("utf-8", errors="replace")

    @staticmethod
    def _lxml_available() -> bool:
        """Return True when the optional ``lxml`` parser can be imported."""
        try:
            import lxml  # noqa: F401
            return True
        except ImportError:
            return False

    @staticmethod
    def _extract_title(soup: BeautifulSoup) -> str:
        """Return the first <h1> text, else the <title> text, else "Untitled Page"."""
        h1 = soup.find("h1")
        if h1:
            return h1.get_text(strip=True)
        title_tag = soup.find("title")
        if title_tag:
            return title_tag.get_text(strip=True)
        return "Untitled Page"

    @staticmethod
    def _claim_url(url: str, seen_urls: set) -> bool:
        """Record *url* in *seen_urls*; return False if it was already there."""
        if url in seen_urls:
            return False
        seen_urls.add(url)
        return True

    @staticmethod
    def _code_language(tag) -> str:
        """
        Return the language named by a ``language-*`` class, or "".

        The classes of *tag* itself are checked first; when they name no
        language, the first nested <code> element is checked too, which
        covers the common ``<pre><code class="language-x">`` markup.
        """
        candidates = [tag]
        nested = tag.find("code")
        if nested is not None:
            candidates.append(nested)
        for element in candidates:
            lang = ""
            for cls in element.get("class") or []:
                if "language-" in cls:
                    lang = cls.replace("language-", "")
            if lang:
                return lang
        return ""

    def _emit_image(self, tag, base_url: str, blocks: List[ContentBlock], seen_urls: set) -> None:
        """Append an "image" block for an <img> element with a new ``src``."""
        src = tag.get("src", "")
        if not src:
            return
        abs_src = _absolute_url(base_url, src)
        if self._claim_url(abs_src, seen_urls):
            blocks.append(ContentBlock(
                type="image",
                content=abs_src,
                alt_text=tag.get("alt", ""),
                caption=tag.get("title", ""),
            ))

    def _emit_video(self, tag, base_url: str, blocks: List[ContentBlock], seen_urls: set) -> None:
        """Append a "video" block for a <video> element (own ``src`` or first <source>)."""
        src = tag.get("src", "")
        if not src:
            source_tag = tag.find("source")
            if source_tag is not None:
                src = source_tag.get("src", "")
        if not src:
            return
        abs_src = _absolute_url(base_url, src)
        if self._claim_url(abs_src, seen_urls):
            blocks.append(ContentBlock(
                type="video",
                content=abs_src,
                caption=tag.get("title", ""),
            ))

    def _emit_iframe(self, tag, base_url: str, blocks: List[ContentBlock], seen_urls: set) -> None:
        """Append a "video" block for an <iframe> whose ``src`` looks like a video URL."""
        src = tag.get("src", "")
        if not src or not _is_video_url(src):
            return
        abs_src = _absolute_url(base_url, src)
        if self._claim_url(abs_src, seen_urls):
            blocks.append(ContentBlock(
                type="video",
                content=abs_src,
                caption=tag.get("title", ""),
                metadata={"embed": True},
            ))

    def _emit_link(self, tag, base_url: str, blocks: List[ContentBlock], seen_urls: set) -> None:
        """
        Append a "pdf", "video" or "image" block for an <a> element whose
        ``href`` is classified as such (checked in that order).  Other links
        produce nothing and are not recorded as seen.
        """
        href = tag.get("href", "")
        if not href:
            return
        abs_href = _absolute_url(base_url, href)
        if abs_href in seen_urls:
            return
        if self.include_pdfs and _is_pdf_url(abs_href):
            kind = "pdf"
        elif self.include_videos and _is_video_url(abs_href):
            kind = "video"
        elif self.include_images and _is_image_url(abs_href):
            kind = "image"
        else:
            return
        seen_urls.add(abs_href)
        blocks.append(ContentBlock(
            type=kind,
            content=abs_href,
            alt_text=tag.get_text(strip=True),
        ))

    def _collect_media(self, tag, base_url: str, blocks: List[ContentBlock], seen_urls: set) -> None:
        """
        Report media referenced anywhere inside *tag* without emitting text.

        Text elements and tables are captured as one block and not walked
        further, so images, videos and document links nested in them are
        picked up here.
        """
        for element in tag.find_all(list(self.MEDIA_TAGS)):
            kind = element.name.lower()
            if kind == "img":
                if self.include_images:
                    self._emit_image(element, base_url, blocks, seen_urls)
            elif kind == "video":
                if self.include_videos:
                    self._emit_video(element, base_url, blocks, seen_urls)
            elif kind == "iframe":
                if self.include_videos:
                    self._emit_iframe(element, base_url, blocks, seen_urls)
            else:
                self._emit_link(element, base_url, blocks, seen_urls)

    def _walk(
        self,
        node,
        base_url: str,
        blocks: List[ContentBlock],
        seen_urls: set,
    ):
        """
        Recursively visit the element children of *node* in document order,
        appending extracted blocks to *blocks*.  *seen_urls* holds the media
        URLs already reported so each one appears only once.
        """
        for child in node.children:
            if getattr(child, "name", None) is None:
                # NavigableString – skip, text is captured at element level
                continue

            tag: Tag = child
            name = tag.name.lower()

            if name in self.SKIP_TAGS:
                continue

            # ── Images ──
            if name == "img" and self.include_images:
                self._emit_image(tag, base_url, blocks, seen_urls)
                continue

            # ── <video> ──
            if name == "video" and self.include_videos:
                self._emit_video(tag, base_url, blocks, seen_urls)
                continue

            # ── <iframe> (YouTube / Vimeo embeds) ──
            if name == "iframe" and self.include_videos:
                self._emit_iframe(tag, base_url, blocks, seen_urls)
                continue

            # ── Anchors ──
            if name == "a":
                self._emit_link(tag, base_url, blocks, seen_urls)
                # Still recurse into <a> for nested text / images
                self._walk(tag, base_url, blocks, seen_urls)
                continue

            # ── Code blocks ──
            if name in ("pre", "code") and self.include_code:
                code_text = tag.get_text()
                # Very short snippets (under 10 characters) are not reported.
                if len(code_text.strip()) >= 10:
                    blocks.append(ContentBlock(
                        type="code",
                        content=code_text,
                        metadata={"language": self._code_language(tag)},
                    ))
                continue

            # ── Tables ──
            if name == "table" and self.include_tables:
                rows = []
                for tr in tag.find_all("tr"):
                    cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
                    if cells:
                        rows.append(" | ".join(cells))
                if rows:
                    blocks.append(ContentBlock(
                        type="table",
                        content="\n".join(rows),
                        metadata={"rows": len(rows)},
                    ))
                # Cells are flattened to text above; keep any media they hold.
                self._collect_media(tag, base_url, blocks, seen_urls)
                continue

            # ── Headings & text-bearing elements ──
            if name in self.TEXT_TAGS:
                text = tag.get_text(" ", strip=True)
                if len(text) >= self.min_text_length:
                    blocks.append(ContentBlock(
                        type="text",
                        content=text,
                        metadata={"tag": name},
                    ))
                # Don't recurse for text – it is already captured – but do
                # pick up images, videos and links nested in this element.
                self._collect_media(tag, base_url, blocks, seen_urls)
                continue

            # ── Recurse into containers ──
            self._walk(tag, base_url, blocks, seen_urls)
|