pdf2mcq 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdf2mcq-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 pdf2mcq
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pdf2mcq-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,173 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf2mcq
3
+ Version: 1.0.0
4
+ Summary: Convert PDF files (text, scanned, mixed) into MCQ questions using AI
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/manjur-ai/pdf2mcq
7
+ Project-URL: Issues, https://github.com/manjur-ai/pdf2mcq/issues
8
+ Keywords: mcq,quiz,ai,education,pdf,ocr,llm,openrouter
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Education
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Education
18
+ Classifier: Topic :: Text Processing
19
+ Requires-Python: >=3.9
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: anthropic>=0.25
23
+ Requires-Dist: openai>=1.30
24
+ Requires-Dist: pymupdf>=1.24
25
+ Requires-Dist: Pillow>=10.0
26
+ Requires-Dist: pytesseract>=0.3
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7; extra == "dev"
29
+ Requires-Dist: pytest-cov; extra == "dev"
30
+ Requires-Dist: build; extra == "dev"
31
+ Requires-Dist: twine; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # pdf2mcq
35
+
36
+ Convert PDF files — text PDFs, scanned books, mixed documents — into high-quality MCQ questions using AI.
37
+
38
+ Built on top of **html2mcq**'s PDF pipeline, extracted as a standalone library focused purely on PDF-to-MCQ generation.
39
+
40
+ ---
41
+
42
+ ## Features
43
+
44
+ - **Smart PDF detection** — automatically detects text PDFs, scanned PDFs, and mixed documents
45
+ - **Text PDFs** — fast extraction via PyMuPDF with chunking at sentence boundaries
46
+ - **Scanned PDFs** — renders pages as images → vision API OCR (or pytesseract fallback)
47
+ - **Mixed PDFs** — text pages via PyMuPDF + scanned pages via OCR, combined intelligently
48
+ - **Multiple AI providers:** OpenRouter, Anthropic, OpenAI, Ollama
49
+ - **Auto model failover** for MCQ generation
50
+ - **CLI & Python API**
51
+
52
+ ---
53
+
54
+ ## Quick Start
55
+
56
+ ### CLI
57
+
58
+ ```bash
59
+ # Single PDF
60
+ pdf2mcq --pdf-path textbook.pdf -n 10
61
+
62
+ # Multiple PDF URLs
63
+ pdf2mcq --pdf-url https://example.com/chapter1.pdf --pdf-url https://example.com/chapter2.pdf
64
+
65
+ # Scan a folder of PDFs
66
+ pdf2mcq --pdf-folder ./textbooks/
67
+
68
+ # Output as JSON
69
+ pdf2mcq --pdf-path notes.pdf -o questions.json --format json
70
+ ```
71
+
72
+ ### Python API
73
+
74
+ ```python
75
+ from pdf2mcq import PDFMCQGenerator
76
+
77
+ gen = PDFMCQGenerator(
78
+ api_key="sk-or-v1-...",
79
+ provider="openrouter",
80
+ mcq_model="google/gemini-2.5-flash-lite",
81
+ )
82
+
83
+ # From local PDF
84
+ mcq = gen.from_pdf_paths("textbook.pdf", n=5)
85
+ print(mcq.to_pretty_str())
86
+
87
+ # From URL
88
+ mcq = gen.from_pdf_urls("https://example.com/notes.pdf", n=3)
89
+ print(mcq.to_json())
90
+
91
+ # Multiple PDFs
92
+ mcq = gen.from_pdf_paths(["chapter1.pdf", "chapter2.pdf", "chapter3.pdf"])
93
+ ```
94
+
95
+ ### Custom Instructions
96
+
97
+ ```python
98
+ mcq = gen.from_pdf_paths(
99
+ "lecture-notes.pdf",
100
+ n=10,
101
+ difficulty_mix="50% easy, 50% hard",
102
+ focus_topics=["machine learning", "neural networks"],
103
+ custom_instructions="Focus on mathematical derivations",
104
+ )
105
+ ```
106
+
107
+ ### Auto Model Selection
108
+
109
+ ```python
110
+ gen = PDFMCQGenerator(
111
+ api_key="sk-or-v1-...",
112
+ mcq_model="auto",
113
+ mcq_model_list=[
114
+ "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
115
+ "google/gemma-4-31b-it:free",
116
+ ],
117
+ )
118
+ ```
119
+
120
+ ### Environment Variables
121
+
122
+ | Variable | Purpose |
123
+ |---|---|
124
+ | `OPENROUTER_API_KEY` | Default API key for OpenRouter |
125
+ | `ANTHROPIC_API_KEY` | API key for Anthropic |
126
+ | `OPENAI_API_KEY` | API key for OpenAI |
127
+ | `PDF2MCQ_MCQ_MODELS` | Comma-separated MCQ model priority list for `mcq_model="auto"` |
128
+ | `PDF2MCQ_OCR_MODELS` | Comma-separated OCR model priority list for scanned PDFs |
129
+
130
+ ---
131
+
132
+ ## Output Format
133
+
134
+ ```python
135
+ # Pretty-print
136
+ print(mcq.to_pretty_str())
137
+
138
+ # JSON
139
+ print(mcq.to_json())
140
+ # {
141
+ # "total_exam_time": 20,
142
+ # "questions": [
143
+ # {
144
+ # "question_html": "What is gradient descent?",
145
+ # "options": ["...", "...", "...", "..."],
146
+ # "answers": [0],
147
+ # "multi": false,
148
+ # "marks": 1.0,
149
+ # "negative_marks": 0.25,
150
+ # "difficulty": "easy",
151
+ # "explaination": "..."
152
+ # }
153
+ # ]
154
+ # }
155
+ ```
156
+
157
+ ---
158
+
159
+ ## Installation
160
+
161
+ ```bash
162
+ pip install pdf2mcq
163
+ ```
164
+
165
+ Requires **PyMuPDF** (fitz) — installed automatically as a dependency.
166
+
167
+ For the `pytesseract` OCR fallback on scanned PDFs, also install the [Tesseract](https://github.com/tesseract-ocr/tesseract) engine; `pytesseract` is only a wrapper around that binary.
168
+
169
+ ---
170
+
171
+ ## License
172
+
173
+ MIT
@@ -0,0 +1,140 @@
1
+ # pdf2mcq
2
+
3
+ Convert PDF files — text PDFs, scanned books, mixed documents — into high-quality MCQ questions using AI.
4
+
5
+ Built on top of **html2mcq**'s PDF pipeline, extracted as a standalone library focused purely on PDF-to-MCQ generation.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - **Smart PDF detection** — automatically detects text PDFs, scanned PDFs, and mixed documents
12
+ - **Text PDFs** — fast extraction via PyMuPDF with chunking at sentence boundaries
13
+ - **Scanned PDFs** — renders pages as images → vision API OCR (or pytesseract fallback)
14
+ - **Mixed PDFs** — text pages via PyMuPDF + scanned pages via OCR, combined intelligently
15
+ - **Multiple AI providers:** OpenRouter, Anthropic, OpenAI, Ollama
16
+ - **Auto model failover** for MCQ generation
17
+ - **CLI & Python API**
18
+
19
+ ---
20
+
21
+ ## Quick Start
22
+
23
+ ### CLI
24
+
25
+ ```bash
26
+ # Single PDF
27
+ pdf2mcq --pdf-path textbook.pdf -n 10
28
+
29
+ # Multiple PDF URLs
30
+ pdf2mcq --pdf-url https://example.com/chapter1.pdf --pdf-url https://example.com/chapter2.pdf
31
+
32
+ # Scan a folder of PDFs
33
+ pdf2mcq --pdf-folder ./textbooks/
34
+
35
+ # Output as JSON
36
+ pdf2mcq --pdf-path notes.pdf -o questions.json --format json
37
+ ```
38
+
39
+ ### Python API
40
+
41
+ ```python
42
+ from pdf2mcq import PDFMCQGenerator
43
+
44
+ gen = PDFMCQGenerator(
45
+ api_key="sk-or-v1-...",
46
+ provider="openrouter",
47
+ mcq_model="google/gemini-2.5-flash-lite",
48
+ )
49
+
50
+ # From local PDF
51
+ mcq = gen.from_pdf_paths("textbook.pdf", n=5)
52
+ print(mcq.to_pretty_str())
53
+
54
+ # From URL
55
+ mcq = gen.from_pdf_urls("https://example.com/notes.pdf", n=3)
56
+ print(mcq.to_json())
57
+
58
+ # Multiple PDFs
59
+ mcq = gen.from_pdf_paths(["chapter1.pdf", "chapter2.pdf", "chapter3.pdf"])
60
+ ```
61
+
62
+ ### Custom Instructions
63
+
64
+ ```python
65
+ mcq = gen.from_pdf_paths(
66
+ "lecture-notes.pdf",
67
+ n=10,
68
+ difficulty_mix="50% easy, 50% hard",
69
+ focus_topics=["machine learning", "neural networks"],
70
+ custom_instructions="Focus on mathematical derivations",
71
+ )
72
+ ```
73
+
74
+ ### Auto Model Selection
75
+
76
+ ```python
77
+ gen = PDFMCQGenerator(
78
+ api_key="sk-or-v1-...",
79
+ mcq_model="auto",
80
+ mcq_model_list=[
81
+ "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
82
+ "google/gemma-4-31b-it:free",
83
+ ],
84
+ )
85
+ ```
86
+
87
+ ### Environment Variables
88
+
89
+ | Variable | Purpose |
90
+ |---|---|
91
+ | `OPENROUTER_API_KEY` | Default API key for OpenRouter |
92
+ | `ANTHROPIC_API_KEY` | API key for Anthropic |
93
+ | `OPENAI_API_KEY` | API key for OpenAI |
94
+ | `PDF2MCQ_MCQ_MODELS` | Comma-separated MCQ model priority list for `mcq_model="auto"` |
95
+ | `PDF2MCQ_OCR_MODELS` | Comma-separated OCR model priority list for scanned PDFs |
96
+
97
+ ---
98
+
99
+ ## Output Format
100
+
101
+ ```python
102
+ # Pretty-print
103
+ print(mcq.to_pretty_str())
104
+
105
+ # JSON
106
+ print(mcq.to_json())
107
+ # {
108
+ # "total_exam_time": 20,
109
+ # "questions": [
110
+ # {
111
+ # "question_html": "What is gradient descent?",
112
+ # "options": ["...", "...", "...", "..."],
113
+ # "answers": [0],
114
+ # "multi": false,
115
+ # "marks": 1.0,
116
+ # "negative_marks": 0.25,
117
+ # "difficulty": "easy",
118
+ # "explaination": "..."
119
+ # }
120
+ # ]
121
+ # }
122
+ ```
123
+
124
+ ---
125
+
126
+ ## Installation
127
+
128
+ ```bash
129
+ pip install pdf2mcq
130
+ ```
131
+
132
+ Requires **PyMuPDF** (fitz) — installed automatically as a dependency.
133
+
134
+ For the `pytesseract` OCR fallback on scanned PDFs, also install the [Tesseract](https://github.com/tesseract-ocr/tesseract) engine; `pytesseract` is only a wrapper around that binary.
135
+
136
+ ---
137
+
138
+ ## License
139
+
140
+ MIT
@@ -0,0 +1,13 @@
1
+ from .generator import PDFMCQGenerator
2
+ from .pdf import PDFExtractor
3
+ from .models import MCQQuestion, MCQSet, ContentBlock
4
+
5
+ __version__ = "1.0.0"
6
+ __author__ = "pdf2mcq"
7
+ __all__ = [
8
+ "PDFMCQGenerator",
9
+ "PDFExtractor",
10
+ "MCQQuestion",
11
+ "MCQSet",
12
+ "ContentBlock",
13
+ ]
@@ -0,0 +1,187 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
+ _PDF_EXTENSION = ".pdf"
9
+
10
+
11
def _glob_files(folder: str, extensions: set) -> list:
    """Return the files directly inside *folder* whose suffix is in *extensions*.

    Matching is case-insensitive on the suffix, so ``.pdf``, ``.PDF`` and
    ``.Pdf`` are all found regardless of the filesystem's case sensitivity.
    Only regular files are returned; a sub-directory that happens to be named
    ``something.pdf`` is skipped. The search is not recursive.

    Args:
        folder: Directory to scan.
        extensions: Suffixes including the leading dot, e.g. ``{".pdf"}``.
            Entries are compared case-insensitively.

    Returns:
        Paths as strings, ordered by file name ignoring case.

    Exits the process with status 1 (after printing to stderr) when *folder*
    is not an existing directory.
    """
    root = Path(folder)
    if not root.is_dir():
        print(f"Error: folder not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Normalise once so callers may pass ".PDF" as well as ".pdf".
    wanted = {ext.lower() for ext in extensions}

    # A single directory pass replaces the per-case glob patterns, which
    # missed mixed-case suffixes on case-sensitive filesystems.
    matches = [
        entry
        for entry in root.iterdir()
        if entry.is_file() and entry.suffix.lower() in wanted
    ]

    # The exact name is the tie-breaker so names differing only by case
    # come back in a deterministic order.
    matches.sort(key=lambda p: (p.name.lower(), p.name))
    return [str(p) for p in matches]
27
+
28
+
29
def _get_api_key(args):
    """Resolve the API key for the selected provider.

    The explicit ``--api-key`` value wins. Otherwise the provider-specific
    environment variable is consulted. Surrounding whitespace is stripped
    from both sources, so a blank ``--api-key`` still falls back to the
    environment and a stray newline in an exported variable does not end up
    in the credential.

    Args:
        args: Parsed CLI namespace; ``api_key`` and ``provider`` are read.

    Returns:
        The key, or ``""`` when none is configured (always the case for
        ``ollama``, which has no associated environment variable here).
    """
    explicit = (args.api_key or "").strip()
    if explicit:
        return explicit

    env_var_by_provider = {
        "openrouter": "OPENROUTER_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "openai": "OPENAI_API_KEY",
        "ollama": "",  # no environment variable is consulted for ollama
    }
    env_var = env_var_by_provider.get(args.provider, "")
    if not env_var:
        return ""
    return os.environ.get(env_var, "").strip()
43
+
44
+
45
def _build_parser():
    """Create the argparse parser describing every pdf2mcq CLI option."""
    parser = argparse.ArgumentParser(
        prog="pdf2mcq",
        description="Convert PDF files (text, scanned, mixed) into MCQ questions using AI.",
    )

    parser.add_argument("--version", action="store_true", help="Show version and exit")

    input_group = parser.add_argument_group("Input sources (at least one required)")
    input_group.add_argument("--pdf-url", metavar="URL", action="append", default=[],
                             help="PDF URL (repeatable: --pdf-url url1 --pdf-url url2)")
    input_group.add_argument("--pdf-path", metavar="FILE", action="append", default=[],
                             help="Local PDF file path (repeatable)")
    input_group.add_argument("--pdf-folder", metavar="DIR", default="",
                             help="Scan folder for PDF files (.pdf)")

    gen_group = parser.add_argument_group("Generation options")
    gen_group.add_argument("-n", "--n", type=int, default=999,
                           help="Number of questions (default: 999 = as many as content supports)")
    gen_group.add_argument("--difficulty", default=None,
                           help='E.g. "30%% easy, 40%% medium, 30%% hard"')
    gen_group.add_argument("--topics", nargs="*", help="Focus topics")
    gen_group.add_argument("--instructions", "-i", default="",
                           help='Custom instructions e.g. "Make answers very close and confusing"')
    gen_group.add_argument("--batch-size", type=int, default=10,
                           help="Questions per API call (default: 10)")

    ai_group = parser.add_argument_group("AI provider")
    ai_group.add_argument("--provider", default="openrouter",
                          choices=["anthropic", "openai", "openrouter", "ollama"],
                          help="AI provider (default: openrouter). Use 'ollama' for local LLM.")
    ai_group.add_argument("--mcq-model", default="",
                          help="MCQ generation model (or 'auto' to try --mcq-models)")
    ai_group.add_argument("--mcq-models", default="",
                          help="Comma-separated priority model list for --mcq-model auto. "
                               "Runtime-reloadable via PDF2MCQ_MCQ_MODELS env var.")
    ai_group.add_argument("--api-key", default="",
                          help="API key. Falls back to OPENROUTER_API_KEY / ANTHROPIC_API_KEY / OPENAI_API_KEY env var.")
    ai_group.add_argument("--ollama-base-url", default="http://localhost:11434/v1",
                          help="Ollama API base URL (default: http://localhost:11434/v1). "
                               "Only used when --provider ollama.")

    pdf_group = parser.add_argument_group("PDF processing")
    pdf_group.add_argument("--pdf-backend", default="auto_detect",
                           choices=["auto_detect", "pymupdf", "image"],
                           help="PDF extraction backend (default: auto_detect)")
    pdf_group.add_argument("--scanned-max-pages", type=int, default=50,
                           help="Max pages to OCR for scanned PDFs (default: 50)")
    pdf_group.add_argument("--prompt-log-path", default="",
                           help="Dump prompts to file, or 'stdout' / '-' for terminal")

    out_group = parser.add_argument_group("Output")
    out_group.add_argument("--output", "-o", default="",
                           help="Output file (.json or .txt). Default: stdout")
    out_group.add_argument("--format", choices=["json", "pretty"], default="pretty",
                           help="Output format (default: pretty)")
    return parser


def _merge_mcq_sets(first, second):
    """Combine two generation results so neither source's questions are lost.

    Either argument may be ``None``, in which case the other is returned
    unchanged. Otherwise ``second``'s questions are appended to ``first``.

    NOTE(review): this assumes the result object exposes a mutable
    ``questions`` list and, optionally, numeric ``total_exam_time`` /
    ``total_questions`` attributes -- confirm against the MCQSet model. If
    the objects cannot be merged this way, a warning is printed and
    ``second`` is returned, which matches the previous overwrite behaviour.
    """
    if first is None:
        return second
    if second is None:
        return first

    # Read before mutating in case the attribute is derived from `questions`.
    first_time = getattr(first, "total_exam_time", None)
    second_time = getattr(second, "total_exam_time", None)

    try:
        first.questions.extend(list(second.questions))
    except (AttributeError, TypeError):
        print("Warning: could not merge results from multiple input sources; "
              "keeping only the most recent one.", file=sys.stderr)
        return second

    def _is_number(value):
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    if _is_number(first_time) and _is_number(second_time):
        try:
            first.total_exam_time = first_time + second_time
        except AttributeError:
            pass  # read-only (e.g. a computed property); leave it alone

    count = getattr(first, "total_questions", None)
    if isinstance(count, int) and count != len(first.questions):
        try:
            first.total_questions = len(first.questions)
        except AttributeError:
            pass  # read-only; nothing to refresh
    return first


def main():
    """Command-line entry point: parse arguments, generate MCQs, emit output.

    Reads options from ``sys.argv``. Questions are generated from every
    input source given (``--pdf-url`` and/or ``--pdf-path`` / ``--pdf-folder``)
    and the results are combined; ``-n`` is passed to each source separately.
    Output goes to ``--output`` when set, otherwise to stdout. Progress and
    errors are written to stderr.

    Exits with status 0 after ``--version``; with status 1 when no input is
    supplied, the package cannot be imported, configuration is rejected,
    generation raises, or no questions are produced.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.version:
        try:
            from pdf2mcq import __version__
        except ImportError:
            __version__ = "unknown"
        print(f"pdf2mcq v{__version__}")
        sys.exit(0)

    # Folder contents are treated exactly like explicitly listed paths.
    if args.pdf_folder:
        args.pdf_path.extend(_glob_files(args.pdf_folder, {_PDF_EXTENSION}))

    if not (args.pdf_url or args.pdf_path):
        parser.print_help()
        print("\nError: at least one input source is required "
              "(--pdf-url, --pdf-path, --pdf-folder)", file=sys.stderr)
        sys.exit(1)

    # Imported lazily so --version and usage errors work even when the
    # package's dependencies fail to import.
    try:
        from pdf2mcq import PDFMCQGenerator
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    api_key = _get_api_key(args)
    mcq_model_list = None
    if args.mcq_models:
        mcq_model_list = [m.strip() for m in args.mcq_models.split(",") if m.strip()]

    try:
        gen = PDFMCQGenerator(
            api_key=api_key or None,
            provider=args.provider,
            mcq_model=args.mcq_model,
            mcq_model_list=mcq_model_list,
            batch_size=args.batch_size,
            pdf_backend=args.pdf_backend,
            pdf_scanned_max_pages=args.scanned_max_pages,
            prompt_log_path=args.prompt_log_path or None,
            ollama_base_url=args.ollama_base_url,
        )
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        sys.exit(1)

    generation_options = {
        "n": args.n,
        "difficulty_mix": args.difficulty,
        "focus_topics": args.topics,
        "custom_instructions": args.instructions or None,
    }

    mcq_set = None

    # Top-level boundary: any failure during generation is reported and
    # turned into a non-zero exit instead of a traceback.
    try:
        if args.pdf_url:
            print(f"Fetching PDFs: {args.pdf_url}", file=sys.stderr)
            mcq_set = _merge_mcq_sets(
                mcq_set, gen.from_pdf_urls(args.pdf_url, **generation_options))
        if args.pdf_path:
            print(f"Reading PDFs: {args.pdf_path}", file=sys.stderr)
            # Merge rather than overwrite so URL-derived questions survive
            # when both kinds of input are supplied.
            mcq_set = _merge_mcq_sets(
                mcq_set, gen.from_pdf_paths(args.pdf_path, **generation_options))
    except Exception as e:
        print(f"Generation failed: {e}", file=sys.stderr)
        sys.exit(1)

    if mcq_set is None or not mcq_set.questions:
        print("No questions were generated.", file=sys.stderr)
        sys.exit(1)

    if args.format == "json":
        output = mcq_set.to_json()
    else:
        output = mcq_set.to_pretty_str()

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Saved {mcq_set.total_questions} questions to {args.output}", file=sys.stderr)
    else:
        print(output)
184
+
185
+
186
+ if __name__ == "__main__":
187
+ main()