pdf2mcq 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2mcq/__init__.py +13 -0
- pdf2mcq/cli.py +187 -0
- pdf2mcq/generator.py +546 -0
- pdf2mcq/image_ocr.py +159 -0
- pdf2mcq/models.py +100 -0
- pdf2mcq/pdf.py +554 -0
- pdf2mcq/prompts.py +124 -0
- pdf2mcq-1.0.0.dist-info/METADATA +173 -0
- pdf2mcq-1.0.0.dist-info/RECORD +13 -0
- pdf2mcq-1.0.0.dist-info/WHEEL +5 -0
- pdf2mcq-1.0.0.dist-info/entry_points.txt +2 -0
- pdf2mcq-1.0.0.dist-info/licenses/LICENSE +21 -0
- pdf2mcq-1.0.0.dist-info/top_level.txt +1 -0
pdf2mcq/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Public package interface for pdf2mcq.

Re-exports the generator, the PDF extractor and the data-model classes so
that they can be imported directly from ``pdf2mcq``.
"""
from .generator import PDFMCQGenerator
from .pdf import PDFExtractor
from .models import MCQQuestion, MCQSet, ContentBlock

# Package metadata; cli.py imports __version__ for its --version flag.
__version__ = "1.0.0"
__author__ = "pdf2mcq"

# Names exported by ``from pdf2mcq import *``.
__all__ = [
    "PDFMCQGenerator",
    "PDFExtractor",
    "MCQQuestion",
    "MCQSet",
    "ContentBlock",
]
|
pdf2mcq/cli.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
_PDF_EXTENSION = ".pdf"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _glob_files(folder: str, extensions: set) -> list:
|
|
12
|
+
folder = Path(folder)
|
|
13
|
+
if not folder.is_dir():
|
|
14
|
+
print(f"Error: folder not found: {folder}", file=sys.stderr)
|
|
15
|
+
sys.exit(1)
|
|
16
|
+
files = []
|
|
17
|
+
for ext in extensions:
|
|
18
|
+
files.extend(folder.glob(f"*{ext}"))
|
|
19
|
+
files.extend(folder.glob(f"*{ext.upper()}"))
|
|
20
|
+
seen = set()
|
|
21
|
+
unique = []
|
|
22
|
+
for f in sorted(files, key=lambda p: p.name.lower()):
|
|
23
|
+
if f.suffix.lower() in extensions and f.name not in seen:
|
|
24
|
+
seen.add(f.name)
|
|
25
|
+
unique.append(str(f))
|
|
26
|
+
return unique
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _get_api_key(args):
|
|
30
|
+
key = args.api_key or ""
|
|
31
|
+
if key:
|
|
32
|
+
return key
|
|
33
|
+
env_vars = {
|
|
34
|
+
"openrouter": "OPENROUTER_API_KEY",
|
|
35
|
+
"anthropic": "ANTHROPIC_API_KEY",
|
|
36
|
+
"openai": "OPENAI_API_KEY",
|
|
37
|
+
"ollama": "",
|
|
38
|
+
}
|
|
39
|
+
env_key = env_vars.get(args.provider, "")
|
|
40
|
+
if env_key:
|
|
41
|
+
return os.environ.get(env_key, "")
|
|
42
|
+
return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _build_parser():
    """Create the argument parser that describes every pdf2mcq CLI option."""
    parser = argparse.ArgumentParser(
        prog="pdf2mcq",
        description="Convert PDF files (text, scanned, mixed) into MCQ questions using AI.",
    )

    parser.add_argument("--version", action="store_true", help="Show version and exit")

    input_group = parser.add_argument_group("Input sources (at least one required)")
    input_group.add_argument("--pdf-url", metavar="URL", action="append", default=[],
                             help="PDF URL (repeatable: --pdf-url url1 --pdf-url url2)")
    input_group.add_argument("--pdf-path", metavar="FILE", action="append", default=[],
                             help="Local PDF file path (repeatable)")
    input_group.add_argument("--pdf-folder", metavar="DIR", default="",
                             help="Scan folder for PDF files (.pdf)")

    gen_group = parser.add_argument_group("Generation options")
    gen_group.add_argument("-n", "--n", type=int, default=999,
                           help="Number of questions (default: 999 = as many as content supports)")
    gen_group.add_argument("--difficulty", default=None,
                           help='E.g. "30%% easy, 40%% medium, 30%% hard"')
    gen_group.add_argument("--topics", nargs="*", help="Focus topics")
    gen_group.add_argument("--instructions", "-i", default="",
                           help='Custom instructions e.g. "Make answers very close and confusing"')
    gen_group.add_argument("--batch-size", type=int, default=10,
                           help="Questions per API call (default: 10)")

    ai_group = parser.add_argument_group("AI provider")
    ai_group.add_argument("--provider", default="openrouter",
                          choices=["anthropic", "openai", "openrouter", "ollama"],
                          help="AI provider (default: openrouter). Use 'ollama' for local LLM.")
    ai_group.add_argument("--mcq-model", default="",
                          help="MCQ generation model (or 'auto' to try --mcq-models)")
    ai_group.add_argument("--mcq-models", default="",
                          help="Comma-separated priority model list for --mcq-model auto. "
                               "Runtime-reloadable via PDF2MCQ_MCQ_MODELS env var.")
    ai_group.add_argument("--api-key", default="",
                          help="API key. Falls back to OPENROUTER_API_KEY / ANTHROPIC_API_KEY / OPENAI_API_KEY env var.")
    ai_group.add_argument("--ollama-base-url", default="http://localhost:11434/v1",
                          help="Ollama API base URL (default: http://localhost:11434/v1). "
                               "Only used when --provider ollama.")

    pdf_group = parser.add_argument_group("PDF processing")
    pdf_group.add_argument("--pdf-backend", default="auto_detect",
                           choices=["auto_detect", "pymupdf", "image"],
                           help="PDF extraction backend (default: auto_detect)")
    pdf_group.add_argument("--scanned-max-pages", type=int, default=50,
                           help="Max pages to OCR for scanned PDFs (default: 50)")
    pdf_group.add_argument("--prompt-log-path", default="",
                           help="Dump prompts to file, or 'stdout' / '-' for terminal")

    out_group = parser.add_argument_group("Output")
    out_group.add_argument("--output", "-o", default="",
                           help="Output file (.json or .txt). Default: stdout")
    out_group.add_argument("--format", choices=["json", "pretty"], default="pretty",
                           help="Output format (default: pretty)")
    return parser


def main(argv=None):
    """Run the pdf2mcq command-line interface.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what a plain ``main()`` call does.

    Raises:
        SystemExit: status 0 after ``--version``; status 1 when no input
            source is given, the package cannot be imported, the generator
            rejects its configuration, generation fails or yields no
            questions, or the output file cannot be written. argparse itself
            exits on invalid command-line syntax.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.version:
        try:
            from pdf2mcq import __version__
        except ImportError:
            __version__ = "unknown"
        print(f"pdf2mcq v{__version__}")
        sys.exit(0)

    # Work on a copy: when --pdf-path is absent, args.pdf_path IS the parser's
    # default list object, and extending it in place would mutate that default.
    pdf_paths = list(args.pdf_path)
    if args.pdf_folder:
        pdf_paths.extend(_glob_files(args.pdf_folder, {_PDF_EXTENSION}))

    if not (args.pdf_url or pdf_paths):
        parser.print_help()
        print("\nError: at least one input source is required "
              "(--pdf-url, --pdf-path, --pdf-folder)", file=sys.stderr)
        sys.exit(1)

    # Imported lazily so --version and argument errors work even when the
    # package's heavier dependencies fail to import.
    try:
        from pdf2mcq import PDFMCQGenerator
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    api_key = _get_api_key(args)
    mcq_model_list = None
    if args.mcq_models:
        mcq_model_list = [m.strip() for m in args.mcq_models.split(",") if m.strip()]

    try:
        gen = PDFMCQGenerator(
            api_key=api_key or None,
            provider=args.provider,
            mcq_model=args.mcq_model,
            mcq_model_list=mcq_model_list,
            batch_size=args.batch_size,
            pdf_backend=args.pdf_backend,
            pdf_scanned_max_pages=args.scanned_max_pages,
            prompt_log_path=args.prompt_log_path or None,
            ollama_base_url=args.ollama_base_url,
        )
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        sys.exit(1)

    n = args.n
    difficulty = args.difficulty
    topics = args.topics
    instructions = args.instructions or None

    # NOTE(review): when URL and local inputs are both supplied, the result of
    # the URL run is rebound below and never reaches the output. Merging the
    # two results needs the MCQSet API, which is not visible from this module
    # - confirm the intended behaviour and merge there if appropriate.
    if args.pdf_url and pdf_paths:
        print("Note: both URL and local PDF inputs were given; the output is "
              "taken from the local-file run.", file=sys.stderr)

    mcq_set = None

    try:
        if args.pdf_url:
            print(f"Fetching PDFs: {args.pdf_url}", file=sys.stderr)
            mcq_set = gen.from_pdf_urls(args.pdf_url, n=n, difficulty_mix=difficulty,
                                        focus_topics=topics, custom_instructions=instructions)
        if pdf_paths:
            print(f"Reading PDFs: {pdf_paths}", file=sys.stderr)
            mcq_set = gen.from_pdf_paths(pdf_paths, n=n, difficulty_mix=difficulty,
                                         focus_topics=topics, custom_instructions=instructions)
    except Exception as e:
        # Top-level boundary: report any generation failure as a CLI error.
        print(f"Generation failed: {e}", file=sys.stderr)
        sys.exit(1)

    if mcq_set is None or not mcq_set.questions:
        print("No questions were generated.", file=sys.stderr)
        sys.exit(1)

    if args.format == "json":
        output = mcq_set.to_json()
    else:
        output = mcq_set.to_pretty_str()

    if args.output:
        # An unwritable path must not end in a raw traceback after all the
        # generation work has already been done.
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
        except OSError as e:
            print(f"Error: could not write {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Saved {mcq_set.total_questions} questions to {args.output}", file=sys.stderr)
    else:
        print(output)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|