pdf-transcriber 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_transcriber/__init__.py +6 -0
- pdf_transcriber/cli.py +291 -0
- pdf_transcriber/config.py +109 -0
- pdf_transcriber/core/__init__.py +21 -0
- pdf_transcriber/core/linter/__init__.py +5 -0
- pdf_transcriber/core/linter/engine.py +184 -0
- pdf_transcriber/core/linter/models.py +72 -0
- pdf_transcriber/core/linter/rules/__init__.py +55 -0
- pdf_transcriber/core/linter/rules/artifacts.py +1030 -0
- pdf_transcriber/core/linter/rules/markdown.py +191 -0
- pdf_transcriber/core/linter/rules/math.py +633 -0
- pdf_transcriber/core/metadata_parser.py +245 -0
- pdf_transcriber/core/pdf_processor.py +173 -0
- pdf_transcriber/core/state_manager.py +325 -0
- pdf_transcriber/core/transcription.py +476 -0
- pdf_transcriber/server.py +50 -0
- pdf_transcriber/skills/__init__.py +1 -0
- pdf_transcriber/skills/transcribe.md +48 -0
- pdf_transcriber/tools/__init__.py +4 -0
- pdf_transcriber/tools/lint.py +72 -0
- pdf_transcriber/tools/transcribe.py +333 -0
- pdf_transcriber-1.0.0.dist-info/METADATA +401 -0
- pdf_transcriber-1.0.0.dist-info/RECORD +26 -0
- pdf_transcriber-1.0.0.dist-info/WHEEL +4 -0
- pdf_transcriber-1.0.0.dist-info/entry_points.txt +3 -0
- pdf_transcriber-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""transcribe_pdf tool implementation."""
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from pdf_transcriber.config import Config
|
|
6
|
+
from pdf_transcriber.core.pdf_processor import PDFProcessor
|
|
7
|
+
from pdf_transcriber.core.transcription import get_transcription_engine, clear_engine_cache
|
|
8
|
+
from pdf_transcriber.core.state_manager import StateManager
|
|
9
|
+
from pdf_transcriber.core.metadata_parser import (
|
|
10
|
+
create_initial_metadata,
|
|
11
|
+
generate_frontmatter
|
|
12
|
+
)
|
|
13
|
+
from pdf_transcriber.core.linter import engine as lint_engine
|
|
14
|
+
|
|
15
|
+
# Module-level logger namespaced to this module (standard logging convention).
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def register(mcp, config: Config):
    """Register transcribe_pdf and clear_transcription_cache tools with MCP server.

    Args:
        mcp: MCP server instance providing the ``@mcp.tool()`` decorator.
        config: Application configuration (quality presets, output directory,
            GPU/LLM settings, chunking thresholds).
    """

    def _failure(
        error: str,
        *,
        pages_transcribed: int = 0,
        total_pages: int = 0,
        partial_content: str | None = None,
        metadata: dict | None = None,
    ) -> dict:
        # Single builder for the error payload so every failure path returns
        # the exact same dictionary shape (previously duplicated five times).
        return {
            "success": False,
            "output_path": None,
            "pages_transcribed": pages_transcribed,
            "total_pages": total_pages,
            "partial_content": partial_content,
            "error": error,
            "metadata": metadata if metadata is not None else {},
            "lint_results": None,
        }

    @mcp.tool()
    async def transcribe_pdf(
        pdf_path: str,
        quality: str = "balanced",
        mode: str = "streaming",
        output_dir: str | None = None,
        resume: bool = True,
        metadata: dict | None = None,
        lint: bool = True,
        chunk_size: int | None = None
    ) -> dict:
        """
        Convert a PDF to Markdown using vision-based transcription.

        This tool uses Marker OCR with optional LLM enhancement to transcribe
        PDF pages to Markdown. It supports resume-on-failure, quality presets,
        and rich metadata.

        Args:
            pdf_path: Path to the PDF file to transcribe
            quality: Quality preset - "fast" (100 DPI), "balanced" (150 DPI, default), or "high-quality" (200 DPI)
            mode: Processing mode - "streaming" (page-by-page, default) or "batch" (concurrent)
            output_dir: Override default output directory (default: ./transcriptions)
            resume: If True, resume from previous progress if available (default: True)
            metadata: Optional metadata dict with fields: title, authors (list), year (int), journal, arxiv_id, doi, keywords (list)
            lint: If True (default), run linting with auto-fix after transcription. Original saved as {name}.original.md
            chunk_size: Pages per processing chunk (None = auto-detect based on PDF size, 0 = disable chunking)

        Returns:
            Dictionary with keys:
            - success (bool): Whether transcription succeeded
            - output_path (str | None): Path to final output file
            - pages_transcribed (int): Number of pages successfully transcribed
            - total_pages (int): Total pages in PDF
            - partial_content (str | None): Partial transcription if failed
            - error (str | None): Error message if failed
            - metadata (dict): Final metadata applied
            - lint_results (dict | None): Linting results if lint=True

        Example:
            {
                "pdf_path": "~/Downloads/paper.pdf",
                "quality": "balanced",
                "metadata": {
                    "title": "Introduction to Algebraic Geometry",
                    "authors": ["Hartshorne"],
                    "keywords": ["algebraic geometry", "sheaves"]
                }
            }
        """
        # Validate and expand paths
        pdf_path = Path(pdf_path).expanduser().resolve()
        if not pdf_path.exists():
            return _failure(f"PDF not found: {pdf_path}")

        if quality not in config.quality_presets:
            return _failure(
                f"Invalid quality: {quality}. "
                f"Must be one of {list(config.quality_presets.keys())}"
            )

        # Validate mode up front: previously this check ran only after the
        # ~2GB OCR engine was loaded and the PDF reopened, wasting the work
        # on an obviously bad request.
        if mode not in ("streaming", "batch"):
            return _failure(f"Invalid mode: {mode}. Must be 'streaming' or 'batch'")

        # Determine output location
        paper_name = pdf_path.stem
        out_dir = Path(output_dir).expanduser() if output_dir else config.output_dir
        paper_dir = out_dir / paper_name
        paper_dir.mkdir(parents=True, exist_ok=True)

        # Get DPI from quality preset
        dpi = config.get_dpi(quality)

        logger.info(
            f"Starting transcription: {pdf_path.name} "
            f"(quality={quality}/{dpi}dpi, mode={mode})"
        )

        # Initialize state manager
        state_mgr = StateManager(out_dir, paper_name)

        # Check for existing job.
        # BUG FIX: previously, when a job existed but load_state() returned
        # None (corrupt/unreadable state file), `state` stayed None and the
        # code crashed later on `state.total_pages`. Now a failed load falls
        # through to a fresh start.
        state = None
        if resume and state_mgr.has_existing_job():
            state = state_mgr.load_state()
            if state:
                logger.info(
                    f"Resuming job: {len(state.completed_pages)}/{state.total_pages} "
                    f"pages done"
                )
            else:
                logger.warning(
                    "Existing progress state could not be loaded; starting fresh"
                )

        if state is None:
            # Start fresh
            try:
                with PDFProcessor(str(pdf_path), dpi) as proc:
                    total_pages = proc.total_pages
            except Exception as e:
                return _failure(f"Failed to open PDF: {e}")

            state = state_mgr.create_job(
                str(pdf_path), total_pages, "markdown", quality
            )

        # Get transcription engine (cached to avoid reloading models)
        engine = get_transcription_engine(
            use_gpu=config.use_gpu,
            batch_size=config.marker_batch_size,
            langs=config.marker_langs,
            # LLM-enhanced OCR settings
            use_llm=config.use_llm,
            llm_service=config.llm_service,
            ollama_base_url=config.ollama_base_url,
            ollama_model=config.ollama_model
        )

        # Determine actual chunk size (auto-chunking logic)
        if chunk_size is not None:
            # Explicit chunk_size: use it (0 = disable chunking)
            actual_chunk_size = chunk_size
        elif state.total_pages > config.auto_chunk_threshold:
            # Large PDF: auto-enable chunking with default size
            actual_chunk_size = config.chunk_size
            logger.info(
                f"Auto-chunking enabled: {state.total_pages} pages > "
                f"{config.auto_chunk_threshold} threshold (chunk_size={actual_chunk_size})"
            )
        else:
            # Small PDF: process all at once
            actual_chunk_size = 0

        # Transcribe
        try:
            with PDFProcessor(str(pdf_path), dpi) as proc:
                if mode == "streaming":
                    content = await engine.transcribe_streaming(
                        proc, "markdown", state_mgr,
                        chunk_size=actual_chunk_size
                    )
                else:
                    # mode == "batch" (validated above)
                    content = await engine.transcribe_batch(
                        proc, "markdown", state_mgr, config.max_concurrent_pages
                    )

        except Exception as e:
            # Return partial result on failure
            partial = state_mgr.assemble_output()
            summary = state_mgr.get_progress_summary()

            logger.error(f"Transcription failed: {e}")

            return _failure(
                f"Transcription failed: {e}",
                pages_transcribed=summary["completed"],
                total_pages=summary["total"],
                partial_content=partial if partial else None,
                metadata=metadata or {},
            )

        # Build metadata
        meta_dict = metadata or {}
        paper_title = meta_dict.get("title", paper_name)
        paper_authors = meta_dict.get("authors", [])
        paper_year = meta_dict.get("year")

        paper_meta = create_initial_metadata(
            title=paper_title,
            pdf_source=pdf_path,
            total_pages=state.total_pages,
            output_format="markdown",
            quality=quality,
            authors=paper_authors,
            year=paper_year,
            journal=meta_dict.get("journal"),
            arxiv_id=meta_dict.get("arxiv_id"),
            doi=meta_dict.get("doi"),
            keywords=meta_dict.get("keywords", [])
        )

        # Update transcribed_pages count
        summary = state_mgr.get_progress_summary()
        paper_meta.transcribed_pages = summary["completed"]

        # Write final output with frontmatter
        output_path = paper_dir / f"{paper_name}.md"

        try:
            final_content = generate_frontmatter(paper_meta) + "\n" + content
            output_path.write_text(final_content, encoding="utf-8")
        except Exception as e:
            return _failure(
                f"Failed to write output file: {e}",
                pages_transcribed=summary["completed"],
                total_pages=summary["total"],
                partial_content=content,
                metadata=paper_meta.to_dict(),
            )

        # Cleanup progress files on success
        if summary["completed"] == summary["total"]:
            state_mgr.cleanup()

        logger.info(
            f"Transcription complete: {output_path} "
            f"({summary['completed']}/{summary['total']} pages)"
        )

        # Run linting if enabled
        lint_results = None
        if lint:
            try:
                # Save original (non-linted) version for manual review
                original_path = paper_dir / f"{paper_name}.original.md"
                original_path.write_text(final_content, encoding="utf-8")
                logger.info(f"Saved original (pre-lint) to: {original_path}")

                # Run linter with auto-fix
                lint_report = await lint_engine.lint_file(output_path, fix=True)
                lint_results = {
                    "total_issues": lint_report.total_issues,
                    "auto_fixed": len(lint_report.fixed),
                    "warnings": lint_report.warnings,
                    "fixed_rules": lint_report.fixed,
                    "original_path": str(original_path)
                }

                logger.info(
                    f"Linting: {lint_report.total_issues} issues found, "
                    f"{len(lint_report.fixed)} auto-fixed. "
                    f"Original saved to {original_path.name}"
                )

            except Exception as e:
                # Deliberately best-effort: a lint failure must not discard a
                # successful transcription, so record the error and continue.
                logger.warning(f"Linting failed (file still saved): {e}")
                lint_results = {"error": str(e)}

        return {
            "success": True,
            "output_path": str(output_path),
            "pages_transcribed": summary["completed"],
            "total_pages": summary["total"],
            "partial_content": None,
            "error": None,
            "metadata": {
                "title": paper_meta.title,
                "authors": paper_meta.authors,
                "keywords": paper_meta.keywords,
                "year": paper_meta.year
            },
            "lint_results": lint_results
        }

    @mcp.tool()
    async def clear_transcription_cache() -> dict:
        """
        Clear the cached transcription engine to free memory.

        The transcription engine caches Marker OCR models (~2GB) to speed up
        sequential transcriptions. Call this tool when you're done transcribing
        to reclaim memory.

        Returns:
            Dictionary with:
            - cleared (int): Number of cached engines that were cleared
            - message (str): Status message
        """
        count = clear_engine_cache()

        if count > 0:
            message = f"Cleared {count} cached engine(s), freeing ~2GB memory"
            logger.info(message)
        else:
            message = "No cached engines to clear"

        return {
            "cleared": count,
            "message": message
        }
|