pdf-transcriber 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,333 @@
1
+ """transcribe_pdf tool implementation."""
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ from pdf_transcriber.config import Config
6
+ from pdf_transcriber.core.pdf_processor import PDFProcessor
7
+ from pdf_transcriber.core.transcription import get_transcription_engine, clear_engine_cache
8
+ from pdf_transcriber.core.state_manager import StateManager
9
+ from pdf_transcriber.core.metadata_parser import (
10
+ create_initial_metadata,
11
+ generate_frontmatter
12
+ )
13
+ from pdf_transcriber.core.linter import engine as lint_engine
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def _failure(
    error: str,
    *,
    pages: int = 0,
    total: int = 0,
    partial: str | None = None,
    metadata: dict | None = None,
) -> dict:
    """Build the standard failure-response dict returned by transcribe_pdf.

    Args:
        error: Human-readable error message.
        pages: Number of pages successfully transcribed before failure.
        total: Total pages in the PDF (0 if unknown).
        partial: Partial transcription text, if any was produced.
        metadata: Metadata dict to echo back (defaults to empty dict).

    Returns:
        Dict matching the transcribe_pdf result schema with success=False.
    """
    return {
        "success": False,
        "output_path": None,
        "pages_transcribed": pages,
        "total_pages": total,
        "partial_content": partial,
        "error": error,
        "metadata": metadata if metadata is not None else {},
        "lint_results": None,
    }


def register(mcp, config: Config):
    """Register transcribe_pdf tool with MCP server."""

    @mcp.tool()
    async def transcribe_pdf(
        pdf_path: str,
        quality: str = "balanced",
        mode: str = "streaming",
        output_dir: str | None = None,
        resume: bool = True,
        metadata: dict | None = None,
        lint: bool = True,
        chunk_size: int | None = None
    ) -> dict:
        """
        Convert a PDF to Markdown using vision-based transcription.

        This tool uses Marker OCR with optional LLM enhancement to transcribe
        PDF pages to Markdown. It supports resume-on-failure, quality presets,
        and rich metadata.

        Args:
            pdf_path: Path to the PDF file to transcribe
            quality: Quality preset - "fast" (100 DPI), "balanced" (150 DPI, default), or "high-quality" (200 DPI)
            mode: Processing mode - "streaming" (page-by-page, default) or "batch" (concurrent)
            output_dir: Override default output directory (default: ./transcriptions)
            resume: If True, resume from previous progress if available (default: True)
            metadata: Optional metadata dict with fields: title, authors (list), year (int), journal, arxiv_id, doi, keywords (list)
            lint: If True (default), run linting with auto-fix after transcription. Original saved as {name}.original.md
            chunk_size: Pages per processing chunk (None = auto-detect based on PDF size, 0 = disable chunking)

        Returns:
            Dictionary with keys:
            - success (bool): Whether transcription succeeded
            - output_path (str | None): Path to final output file
            - pages_transcribed (int): Number of pages successfully transcribed
            - total_pages (int): Total pages in PDF
            - partial_content (str | None): Partial transcription if failed
            - error (str | None): Error message if failed
            - metadata (dict): Final metadata applied
            - lint_results (dict | None): Linting results if lint=True

        Example:
            {
                "pdf_path": "~/Downloads/paper.pdf",
                "quality": "balanced",
                "metadata": {
                    "title": "Introduction to Algebraic Geometry",
                    "authors": ["Hartshorne"],
                    "keywords": ["algebraic geometry", "sheaves"]
                }
            }
        """
        # Validate and expand paths
        pdf_path = Path(pdf_path).expanduser().resolve()
        if not pdf_path.exists():
            return _failure(f"PDF not found: {pdf_path}")

        if quality not in config.quality_presets:
            return _failure(
                f"Invalid quality: {quality}. "
                f"Must be one of {list(config.quality_presets.keys())}"
            )

        # Determine output location
        paper_name = pdf_path.stem
        out_dir = Path(output_dir).expanduser() if output_dir else config.output_dir
        paper_dir = out_dir / paper_name
        paper_dir.mkdir(parents=True, exist_ok=True)

        # Get DPI from quality preset
        dpi = config.get_dpi(quality)

        logger.info(
            f"Starting transcription: {pdf_path.name} "
            f"(quality={quality}/{dpi}dpi, mode={mode})"
        )

        # Initialize state manager
        state_mgr = StateManager(out_dir, paper_name)

        # Resume a previous job when requested and a valid saved state exists.
        # BUG FIX: previously, when load_state() returned a falsy value
        # (corrupt/unreadable state file), `state` stayed unset/None and the
        # code crashed later at `state.total_pages`. Now any failed load
        # falls through to fresh-job creation below.
        state = None
        if resume and state_mgr.has_existing_job():
            state = state_mgr.load_state()
            if state:
                logger.info(
                    f"Resuming job: {len(state.completed_pages)}/{state.total_pages} "
                    f"pages done"
                )

        if not state:
            # Start fresh (also reached when the saved state was unreadable).
            # Open the PDF once just to count pages; PDFProcessor is a
            # context manager, so the handle is released immediately.
            try:
                with PDFProcessor(str(pdf_path), dpi) as proc:
                    total_pages = proc.total_pages
            except Exception as e:
                return _failure(f"Failed to open PDF: {e}")

            state = state_mgr.create_job(
                str(pdf_path), total_pages, "markdown", quality
            )

        # Validate mode BEFORE loading the (heavy, ~2GB) transcription
        # engine; the returned dict is identical to the old late check.
        if mode not in ("streaming", "batch"):
            return _failure(
                f"Invalid mode: {mode}. Must be 'streaming' or 'batch'",
                total=state.total_pages,
            )

        # Get transcription engine (cached to avoid reloading models)
        engine = get_transcription_engine(
            use_gpu=config.use_gpu,
            batch_size=config.marker_batch_size,
            langs=config.marker_langs,
            # LLM-enhanced OCR settings
            use_llm=config.use_llm,
            llm_service=config.llm_service,
            ollama_base_url=config.ollama_base_url,
            ollama_model=config.ollama_model
        )

        # Determine actual chunk size (auto-chunking logic)
        if chunk_size is not None:
            # Explicit chunk_size: use it (0 = disable chunking)
            actual_chunk_size = chunk_size
        elif state.total_pages > config.auto_chunk_threshold:
            # Large PDF: auto-enable chunking with default size
            actual_chunk_size = config.chunk_size
            logger.info(
                f"Auto-chunking enabled: {state.total_pages} pages > "
                f"{config.auto_chunk_threshold} threshold (chunk_size={actual_chunk_size})"
            )
        else:
            # Small PDF: process all at once
            actual_chunk_size = 0

        # Transcribe
        try:
            with PDFProcessor(str(pdf_path), dpi) as proc:
                if mode == "streaming":
                    content = await engine.transcribe_streaming(
                        proc, "markdown", state_mgr,
                        chunk_size=actual_chunk_size
                    )
                else:  # mode == "batch" (validated above)
                    content = await engine.transcribe_batch(
                        proc, "markdown", state_mgr, config.max_concurrent_pages
                    )
        except Exception as e:
            # Return partial result on failure
            partial = state_mgr.assemble_output()
            summary = state_mgr.get_progress_summary()

            logger.error(f"Transcription failed: {e}")

            return _failure(
                f"Transcription failed: {e}",
                pages=summary["completed"],
                total=summary["total"],
                partial=partial if partial else None,
                metadata=metadata or {},
            )

        # Build metadata (caller-supplied fields win; title falls back to
        # the PDF file stem)
        meta_dict = metadata or {}
        paper_meta = create_initial_metadata(
            title=meta_dict.get("title", paper_name),
            pdf_source=pdf_path,
            total_pages=state.total_pages,
            output_format="markdown",
            quality=quality,
            authors=meta_dict.get("authors", []),
            year=meta_dict.get("year"),
            journal=meta_dict.get("journal"),
            arxiv_id=meta_dict.get("arxiv_id"),
            doi=meta_dict.get("doi"),
            keywords=meta_dict.get("keywords", [])
        )

        # Update transcribed_pages count
        summary = state_mgr.get_progress_summary()
        paper_meta.transcribed_pages = summary["completed"]

        # Write final output with frontmatter
        output_path = paper_dir / f"{paper_name}.md"

        try:
            final_content = generate_frontmatter(paper_meta) + "\n" + content
            output_path.write_text(final_content, encoding="utf-8")
        except Exception as e:
            return _failure(
                f"Failed to write output file: {e}",
                pages=summary["completed"],
                total=summary["total"],
                partial=content,
                metadata=paper_meta.to_dict(),
            )

        # Cleanup progress files on success
        if summary["completed"] == summary["total"]:
            state_mgr.cleanup()

        logger.info(
            f"Transcription complete: {output_path} "
            f"({summary['completed']}/{summary['total']} pages)"
        )

        # Run linting if enabled; lint failure never discards the saved file
        lint_results = None
        if lint:
            try:
                # Save original (non-linted) version for manual review
                original_path = paper_dir / f"{paper_name}.original.md"
                original_path.write_text(final_content, encoding="utf-8")
                logger.info(f"Saved original (pre-lint) to: {original_path}")

                # Run linter with auto-fix
                lint_report = await lint_engine.lint_file(output_path, fix=True)
                lint_results = {
                    "total_issues": lint_report.total_issues,
                    "auto_fixed": len(lint_report.fixed),
                    "warnings": lint_report.warnings,
                    "fixed_rules": lint_report.fixed,
                    "original_path": str(original_path)
                }

                logger.info(
                    f"Linting: {lint_report.total_issues} issues found, "
                    f"{len(lint_report.fixed)} auto-fixed. "
                    f"Original saved to {original_path.name}"
                )

            except Exception as e:
                logger.warning(f"Linting failed (file still saved): {e}")
                lint_results = {"error": str(e)}

        return {
            "success": True,
            "output_path": str(output_path),
            "pages_transcribed": summary["completed"],
            "total_pages": summary["total"],
            "partial_content": None,
            "error": None,
            "metadata": {
                "title": paper_meta.title,
                "authors": paper_meta.authors,
                "keywords": paper_meta.keywords,
                "year": paper_meta.year
            },
            "lint_results": lint_results
        }

    @mcp.tool()
    async def clear_transcription_cache() -> dict:
        """
        Clear the cached transcription engine to free memory.

        The transcription engine caches Marker OCR models (~2GB) to speed up
        sequential transcriptions. Call this tool when you're done transcribing
        to reclaim memory.

        Returns:
            Dictionary with:
            - cleared (int): Number of cached engines that were cleared
            - message (str): Status message
        """
        count = clear_engine_cache()

        if count > 0:
            message = f"Cleared {count} cached engine(s), freeing ~2GB memory"
            logger.info(message)
        else:
            message = "No cached engines to clear"

        return {
            "cleared": count,
            "message": message
        }