pdf-transcriber 1.0.0__py3-none-any.whl
- pdf_transcriber/__init__.py +6 -0
- pdf_transcriber/cli.py +291 -0
- pdf_transcriber/config.py +109 -0
- pdf_transcriber/core/__init__.py +21 -0
- pdf_transcriber/core/linter/__init__.py +5 -0
- pdf_transcriber/core/linter/engine.py +184 -0
- pdf_transcriber/core/linter/models.py +72 -0
- pdf_transcriber/core/linter/rules/__init__.py +55 -0
- pdf_transcriber/core/linter/rules/artifacts.py +1030 -0
- pdf_transcriber/core/linter/rules/markdown.py +191 -0
- pdf_transcriber/core/linter/rules/math.py +633 -0
- pdf_transcriber/core/metadata_parser.py +245 -0
- pdf_transcriber/core/pdf_processor.py +173 -0
- pdf_transcriber/core/state_manager.py +325 -0
- pdf_transcriber/core/transcription.py +476 -0
- pdf_transcriber/server.py +50 -0
- pdf_transcriber/skills/__init__.py +1 -0
- pdf_transcriber/skills/transcribe.md +48 -0
- pdf_transcriber/tools/__init__.py +4 -0
- pdf_transcriber/tools/lint.py +72 -0
- pdf_transcriber/tools/transcribe.py +333 -0
- pdf_transcriber-1.0.0.dist-info/METADATA +401 -0
- pdf_transcriber-1.0.0.dist-info/RECORD +26 -0
- pdf_transcriber-1.0.0.dist-info/WHEEL +4 -0
- pdf_transcriber-1.0.0.dist-info/entry_points.txt +3 -0
- pdf_transcriber-1.0.0.dist-info/licenses/LICENSE +21 -0
pdf_transcriber/core/state_manager.py
@@ -0,0 +1,325 @@
+"""Resume-capable state management for PDF transcription jobs."""
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from pathlib import Path
+import json
+import logging
+import shutil
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TranscriptionState:
+    """State for a transcription job."""
+
+    pdf_source: str
+    total_pages: int
+    completed_pages: list[int]
+    failed_pages: list[int]
+    output_format: str
+    quality: str
+    started_at: str
+    last_updated: str
+    version: int = 1
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary."""
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "TranscriptionState":
+        """Create from dictionary."""
+        return cls(**data)
+
+
+class StateManager:
+    """
+    Manages transcription progress and resume capability.
+
+    Creates a .pdf-progress/ directory with:
+    - state.json: Lightweight state (~500 bytes)
+    - page_NNN.md: Individual completed pages (for assembly)
+    """
+
+    def __init__(self, output_dir: Path, paper_name: str):
+        """
+        Initialize state manager.
+
+        Args:
+            output_dir: Base output directory
+            paper_name: Name of the paper (used for subdirectory)
+        """
+        self.output_dir = Path(output_dir)
+        self.paper_name = paper_name
+        self.progress_dir = self.output_dir / paper_name / ".pdf-progress"
+        self.state_file = self.progress_dir / "state.json"
+
+    def has_existing_job(self) -> bool:
+        """Check if a resumable job exists."""
+        return self.state_file.exists()
+
+    def load_state(self) -> TranscriptionState | None:
+        """
+        Load existing state for resume.
+
+        Returns:
+            TranscriptionState if exists, None otherwise
+        """
+        if not self.state_file.exists():
+            return None
+
+        try:
+            data = json.loads(self.state_file.read_text())
+            state = TranscriptionState.from_dict(data)
+            logger.info(
+                f"Loaded state: {len(state.completed_pages)}/{state.total_pages} pages complete"
+            )
+            return state
+        except (json.JSONDecodeError, TypeError, KeyError) as e:
+            logger.error(f"Corrupt state file: {e}")
+            return None
+
+    def create_job(
+        self,
+        pdf_source: str,
+        total_pages: int,
+        output_format: str,
+        quality: str
+    ) -> TranscriptionState:
+        """
+        Initialize new transcription job.
+
+        Args:
+            pdf_source: Path to source PDF
+            total_pages: Total number of pages
+            output_format: "markdown" or "latex"
+            quality: Quality preset name
+
+        Returns:
+            New TranscriptionState
+        """
+        self.progress_dir.mkdir(parents=True, exist_ok=True)
+
+        now = datetime.utcnow().isoformat() + "Z"
+        state = TranscriptionState(
+            pdf_source=pdf_source,
+            total_pages=total_pages,
+            completed_pages=[],
+            failed_pages=[],
+            output_format=output_format,
+            quality=quality,
+            started_at=now,
+            last_updated=now
+        )
+
+        self._save_state(state)
+        logger.info(
+            f"Created new job: {total_pages} pages, "
+            f"format={output_format}, quality={quality}"
+        )
+        return state
+
+    def mark_page_complete(self, page_num: int, content: str) -> None:
+        """
+        Save completed page and update state.
+
+        Args:
+            page_num: 1-indexed page number
+            content: Transcribed content for this page
+        """
+        state = self.load_state()
+        if state is None:
+            raise RuntimeError("No active job. Call create_job() first.")
+
+        # Don't add duplicates
+        if page_num not in state.completed_pages:
+            state.completed_pages.append(page_num)
+            state.completed_pages.sort()  # Keep sorted for assembly
+
+        state.last_updated = datetime.utcnow().isoformat() + "Z"
+
+        # Save page content to temp file
+        page_file = self.progress_dir / f"page_{page_num:03d}.md"
+        page_file.write_text(content, encoding="utf-8")
+
+        self._save_state(state)
+        logger.info(
+            f"Page {page_num} complete ({len(state.completed_pages)}/{state.total_pages})"
+        )
+
+    def mark_page_failed(self, page_num: int, error: str) -> None:
+        """
+        Record page failure for later retry.
+
+        Args:
+            page_num: 1-indexed page number
+            error: Error message
+        """
+        state = self.load_state()
+        if state is None:
+            raise RuntimeError("No active job.")
+
+        if page_num not in state.failed_pages:
+            state.failed_pages.append(page_num)
+
+        state.last_updated = datetime.utcnow().isoformat() + "Z"
+        self._save_state(state)
+
+        logger.warning(f"Page {page_num} failed: {error}")
+
+    def get_pending_pages(self) -> list[int]:
+        """
+        Get pages that still need processing.
+
+        Returns:
+            List of page numbers (1-indexed) that haven't been completed or failed
+        """
+        state = self.load_state()
+        if state is None:
+            return []
+
+        all_done = set(state.completed_pages) | set(state.failed_pages)
+        pending = [
+            page_num
+            for page_num in range(1, state.total_pages + 1)
+            if page_num not in all_done
+        ]
+
+        return pending
+
+    def get_failed_pages(self) -> list[int]:
+        """Get list of failed pages for retry."""
+        state = self.load_state()
+        if state is None:
+            return []
+        return state.failed_pages.copy()
+
+    def get_next_chunk(self, chunk_size: int) -> list[int]:
+        """
+        Get the next batch of pending pages for chunk-based processing.
+
+        Args:
+            chunk_size: Maximum number of pages to return (0 = all pending)
+
+        Returns:
+            List of 1-indexed page numbers to process next.
+            Returns empty list if no pages remaining.
+        """
+        pending = self.get_pending_pages()
+        if not pending:
+            return []
+
+        # Sort to ensure consistent ordering
+        pending.sort()
+
+        if chunk_size <= 0:
+            # No chunking - return all pending pages
+            return pending
+
+        # Return first chunk_size pages
+        return pending[:chunk_size]
+
+    def update_chunk_progress(self, last_page: int) -> None:
+        """
+        Update state timestamp after successful chunk completion.
+
+        This provides a checkpoint for crash recovery - we know all pages
+        up to and including those in the completed chunk are saved.
+
+        Args:
+            last_page: Last page number in the completed chunk
+        """
+        state = self.load_state()
+        if state is None:
+            return
+
+        state.last_updated = datetime.utcnow().isoformat() + "Z"
+        self._save_state(state)
+        logger.info(f"Chunk complete (through page {last_page})")
+
+    def assemble_output(self, include_page_markers: bool = True) -> str:
+        """
+        Combine all completed pages into final output.
+
+        Args:
+            include_page_markers: If True, add page number comments between pages
+
+        Returns:
+            Combined content from all completed pages
+        """
+        state = self.load_state()
+        if state is None or not state.completed_pages:
+            return ""
+
+        pages = []
+        for page_num in sorted(state.completed_pages):
+            page_file = self.progress_dir / f"page_{page_num:03d}.md"
+            if page_file.exists():
+                content = page_file.read_text(encoding="utf-8")
+
+                if include_page_markers:
+                    marker = (
+                        f"<!-- Page {page_num} -->"
+                        if state.output_format == "markdown"
+                        else f"% Page {page_num}"
+                    )
+                    pages.append(f"{marker}\n\n{content}")
+                else:
+                    pages.append(content)
+
+        separator = "\n\n---\n\n"
+        return separator.join(pages)
+
+    def get_progress_summary(self) -> dict:
+        """
+        Get summary of current progress.
+
+        Returns:
+            Dictionary with progress metrics
+        """
+        state = self.load_state()
+        if state is None:
+            return {
+                "active": False,
+                "completed": 0,
+                "failed": 0,
+                "pending": 0,
+                "total": 0,
+                "completion_percentage": 0.0
+            }
+
+        pending = self.get_pending_pages()
+        completed_count = len(state.completed_pages)
+
+        return {
+            "active": True,
+            "completed": completed_count,
+            "failed": len(state.failed_pages),
+            "pending": len(pending),
+            "total": state.total_pages,
+            "completion_percentage": (completed_count / state.total_pages * 100)
+            if state.total_pages > 0 else 0.0,
+            "started_at": state.started_at,
+            "last_updated": state.last_updated
+        }
+
+    def cleanup(self) -> None:
+        """Remove progress directory after successful completion."""
+        if self.progress_dir.exists():
+            try:
+                shutil.rmtree(self.progress_dir)
+                logger.info(f"Cleaned up progress directory: {self.progress_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to cleanup progress directory: {e}")
+
+    def _save_state(self, state: TranscriptionState) -> None:
+        """Save state to JSON file."""
+        try:
+            self.state_file.write_text(
+                json.dumps(state.to_dict(), indent=2),
+                encoding="utf-8"
+            )
+        except Exception as e:
+            logger.error(f"Failed to save state: {e}")
+            raise
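
Below is a minimal driver sketch showing how the StateManager above appears intended to be used for chunked, resume-capable transcription. It is not part of the packaged diff: the transcribe_page() helper, the paths, the page count, and the "high" quality preset name are hypothetical stand-ins; only StateManager and its methods come from state_manager.py as shipped.

# Hypothetical usage sketch -- not part of pdf-transcriber itself.
from pathlib import Path

from pdf_transcriber.core.state_manager import StateManager


def transcribe_page(page_num: int) -> str:
    # Hypothetical stand-in for the real per-page transcription call.
    return f"Transcribed content of page {page_num}\n"


manager = StateManager(output_dir=Path("out"), paper_name="example-paper")

# Resume an interrupted job if one exists; otherwise start fresh.
if not manager.has_existing_job():
    manager.create_job(
        pdf_source="example.pdf",
        total_pages=12,
        output_format="markdown",
        quality="high",  # assumed preset name
    )

# Process in chunks so a crash loses at most one chunk of work.
while chunk := manager.get_next_chunk(chunk_size=4):
    for page_num in chunk:
        try:
            manager.mark_page_complete(page_num, transcribe_page(page_num))
        except Exception as exc:
            manager.mark_page_failed(page_num, str(exc))
    manager.update_chunk_progress(last_page=chunk[-1])

print(manager.get_progress_summary())

# Assemble the final document and remove .pdf-progress/ once nothing
# is pending or failed.
if not manager.get_pending_pages() and not manager.get_failed_pages():
    output = manager.assemble_output()
    (Path("out") / "example-paper" / "example-paper.md").write_text(
        output, encoding="utf-8"
    )
    manager.cleanup()

Because mark_page_complete() persists both the page file and state.json on every call, a loop like this can be killed at any point and re-run with the same arguments; get_next_chunk() then skips pages already on disk, and failed pages are excluded from pending so the loop always terminates.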