pdf-transcriber 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ """Resume-capable state management for PDF transcription jobs."""
2
import json
import logging
import shutil
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ @dataclass
13
+ class TranscriptionState:
14
+ """State for a transcription job."""
15
+
16
+ pdf_source: str
17
+ total_pages: int
18
+ completed_pages: list[int]
19
+ failed_pages: list[int]
20
+ output_format: str
21
+ quality: str
22
+ started_at: str
23
+ last_updated: str
24
+ version: int = 1
25
+
26
+ def to_dict(self) -> dict:
27
+ """Convert to dictionary."""
28
+ return asdict(self)
29
+
30
+ @classmethod
31
+ def from_dict(cls, data: dict) -> "TranscriptionState":
32
+ """Create from dictionary."""
33
+ return cls(**data)
34
+
35
+
36
+ class StateManager:
37
+ """
38
+ Manages transcription progress and resume capability.
39
+
40
+ Creates a .pdf-progress/ directory with:
41
+ - state.json: Lightweight state (~500 bytes)
42
+ - page_NNN.md: Individual completed pages (for assembly)
43
+ """
44
+
45
+ def __init__(self, output_dir: Path, paper_name: str):
46
+ """
47
+ Initialize state manager.
48
+
49
+ Args:
50
+ output_dir: Base output directory
51
+ paper_name: Name of the paper (used for subdirectory)
52
+ """
53
+ self.output_dir = Path(output_dir)
54
+ self.paper_name = paper_name
55
+ self.progress_dir = self.output_dir / paper_name / ".pdf-progress"
56
+ self.state_file = self.progress_dir / "state.json"
57
+
58
+ def has_existing_job(self) -> bool:
59
+ """Check if a resumable job exists."""
60
+ return self.state_file.exists()
61
+
62
+ def load_state(self) -> TranscriptionState | None:
63
+ """
64
+ Load existing state for resume.
65
+
66
+ Returns:
67
+ TranscriptionState if exists, None otherwise
68
+ """
69
+ if not self.state_file.exists():
70
+ return None
71
+
72
+ try:
73
+ data = json.loads(self.state_file.read_text())
74
+ state = TranscriptionState.from_dict(data)
75
+ logger.info(
76
+ f"Loaded state: {len(state.completed_pages)}/{state.total_pages} pages complete"
77
+ )
78
+ return state
79
+ except (json.JSONDecodeError, TypeError, KeyError) as e:
80
+ logger.error(f"Corrupt state file: {e}")
81
+ return None
82
+
83
+ def create_job(
84
+ self,
85
+ pdf_source: str,
86
+ total_pages: int,
87
+ output_format: str,
88
+ quality: str
89
+ ) -> TranscriptionState:
90
+ """
91
+ Initialize new transcription job.
92
+
93
+ Args:
94
+ pdf_source: Path to source PDF
95
+ total_pages: Total number of pages
96
+ output_format: "markdown" or "latex"
97
+ quality: Quality preset name
98
+
99
+ Returns:
100
+ New TranscriptionState
101
+ """
102
+ self.progress_dir.mkdir(parents=True, exist_ok=True)
103
+
104
+ now = datetime.utcnow().isoformat() + "Z"
105
+ state = TranscriptionState(
106
+ pdf_source=pdf_source,
107
+ total_pages=total_pages,
108
+ completed_pages=[],
109
+ failed_pages=[],
110
+ output_format=output_format,
111
+ quality=quality,
112
+ started_at=now,
113
+ last_updated=now
114
+ )
115
+
116
+ self._save_state(state)
117
+ logger.info(
118
+ f"Created new job: {total_pages} pages, "
119
+ f"format={output_format}, quality={quality}"
120
+ )
121
+ return state
122
+
123
+ def mark_page_complete(self, page_num: int, content: str) -> None:
124
+ """
125
+ Save completed page and update state.
126
+
127
+ Args:
128
+ page_num: 1-indexed page number
129
+ content: Transcribed content for this page
130
+ """
131
+ state = self.load_state()
132
+ if state is None:
133
+ raise RuntimeError("No active job. Call create_job() first.")
134
+
135
+ # Don't add duplicates
136
+ if page_num not in state.completed_pages:
137
+ state.completed_pages.append(page_num)
138
+ state.completed_pages.sort() # Keep sorted for assembly
139
+
140
+ state.last_updated = datetime.utcnow().isoformat() + "Z"
141
+
142
+ # Save page content to temp file
143
+ page_file = self.progress_dir / f"page_{page_num:03d}.md"
144
+ page_file.write_text(content, encoding="utf-8")
145
+
146
+ self._save_state(state)
147
+ logger.info(
148
+ f"Page {page_num} complete ({len(state.completed_pages)}/{state.total_pages})"
149
+ )
150
+
151
+ def mark_page_failed(self, page_num: int, error: str) -> None:
152
+ """
153
+ Record page failure for later retry.
154
+
155
+ Args:
156
+ page_num: 1-indexed page number
157
+ error: Error message
158
+ """
159
+ state = self.load_state()
160
+ if state is None:
161
+ raise RuntimeError("No active job.")
162
+
163
+ if page_num not in state.failed_pages:
164
+ state.failed_pages.append(page_num)
165
+
166
+ state.last_updated = datetime.utcnow().isoformat() + "Z"
167
+ self._save_state(state)
168
+
169
+ logger.warning(f"Page {page_num} failed: {error}")
170
+
171
+ def get_pending_pages(self) -> list[int]:
172
+ """
173
+ Get pages that still need processing.
174
+
175
+ Returns:
176
+ List of page numbers (1-indexed) that haven't been completed or failed
177
+ """
178
+ state = self.load_state()
179
+ if state is None:
180
+ return []
181
+
182
+ all_done = set(state.completed_pages) | set(state.failed_pages)
183
+ pending = [
184
+ page_num
185
+ for page_num in range(1, state.total_pages + 1)
186
+ if page_num not in all_done
187
+ ]
188
+
189
+ return pending
190
+
191
+ def get_failed_pages(self) -> list[int]:
192
+ """Get list of failed pages for retry."""
193
+ state = self.load_state()
194
+ if state is None:
195
+ return []
196
+ return state.failed_pages.copy()
197
+
198
+ def get_next_chunk(self, chunk_size: int) -> list[int]:
199
+ """
200
+ Get the next batch of pending pages for chunk-based processing.
201
+
202
+ Args:
203
+ chunk_size: Maximum number of pages to return (0 = all pending)
204
+
205
+ Returns:
206
+ List of 1-indexed page numbers to process next.
207
+ Returns empty list if no pages remaining.
208
+ """
209
+ pending = self.get_pending_pages()
210
+ if not pending:
211
+ return []
212
+
213
+ # Sort to ensure consistent ordering
214
+ pending.sort()
215
+
216
+ if chunk_size <= 0:
217
+ # No chunking - return all pending pages
218
+ return pending
219
+
220
+ # Return first chunk_size pages
221
+ return pending[:chunk_size]
222
+
223
+ def update_chunk_progress(self, last_page: int) -> None:
224
+ """
225
+ Update state timestamp after successful chunk completion.
226
+
227
+ This provides a checkpoint for crash recovery - we know all pages
228
+ up to and including those in the completed chunk are saved.
229
+
230
+ Args:
231
+ last_page: Last page number in the completed chunk
232
+ """
233
+ state = self.load_state()
234
+ if state is None:
235
+ return
236
+
237
+ state.last_updated = datetime.utcnow().isoformat() + "Z"
238
+ self._save_state(state)
239
+ logger.info(f"Chunk complete (through page {last_page})")
240
+
241
+ def assemble_output(self, include_page_markers: bool = True) -> str:
242
+ """
243
+ Combine all completed pages into final output.
244
+
245
+ Args:
246
+ include_page_markers: If True, add page number comments between pages
247
+
248
+ Returns:
249
+ Combined content from all completed pages
250
+ """
251
+ state = self.load_state()
252
+ if state is None or not state.completed_pages:
253
+ return ""
254
+
255
+ pages = []
256
+ for page_num in sorted(state.completed_pages):
257
+ page_file = self.progress_dir / f"page_{page_num:03d}.md"
258
+ if page_file.exists():
259
+ content = page_file.read_text(encoding="utf-8")
260
+
261
+ if include_page_markers:
262
+ marker = (
263
+ f"<!-- Page {page_num} -->"
264
+ if state.output_format == "markdown"
265
+ else f"% Page {page_num}"
266
+ )
267
+ pages.append(f"{marker}\n\n{content}")
268
+ else:
269
+ pages.append(content)
270
+
271
+ separator = "\n\n---\n\n"
272
+ return separator.join(pages)
273
+
274
+ def get_progress_summary(self) -> dict:
275
+ """
276
+ Get summary of current progress.
277
+
278
+ Returns:
279
+ Dictionary with progress metrics
280
+ """
281
+ state = self.load_state()
282
+ if state is None:
283
+ return {
284
+ "active": False,
285
+ "completed": 0,
286
+ "failed": 0,
287
+ "pending": 0,
288
+ "total": 0,
289
+ "completion_percentage": 0.0
290
+ }
291
+
292
+ pending = self.get_pending_pages()
293
+ completed_count = len(state.completed_pages)
294
+
295
+ return {
296
+ "active": True,
297
+ "completed": completed_count,
298
+ "failed": len(state.failed_pages),
299
+ "pending": len(pending),
300
+ "total": state.total_pages,
301
+ "completion_percentage": (completed_count / state.total_pages * 100)
302
+ if state.total_pages > 0 else 0.0,
303
+ "started_at": state.started_at,
304
+ "last_updated": state.last_updated
305
+ }
306
+
307
+ def cleanup(self) -> None:
308
+ """Remove progress directory after successful completion."""
309
+ if self.progress_dir.exists():
310
+ try:
311
+ shutil.rmtree(self.progress_dir)
312
+ logger.info(f"Cleaned up progress directory: {self.progress_dir}")
313
+ except Exception as e:
314
+ logger.warning(f"Failed to cleanup progress directory: {e}")
315
+
316
+ def _save_state(self, state: TranscriptionState) -> None:
317
+ """Save state to JSON file."""
318
+ try:
319
+ self.state_file.write_text(
320
+ json.dumps(state.to_dict(), indent=2),
321
+ encoding="utf-8"
322
+ )
323
+ except Exception as e:
324
+ logger.error(f"Failed to save state: {e}")
325
+ raise