pdf-transcriber 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_transcriber/__init__.py +6 -0
- pdf_transcriber/cli.py +291 -0
- pdf_transcriber/config.py +109 -0
- pdf_transcriber/core/__init__.py +21 -0
- pdf_transcriber/core/linter/__init__.py +5 -0
- pdf_transcriber/core/linter/engine.py +184 -0
- pdf_transcriber/core/linter/models.py +72 -0
- pdf_transcriber/core/linter/rules/__init__.py +55 -0
- pdf_transcriber/core/linter/rules/artifacts.py +1030 -0
- pdf_transcriber/core/linter/rules/markdown.py +191 -0
- pdf_transcriber/core/linter/rules/math.py +633 -0
- pdf_transcriber/core/metadata_parser.py +245 -0
- pdf_transcriber/core/pdf_processor.py +173 -0
- pdf_transcriber/core/state_manager.py +325 -0
- pdf_transcriber/core/transcription.py +476 -0
- pdf_transcriber/server.py +50 -0
- pdf_transcriber/skills/__init__.py +1 -0
- pdf_transcriber/skills/transcribe.md +48 -0
- pdf_transcriber/tools/__init__.py +4 -0
- pdf_transcriber/tools/lint.py +72 -0
- pdf_transcriber/tools/transcribe.py +333 -0
- pdf_transcriber-1.0.0.dist-info/METADATA +401 -0
- pdf_transcriber-1.0.0.dist-info/RECORD +26 -0
- pdf_transcriber-1.0.0.dist-info/WHEEL +4 -0
- pdf_transcriber-1.0.0.dist-info/entry_points.txt +3 -0
- pdf_transcriber-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Markdown structure linting rules."""
|
|
2
|
+
import re
|
|
3
|
+
from typing import Generator
|
|
4
|
+
|
|
5
|
+
from ..models import LintIssue, Severity, Fix
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def excessive_blank_lines(content: str) -> Generator[LintIssue, None, None]:
    """
    Report every run of four or more consecutive newline characters.

    Such a run is reported with a count of one less than its length.
    Each issue is an auto-fix that replaces the whole run with three
    newline characters.
    """
    for run in re.finditer(r'\n{4,}', content):
        newlines = run.group()
        # A run of N newline characters is reported as N - 1 blank lines.
        blank_count = len(newlines) - 1
        # 1-based number of the line on which the run starts.
        start_line = content.count('\n', 0, run.start()) + 1
        replacement = Fix(old=newlines, new="\n\n\n")  # Normalize to 2 blank lines

        yield LintIssue(
            rule="excessive_blank_lines",
            severity=Severity.AUTO_FIX,
            line=start_line,
            message=f"{blank_count} consecutive blank lines (max 2)",
            fix=replacement,
        )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def trailing_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Report each line that ends in whitespace.

    The content is split on newline characters and every line is compared
    with its right-stripped form. A line that differs yields an auto-fix
    issue whose fix replaces the line with the stripped text.
    """
    for number, text in enumerate(content.split('\n'), start=1):
        trimmed = text.rstrip()
        if trimmed == text:
            continue  # Nothing was stripped, so the line is clean.

        excess = len(text) - len(trimmed)
        yield LintIssue(
            rule="trailing_whitespace",
            severity=Severity.AUTO_FIX,
            line=number,
            message=f"Trailing whitespace ({excess} chars)",
            fix=Fix(old=text, new=trimmed),
        )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def sparse_table_row(content: str) -> Generator[LintIssue, None, None]:
    """
    Report table rows in which more than half of the cells are empty.

    A row is any line that both starts and ends with a pipe character.
    Rows with three or fewer cells are ignored. Issues are warnings with
    no fix attached, since a sparse row may call for restructuring the
    table by hand.
    """
    for row_match in re.finditer(r'^\|.*\|$', content, re.MULTILINE):
        # Discard the pieces that lie outside the outer pipes.
        cells = row_match.group().split('|')[1:-1]
        total = len(cells)
        if total <= 3:
            continue  # Small tables are fine

        blank = len([cell for cell in cells if not cell.strip()])
        fraction = blank / total
        if fraction <= 0.5:
            continue

        yield LintIssue(
            rule="sparse_table_row",
            severity=Severity.WARNING,
            line=content.count('\n', 0, row_match.start()) + 1,
            message=f"Table row is {blank}/{total} empty ({fraction:.0%})",
            fix=None,  # Needs manual review - might need table restructure
        )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def orphaned_list_marker(content: str) -> Generator[LintIssue, None, None]:
    """
    Flag list markers that have no content after them.

    A marker is ``-``, ``*``, ``+`` or digits followed by a dot. It may be
    indented and may be followed only by spaces or tabs up to the end of
    the line.

    Each issue is a warning whose fix deletes the marker line. The text to
    delete is sliced directly out of ``content``, so it always occurs there
    verbatim: it includes the line's terminating newline when there is one,
    and is just the marker line when the marker is the last line of content
    that does not end with a newline.
    """
    # Match: start of line, optional whitespace, list marker, only whitespace to EOL
    pattern = re.compile(r'^([ \t]*(?:[-*+]|\d+\.))[ \t]*$', re.MULTILINE)

    for match in pattern.finditer(content):
        line_num = content[:match.start()].count('\n') + 1
        marker = match.group(1).strip()

        # In MULTILINE mode `$` matches only just before a '\n' or at the
        # very end of the string, so the character at match.end() is either
        # the line's newline or absent. Slicing one past the end picks up
        # the newline when present and is a no-op at end of content.
        removed = content[match.start():match.end() + 1]

        yield LintIssue(
            rule="orphaned_list_marker",
            severity=Severity.WARNING,
            line=line_num,
            message=f"List marker '{marker}' with no content",
            fix=Fix(old=removed, new='')
        )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def leading_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Flag leading whitespace on lines (outside of fenced code blocks).

    Leading whitespace in transcribed papers is almost always an OCR artifact.
    Indentation inside fenced code blocks is preserved. Both backtick
    (```) and tilde (~~~) fences are recognized, and a block is closed only
    by the same fence style that opened it, so a fence of the other style
    inside an open block is treated as code.

    Fence lines themselves and whitespace-only lines are never flagged.
    Each issue is an auto-fix that replaces the line with its left-stripped
    text.
    """
    lines = content.split('\n')
    open_fence = None  # '```' or '~~~' while inside a fenced block, else None

    for i, line in enumerate(lines, 1):
        # Track fenced code blocks
        fence = line.strip()[:3]
        if fence in ('```', '~~~'):
            if open_fence is None:
                open_fence = fence
                continue
            if fence == open_fence:
                open_fence = None
                continue
            # Otherwise this is the other fence style inside an open block:
            # fall through so it is skipped as ordinary code below.

        if open_fence is not None:
            continue

        # Check for leading whitespace (spaces or tabs)
        if line and line[0] in ' \t':
            stripped = line.lstrip()

            # Skip if it's a blank line (all whitespace)
            if not stripped:
                continue

            leading_count = len(line) - len(stripped)

            yield LintIssue(
                rule="leading_whitespace",
                severity=Severity.AUTO_FIX,
                line=i,
                message=f"Leading whitespace ({leading_count} chars)",
                fix=Fix(old=line, new=stripped)
            )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def header_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Remove extra blank lines before headers.

    In transcribed papers, headers often have unnecessary blank lines
    before them from page breaks or section transitions. This normalizes
    to have exactly one blank line before headers.

    A header is one to six ``#`` characters, then at least one space or
    tab, then text on the same line. The separator is deliberately limited
    to spaces and tabs so a bare ``#`` line is never joined with text from
    a following line and reported as one multi-line header.
    """
    # Pattern: 2+ blank lines followed by a header line
    # Matches: \n\n\n# Header or \n\n\n## Subsection etc.
    pattern = re.compile(r'\n(\n{2,})(#{1,6}[ \t]+[^\n]+)')

    for match in pattern.finditer(content):
        header = match.group(2)
        line_num = content[:match.start()].count('\n') + 1

        yield LintIssue(
            rule="header_whitespace",
            severity=Severity.AUTO_FIX,
            line=line_num,
            message=f"Extra blank lines before header: '{header[:40]}...'",
            # Keep two newlines: one ends the previous line, one is the blank line.
            fix=Fix(old=match.group(), new=f'\n\n{header}')
        )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def long_line(content: str, max_length: int = 500) -> Generator[LintIssue, None, None]:
    """
    Report lines longer than ``max_length`` characters.

    The content is split on newline characters. Each over-long line yields
    a warning whose message includes the line length and a preview of the
    line, cut to its first 60 characters plus an ellipsis when the line is
    longer than that. No fix is attached.
    """
    overlong = (
        (number, text)
        for number, text in enumerate(content.split('\n'), start=1)
        if len(text) > max_length
    )

    for number, text in overlong:
        length = len(text)

        # Show a preview of the line start
        if length > 60:
            preview = text[:60] + "..."
        else:
            preview = text

        yield LintIssue(
            rule="long_line",
            severity=Severity.WARNING,
            line=number,
            message=f"Line is {length} chars (max {max_length}): {preview}",
            fix=None,  # Needs manual review
        )
|