pdf-transcriber 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,191 @@
1
+ """Markdown structure linting rules."""
2
+ import re
3
+ from typing import Generator
4
+
5
+ from ..models import LintIssue, Severity, Fix
6
+
7
+
8
def excessive_blank_lines(content: str) -> Generator[LintIssue, None, None]:
    """
    Report every run of more than two consecutive blank lines.

    A run is detected as four or more newline characters in a row. The
    blank-line count shown in the message is the run length minus one.
    The attached fix replaces the whole run with three newlines, which
    leaves two blank lines.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One AUTO_FIX ``LintIssue`` per run, positioned on the 1-based line
        where the run of newlines starts.
    """
    for newline_run in re.finditer(r'\n{4,}', content):
        run_text = newline_run.group()
        num_blanks = len(run_text) - 1
        # Newlines seen before the run give a 0-based line index; +1 makes it 1-based.
        starts_on = content.count('\n', 0, newline_run.start()) + 1

        yield LintIssue(
            rule="excessive_blank_lines",
            severity=Severity.AUTO_FIX,
            line=starts_on,
            message=f"{num_blanks} consecutive blank lines (max 2)",
            fix=Fix(old=run_text, new="\n\n\n"),  # three newlines == two blank lines
        )
31
+
32
+
33
def trailing_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Report lines that end in whitespace.

    The text is split on ``'\\n'`` and each piece is compared with its
    ``rstrip()``-ed form. Anything ``rstrip()`` removes counts as trailing
    whitespace, including a ``'\\r'`` left behind by CRLF line endings.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One AUTO_FIX ``LintIssue`` per offending line (1-based), whose fix
        replaces the line with its right-stripped form.
    """
    for line_no, raw in enumerate(content.split('\n'), start=1):
        trimmed = raw.rstrip()
        if len(trimmed) == len(raw):
            continue  # nothing was stripped, so the line is clean

        trailing_count = len(raw) - len(trimmed)
        yield LintIssue(
            rule="trailing_whitespace",
            severity=Severity.AUTO_FIX,
            line=line_no,
            message=f"Trailing whitespace ({trailing_count} chars)",
            fix=Fix(old=raw, new=trimmed),
        )
53
+
54
+
55
def sparse_table_row(content: str) -> Generator[LintIssue, None, None]:
    """
    Report table rows in which more than half of the cells are empty.

    A row is any line that both starts and ends with ``|``. The text between
    the outer pipes is split on ``|``; a cell is empty when it contains only
    whitespace. Rows with three or fewer cells are never reported.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One WARNING ``LintIssue`` per sparse row (1-based line). No fix is
        attached because the table may need restructuring by hand.
    """
    for row_match in re.finditer(r'^\|.*\|$', content, re.MULTILINE):
        # Splitting yields an empty string on each side of the outer pipes; drop both.
        cells = row_match.group().split('|')[1:-1]
        if len(cells) <= 3:
            continue  # small tables are fine

        empty_cells = len([cell for cell in cells if not cell.strip()])
        empty_ratio = empty_cells / len(cells)
        if not empty_ratio > 0.5:
            continue

        row_line = content.count('\n', 0, row_match.start()) + 1
        yield LintIssue(
            rule="sparse_table_row",
            severity=Severity.WARNING,
            line=row_line,
            message=f"Table row is {empty_cells}/{len(cells)} empty ({empty_ratio:.0%})",
            fix=None,  # needs manual review - might need table restructure
        )
83
+
84
+
85
def orphaned_list_marker(content: str) -> Generator[LintIssue, None, None]:
    """
    Flag list markers that have no content after them.

    Often caused by transcription errors where list content ends up on the
    next line or is missing entirely. A marker is ``-``, ``*``, ``+`` or
    digits followed by ``.``, optionally indented with spaces/tabs, with
    nothing but spaces/tabs up to the end of the line.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One WARNING ``LintIssue`` per orphaned marker (1-based line). The fix
        deletes the marker line, including its line terminator when it has one.
    """
    # Match: start of line, optional whitespace, list marker, only whitespace to EOL
    pattern = re.compile(r'^([ \t]*(?:[-*+]|\d+\.))[ \t]*$', re.MULTILINE)

    for match in pattern.finditer(content):
        line_num = content[:match.start()].count('\n') + 1
        marker = match.group(1).strip()

        # Remove the line's newline along with it, but only if one is really
        # there: a marker on the final line of a file without a trailing
        # newline has none, and an `old` that is not a substring of `content`
        # could never be applied.
        old = match.group()
        if content.startswith('\n', match.end()):
            old += '\n'

        yield LintIssue(
            rule="orphaned_list_marker",
            severity=Severity.WARNING,
            line=line_num,
            message=f"List marker '{marker}' with no content",
            fix=Fix(old=old, new=''),
        )
106
+
107
+
108
def leading_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Report lines that begin with a space or tab, outside fenced code blocks.

    Any line whose stripped text starts with three backticks toggles the
    "inside a fence" state and is itself never reported. Lines inside a
    fence and whitespace-only lines are skipped.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One AUTO_FIX ``LintIssue`` per indented line (1-based), whose fix
        replaces the line with its left-stripped form.
    """
    inside_fence = False

    for line_no, raw in enumerate(content.split('\n'), start=1):
        # A fence delimiter flips the state and is not checked itself.
        if raw.strip().startswith('```'):
            inside_fence = not inside_fence
            continue

        if inside_fence or not raw.startswith((' ', '\t')):
            continue

        unindented = raw.lstrip()
        if not unindented:
            continue  # whitespace-only line, i.e. a blank line

        leading_count = len(raw) - len(unindented)
        yield LintIssue(
            rule="leading_whitespace",
            severity=Severity.AUTO_FIX,
            line=line_no,
            message=f"Leading whitespace ({leading_count} chars)",
            fix=Fix(old=raw, new=unindented),
        )
143
+
144
+
145
def header_whitespace(content: str) -> Generator[LintIssue, None, None]:
    """
    Remove extra blank lines before headers.

    In transcribed papers, headers often have unnecessary blank lines
    before them from page breaks or section transitions. This normalizes
    to have exactly one blank line before headers.

    Args:
        content: Full Markdown text to scan.

    Yields:
        One AUTO_FIX ``LintIssue`` per header preceded by two or more blank
        lines, positioned on the 1-based line where the newline run starts.
        The fix rewrites the match as two newlines followed by the header.
    """
    # Pattern: 2+ blank lines followed by a header line
    # Matches: \n\n\n# Header or \n\n\n## Subsection etc.
    # NOTE(review): `\s+` also matches newlines, so a bare '#' line followed
    # by text on the next line is treated as one header - confirm intended.
    pattern = re.compile(r'\n(\n{2,})(#{1,6}\s+[^\n]+)')

    for match in pattern.finditer(content):
        header = match.group(2)
        line_num = content[:match.start()].count('\n') + 1
        # Only add an ellipsis when the header really is cut off, so short
        # headers are not shown as if they had been truncated.
        preview = header[:40] + "..." if len(header) > 40 else header

        yield LintIssue(
            rule="header_whitespace",
            severity=Severity.AUTO_FIX,
            line=line_num,
            message=f"Extra blank lines before header: '{preview}'",
            fix=Fix(old=match.group(), new=f'\n\n{header}'),
        )
169
+
170
+
171
def long_line(content: str, max_length: int = 500) -> Generator[LintIssue, None, None]:
    """
    Report lines longer than ``max_length`` characters.

    Very long lines often indicate content that was not wrapped properly,
    or tables that did not parse correctly.

    Args:
        content: Full Markdown text to scan.
        max_length: Longest allowed line length; only strictly longer lines
            are reported.

    Yields:
        One WARNING ``LintIssue`` per over-long line (1-based). The message
        carries the first 60 characters as a preview. No fix is attached.
    """
    for line_no, line in enumerate(content.split('\n'), start=1):
        if len(line) <= max_length:
            continue

        # Preview the start of the line; add an ellipsis only when it is cut.
        if len(line) > 60:
            preview = line[:60] + "..."
        else:
            preview = line

        yield LintIssue(
            rule="long_line",
            severity=Severity.WARNING,
            line=line_no,
            message=f"Line is {len(line)} chars (max {max_length}): {preview}",
            fix=None,  # needs manual review
        )