thinkpdf 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfbrain/__init__.py +22 -0
- pdfbrain/app_gui.py +530 -0
- pdfbrain/cache/__init__.py +5 -0
- pdfbrain/cache/cache_manager.py +252 -0
- pdfbrain/cli.py +255 -0
- pdfbrain/core/__init__.py +6 -0
- pdfbrain/core/converter.py +332 -0
- pdfbrain/core/equations.py +635 -0
- pdfbrain/core/extract.py +469 -0
- pdfbrain/core/extractor.py +272 -0
- pdfbrain/core/models.py +196 -0
- pdfbrain/core/pipeline.py +287 -0
- pdfbrain/core/render.py +574 -0
- pdfbrain/core/tables.py +871 -0
- pdfbrain/core/transform.py +604 -0
- pdfbrain/core/utils.py +229 -0
- pdfbrain/engine.py +392 -0
- pdfbrain/mcp_server.py +315 -0
- pdfbrain/utils/__init__.py +1 -0
- thinkpdf-1.0.1.dist-info/METADATA +138 -0
- thinkpdf-1.0.1.dist-info/RECORD +25 -0
- thinkpdf-1.0.1.dist-info/WHEEL +5 -0
- thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
- thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
- thinkpdf-1.0.1.dist-info/top_level.txt +1 -0
pdfbrain/core/tables.py
ADDED
|
@@ -0,0 +1,871 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""Table detection and extraction for pdfmd.
|
|
4
|
+
|
|
5
|
+
This module detects and extracts tabular data from PDF text blocks, supporting
|
|
6
|
+
three detection strategies:
|
|
7
|
+
|
|
8
|
+
1. **Bordered tables**: Tables with explicit | or ¦ delimiters (highest priority)
|
|
9
|
+
2. **ASCII tables**: Tables with whitespace-separated columns (most common)
|
|
10
|
+
3. **Vertical tables**: Multi-block tables where each block is a row
|
|
11
|
+
|
|
12
|
+
The module uses heuristic scoring to distinguish tables from prose, lists,
|
|
13
|
+
and code blocks, with careful attention to:
|
|
14
|
+
- Column alignment consistency
|
|
15
|
+
- Cell content types (numeric, tokens, sentences)
|
|
16
|
+
- Structural patterns and density
|
|
17
|
+
|
|
18
|
+
Detection results are returned as TableDetection objects containing the
|
|
19
|
+
extracted grid and metadata for rendering.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
from typing import Iterable, List, Optional, Tuple, Dict
|
|
24
|
+
import re
|
|
25
|
+
|
|
26
|
+
from .models import PageText, Block, Line
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Core dataclasses
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class TableDetection:
    """A table found on a page, together with its extracted cell grid.

    Attributes:
        block_index: Index of the table's first block in PageText.blocks
        grid: Extracted cells as a list of rows (each row a list of strings)
        score: Heuristic confidence assigned by the grid profiler
        n_blocks: How many consecutive blocks the table covers
        detection_type: Strategy that found it: "bordered", "ascii", or "vertical"
    """
    block_index: int
    grid: List[List[str]]
    score: float = 0.0
    n_blocks: int = 1
    detection_type: str = "ascii"  # "bordered", "ascii", or "vertical"

    @property
    def n_rows(self) -> int:
        """Row count of the extracted grid."""
        return len(self.grid)

    @property
    def n_cols(self) -> int:
        """Widest row length in the grid (0 for an empty grid)."""
        if not self.grid:
            return 0
        return max(len(row) for row in self.grid)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class GridProfile:
    """Statistical profile of a candidate table grid.

    Used to score and filter table candidates based on content characteristics.
    Produced by _profile_grid and consumed by _grid_passes_profile.
    """
    n_rows: int  # number of rows in the candidate grid
    n_cols: int  # widest row length across all rows
    non_empty_cells: int  # cells containing non-whitespace content
    short_token_cells: int  # cells holding one short token (see _cell_is_short_token)
    numeric_cells: int  # cells holding numeric data (see _cell_is_numeric)
    sentence_cells: int  # cells reading like full sentences (prose signal)
    avg_len: float  # mean length of the non-empty cells' stripped text
    max_len: int  # length of the longest non-empty cell
    header_rows: int  # assumed header row count (always 1 in _profile_grid)
    score: float  # aggregate heuristic score built by _profile_grid
    density: float = 0.0  # Fraction of non-empty cells
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Public API
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def detect_tables_on_page(page: PageText, debug: bool = False) -> List[TableDetection]:
    """Detect all tables on a single page using multiple strategies.

    Detection priority (highest to lowest):
    1. Bordered tables (explicit | delimiters)
    2. Vertical multi-block tables
    3. ASCII whitespace-separated tables

    Each strategy builds its own candidate map keyed by starting block
    index; candidates are then merged in priority order, skipping any
    candidate that would overlap blocks already claimed by a higher-priority
    detection.

    Args:
        page: PageText object containing blocks to analyze
        debug: If True, log detection statistics to stderr

    Returns:
        List of TableDetection objects, sorted by block_index
    """
    detections: List[TableDetection] = []

    # One candidate map per strategy, keyed by starting block index.
    bordered_candidates: Dict[int, TableDetection] = {}
    ascii_candidates: Dict[int, TableDetection] = {}
    vertical_candidates: Dict[int, TableDetection] = {}

    # Strategy 1: Bordered table detection (highest confidence)
    for idx, block in enumerate(page.blocks):
        if _block_is_obviously_non_table(block):
            continue

        grid = _detect_bordered_table(block)
        if grid is None:
            continue

        prof = _profile_grid(grid)
        if not _grid_passes_profile(prof):
            continue

        bordered_candidates[idx] = TableDetection(
            block_index=idx,
            grid=grid,
            score=prof.score + 2.0,  # Bonus for explicit structure
            n_blocks=1,
            detection_type="bordered",
        )

    # Strategy 2: ASCII / single-block detection (most common case)
    for idx, block in enumerate(page.blocks):
        if idx in bordered_candidates:  # Skip if already detected as bordered
            continue
        if _block_is_obviously_non_table(block):
            continue

        grid = _detect_ascii_table_in_block(block)
        if grid is None:
            continue

        prof = _profile_grid(grid)
        if not _grid_passes_profile(prof):
            continue

        ascii_candidates[idx] = TableDetection(
            block_index=idx,
            grid=grid,
            score=prof.score,
            n_blocks=1,
            detection_type="ascii",
        )

    # Strategy 3: Vertical multi-block table detection (strict)
    n_blocks = len(page.blocks)
    start = 0
    while start < n_blocks:
        if start in bordered_candidates or start in ascii_candidates:
            start += 1
            continue

        run = _detect_vertical_run(page, start)
        if run is None:
            start += 1
            continue

        start_idx, end_idx, grid = run
        prof = _profile_grid(grid)
        if not _grid_passes_profile(prof):
            # Skip past the rejected run so its blocks are not re-scanned.
            start = end_idx
            continue

        vertical_candidates[start_idx] = TableDetection(
            block_index=start_idx,
            grid=grid,
            score=prof.score,
            n_blocks=end_idx - start_idx,
            detection_type="vertical",
        )
        start = end_idx

    # Merge candidates with priority: bordered > vertical > ascii
    # Avoid overlapping detections
    used_blocks: set[int] = set()
    for idx in range(len(page.blocks)):
        cand: Optional[TableDetection] = None
        b = bordered_candidates.get(idx)
        v = vertical_candidates.get(idx)
        a = ascii_candidates.get(idx)

        # Priority order
        if b:
            cand = b
        elif v:
            cand = v
        elif a:
            cand = a

        if cand is None:
            continue

        # Check for conflicts with already-used blocks
        conflict = False
        for blk_idx in range(idx, idx + cand.n_blocks):
            if blk_idx in used_blocks:
                conflict = True
                break
        if conflict:
            continue

        # Mark blocks as used
        for blk_idx in range(idx, idx + cand.n_blocks):
            used_blocks.add(blk_idx)

        detections.append(cand)

    # Debug logging
    if debug:
        try:
            from .utils import log
            log(f"[tables] Page has {len(page.blocks)} blocks")
            log(f"[tables] Candidates: {len(bordered_candidates)} bordered, "
                f"{len(ascii_candidates)} ASCII, {len(vertical_candidates)} vertical")
            log(f"[tables] Final detections: {len(detections)}")
            for i, det in enumerate(detections):
                log(f"[tables] {i+1}. {det.detection_type}: {det.n_rows}x{det.n_cols}, "
                    f"score={det.score:.2f}, blocks={det.n_blocks}")
        except ImportError:
            pass  # utils not available, skip debug output

    return detections
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
# Line helpers
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _line_text(line: Line) -> str:
|
|
237
|
+
"""Join all span texts in a line."""
|
|
238
|
+
return "".join(sp.text for sp in line.spans)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _block_line_texts(block: Block) -> List[str]:
|
|
242
|
+
"""Extract non-empty line texts from a block."""
|
|
243
|
+
texts: List[str] = []
|
|
244
|
+
for ln in block.lines:
|
|
245
|
+
t = _line_text(ln)
|
|
246
|
+
if t.strip():
|
|
247
|
+
texts.append(t.rstrip("\n"))
|
|
248
|
+
return texts
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# ---------------------------------------------------------------------------
|
|
252
|
+
# Bordered table detection (Strategy 1)
|
|
253
|
+
# ---------------------------------------------------------------------------
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _detect_bordered_table(block: Block) -> Optional[List[List[str]]]:
    """Detect tables delimited with | or ¦ characters.

    Handles both Markdown-style tables (leading/trailing pipes plus
    separator rows such as ``|---|:---:|``) and bare pipe-separated text:

        | Name | Age | City |
        |------|-----|------|
        | Alice| 30  | NYC  |

        Name | Age | City
        Alice| 30  | NYC

    Returns:
        A rectangular grid of cell strings, or None if the block does not
        look like a bordered table.
    """
    texts = _block_line_texts(block)
    if len(texts) < 2:
        return None

    # Only lines that actually contain a delimiter can be table rows.
    candidates = [t for t in texts if '|' in t or '¦' in t]
    if len(candidates) < 2:
        return None

    # Require at least one line with two or more delimiters.
    delim_counts = [t.count('|') + t.count('¦') for t in candidates]
    if not delim_counts or max(delim_counts) < 2:
        return None

    rows = []
    for raw in candidates:
        # Normalize broken-bar to a plain pipe.
        normalized_line = raw.replace('¦', '|')

        # Markdown separator rows (|---|:---:|) carry no data.
        if re.match(r'^[\s|:\-]+$', normalized_line):
            continue

        cells = [c.strip() for c in normalized_line.split('|')]

        # Leading/trailing pipes produce empty edge cells; drop them.
        if cells and not cells[0]:
            cells = cells[1:]
        if cells and not cells[-1]:
            cells = cells[:-1]

        if cells and len(cells) >= 2:
            rows.append(cells)

    # Need at least 2 data rows for a valid table
    if len(rows) < 2:
        return None

    # Pad every row out to the widest row so the grid is rectangular.
    width = max(len(r) for r in rows)
    return [r + [''] * (width - len(r)) for r in rows]
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ---------------------------------------------------------------------------
|
|
322
|
+
# Vertical multi-block detection (Strategy 3)
|
|
323
|
+
# ---------------------------------------------------------------------------
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _block_can_start_vertical(block: Block) -> bool:
    """Decide whether *block* could be the first row of a vertical table.

    A vertical table stores one row per block, so the opening block must
    have a plausible number of lines (2-6), contain no list markers or
    code-like content, and — when very short — not be made of long prose
    lines.
    """
    texts = _block_line_texts(block)
    line_count = len(texts)
    if not 2 <= line_count <= 6:
        return False

    has_list_marker = any(_is_list_like_line(t) for t in texts)
    if has_list_marker or _is_code_like_block(texts):
        return False

    # A few lines that are each long reads like a paragraph, not a row.
    mean_len = sum(len(t.strip()) for t in texts) / line_count
    if line_count <= 3 and mean_len > 80:
        return False

    return True
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _detect_vertical_run(
    page: PageText, start_idx: int
) -> Optional[Tuple[int, int, List[List[str]]]]:
    """Detect a vertical multi-block table starting at start_idx.

    A vertical table spans several consecutive blocks, each block holding
    one table row whose lines are the row's cells. The run extends as long
    as each block has the same line count as the first and looks neither
    list-like nor code-like.

    Returns:
        Tuple of (start_idx, end_idx, grid) if valid, None otherwise.
        end_idx is exclusive (one past the last block in the table).
    """
    if start_idx >= len(page.blocks):
        return None

    first = page.blocks[start_idx]
    if not _block_can_start_vertical(first):
        return None

    # The first block fixes the expected column count for the run.
    first_texts = _block_line_texts(first)
    col_count = len(first_texts)
    if col_count < 2:
        return None

    blocks: List[Block] = [first]
    idx = start_idx + 1
    n_blocks = len(page.blocks)

    # Extend the run while successive blocks match the first block's shape.
    while idx < n_blocks:
        blk = page.blocks[idx]
        texts = _block_line_texts(blk)

        if len(texts) != col_count:
            break
        if any(_is_list_like_line(t) for t in texts):
            break
        if _is_code_like_block(texts):
            break

        blocks.append(blk)
        idx += 1

    # Need ≥3 blocks to avoid 2-block paragraph pairs
    if len(blocks) < 3:
        return None

    # Each block becomes one row; clamp/pad to col_count defensively.
    grid: List[List[str]] = []
    for blk in blocks:
        row = [t.strip() for t in _block_line_texts(blk)]
        if len(row) < col_count:
            row.extend('' for _ in range(col_count - len(row)))
        elif len(row) > col_count:
            row = row[:col_count]
        grid.append(row)

    if len(grid) < 2:
        return None

    return start_idx, idx, grid
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
# ---------------------------------------------------------------------------
|
|
408
|
+
# ASCII single-block detection (Strategy 2)
|
|
409
|
+
# ---------------------------------------------------------------------------
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
# Cell delimiters for ASCII tables: runs of 3+ spaces/tabs are tried first;
# the 2+ pattern is the relaxed fallback used when the conservative split
# yields a single cell (see _split_cells).
_CELL_SPLIT_RE_CONSERVATIVE = re.compile(r"[ \t]{3,}")
_CELL_SPLIT_RE_RELAXED = re.compile(r"[ \t]{2,}")
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _split_cells(text: str) -> List[str]:
|
|
417
|
+
"""Split text into cells based on whitespace.
|
|
418
|
+
|
|
419
|
+
Tries 3+ spaces first (conservative), falls back to 2+ spaces.
|
|
420
|
+
This helps distinguish tables from prose with occasional double spaces.
|
|
421
|
+
"""
|
|
422
|
+
s = text.rstrip()
|
|
423
|
+
if not s:
|
|
424
|
+
return [""]
|
|
425
|
+
|
|
426
|
+
# Try conservative split first
|
|
427
|
+
cells = _CELL_SPLIT_RE_CONSERVATIVE.split(s)
|
|
428
|
+
if len(cells) >= 2:
|
|
429
|
+
return cells
|
|
430
|
+
|
|
431
|
+
# Fall back to relaxed split
|
|
432
|
+
return _CELL_SPLIT_RE_RELAXED.split(s)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _block_is_obviously_non_table(block: Block) -> bool:
    """Cheap pre-filter: return True for blocks that cannot be tables.

    Rejects blocks that are too short, short prose without any
    multi-column line, mostly made of list items, or almost entirely
    bulleted.
    """
    texts = _block_line_texts(block)
    n = len(texts)
    if n < 2:
        return True

    # A couple of short lines with no column separators is plain prose.
    if n <= 3 and all(len(t.strip()) <= 40 for t in texts):
        has_columns = any(len(_split_cells(t)) >= 2 for t in texts)
        if not has_columns:
            return True

    # Mostly list-marker lines: treat as a list, not a table.
    list_count = sum(1 for t in texts if _is_list_like_line(t))
    if list_count >= max(2, int(0.8 * n)):
        return True

    # Nearly every line opening with a bullet is a strong list signal.
    bullets = set('•◦-*')
    bullet_count = sum(1 for t in texts if t.lstrip()[:1] in bullets)
    if bullet_count >= n * 0.9:
        return True

    return False
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _detect_ascii_table_in_block(block: Block) -> Optional[List[List[str]]]:
    """Detect whitespace-separated tables within a single block.

    Uses the most common column count as the target and normalizes rows
    to that width, merging overflow content into the last column.

    Returns:
        A grid of stripped cell strings, or None when the block does not
        look like an ASCII table.
    """
    texts = _block_line_texts(block)
    if len(texts) < 2:
        return None

    if _is_code_like_block(texts):
        return None

    # A line is a candidate row when it splits into at least 2 cells.
    split_lines: List[List[str]] = [_split_cells(t) for t in texts]
    is_row = [len(cells) >= 2 for cells in split_lines]
    if sum(is_row) < 2:
        return None

    # Find first and last valid table rows
    first_row = next(i for i, flag in enumerate(is_row) if flag)
    last_row = next(i for i in range(len(is_row) - 1, -1, -1) if is_row[i])

    # Keep only the span between the first and last row (inclusive);
    # interior non-row lines are still normalized below.
    core_lines = split_lines[first_row : last_row + 1]
    core_flags = is_row[first_row : last_row + 1]

    # Determine target column count (most common)
    row_counts = [len(cells) for cells, flag in zip(core_lines, core_flags) if flag]
    target_cols, freq = _most_common_int(row_counts)
    # Require a dominant column count (≥60% of rows, at least 2 rows).
    if target_cols < 2 or freq < max(2, int(0.6 * len(row_counts))):
        return None

    grid: List[List[str]] = []
    for cells in core_lines:
        if len(cells) < target_cols:
            # Pad short rows
            cells = cells + [''] * (target_cols - len(cells))
        elif len(cells) > target_cols:
            # Merge overflow into last column
            head = cells[: target_cols - 1]
            tail = ' '.join(cells[target_cols - 1 :]).strip()
            tail = _strip_repeated_row_tail(tail, head)
            cells = head + ([tail] if tail else [''])

        cleaned = [c.strip() for c in cells]
        if any(cleaned):  # Skip empty rows
            grid.append(cleaned)

    if len(grid) < 2:
        return None

    return grid
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def _strip_repeated_row_tail(tail: str, head_cells: List[str]) -> str:
|
|
521
|
+
"""Clean up repeated content in overflow cells.
|
|
522
|
+
|
|
523
|
+
Sometimes PDF extraction duplicates header text or repeats patterns
|
|
524
|
+
in the tail. This function attempts to detect and remove such artifacts.
|
|
525
|
+
"""
|
|
526
|
+
t = tail.strip()
|
|
527
|
+
if not t:
|
|
528
|
+
return ""
|
|
529
|
+
|
|
530
|
+
# Remove if tail starts with concatenated header text
|
|
531
|
+
joined_head = ' '.join(h.strip() for h in head_cells if h.strip())
|
|
532
|
+
if joined_head and t.startswith(joined_head):
|
|
533
|
+
rest = t[len(joined_head):].strip()
|
|
534
|
+
if not rest:
|
|
535
|
+
return ""
|
|
536
|
+
t = rest
|
|
537
|
+
|
|
538
|
+
# Detect repeated chunks (e.g., "data data data data")
|
|
539
|
+
parts = t.split()
|
|
540
|
+
if len(parts) >= 4:
|
|
541
|
+
chunk = ' '.join(parts[: len(parts) // 2])
|
|
542
|
+
if chunk and t.count(chunk) >= 3:
|
|
543
|
+
return "" # Likely a repetition artifact
|
|
544
|
+
|
|
545
|
+
return t
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
# ---------------------------------------------------------------------------
|
|
549
|
+
# Grid profiling and scoring
|
|
550
|
+
# ---------------------------------------------------------------------------
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
# Matches trailing sentence punctuation (possibly repeated), e.g. ".", "?!", "…".
_SENTENCE_END_RE = re.compile(r"[.!?…]+$")
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def _cell_is_short_token(text: str) -> bool:
|
|
557
|
+
"""Check if a cell contains a short token (identifier, number, code).
|
|
558
|
+
|
|
559
|
+
Short tokens are:
|
|
560
|
+
- ≤24 characters
|
|
561
|
+
- No internal spaces
|
|
562
|
+
- Alphanumeric or numeric with punctuation
|
|
563
|
+
"""
|
|
564
|
+
s = text.strip()
|
|
565
|
+
if not s:
|
|
566
|
+
return False
|
|
567
|
+
if len(s) > 24:
|
|
568
|
+
return False
|
|
569
|
+
if ' ' in s:
|
|
570
|
+
return False
|
|
571
|
+
|
|
572
|
+
s_clean = s.strip('()[]{}%$€£+-')
|
|
573
|
+
if not s_clean:
|
|
574
|
+
return False
|
|
575
|
+
|
|
576
|
+
# Pure digits or decimals
|
|
577
|
+
if s_clean.isdigit():
|
|
578
|
+
return True
|
|
579
|
+
if s_clean.replace('.', '', 1).isdigit():
|
|
580
|
+
return True
|
|
581
|
+
|
|
582
|
+
# Alphanumeric identifiers
|
|
583
|
+
if s_clean.isalnum():
|
|
584
|
+
return True
|
|
585
|
+
|
|
586
|
+
return False
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def _cell_is_numeric(text: str) -> bool:
|
|
590
|
+
"""Check if a cell contains numeric data (including percentages)."""
|
|
591
|
+
s = text.strip().replace(',', '')
|
|
592
|
+
if not s:
|
|
593
|
+
return False
|
|
594
|
+
|
|
595
|
+
# Handle percentages and decimals
|
|
596
|
+
s_clean = s.replace('.', '', 1).replace('%', '', 1)
|
|
597
|
+
if s_clean.isdigit():
|
|
598
|
+
return True
|
|
599
|
+
|
|
600
|
+
# Handle negative numbers
|
|
601
|
+
if s_clean.startswith('-') and s_clean[1:].replace('.', '', 1).isdigit():
|
|
602
|
+
return True
|
|
603
|
+
|
|
604
|
+
return False
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _cell_is_sentence(text: str) -> bool:
|
|
608
|
+
"""Check if a cell contains a complete sentence.
|
|
609
|
+
|
|
610
|
+
Sentences have:
|
|
611
|
+
- ≥5 words
|
|
612
|
+
- Sentence-ending punctuation
|
|
613
|
+
- Optional internal punctuation
|
|
614
|
+
"""
|
|
615
|
+
s = text.strip()
|
|
616
|
+
if not s:
|
|
617
|
+
return False
|
|
618
|
+
|
|
619
|
+
words = s.split()
|
|
620
|
+
if len(words) < 5:
|
|
621
|
+
return False
|
|
622
|
+
|
|
623
|
+
if not _SENTENCE_END_RE.search(s):
|
|
624
|
+
return False
|
|
625
|
+
|
|
626
|
+
# Presence of commas/semicolons strengthens sentence signal
|
|
627
|
+
if ',' in s or ';' in s:
|
|
628
|
+
return True
|
|
629
|
+
|
|
630
|
+
return True
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _profile_grid(grid: List[List[str]]) -> GridProfile:
    """Compute statistical profile and score for a candidate table grid.

    Scoring factors (positive):
    - More rows and columns
    - Higher ratio of short tokens and numeric cells
    - Consistent rectangular structure
    - Reasonable cell lengths and good density

    Scoring factors (negative):
    - High ratio of sentence-like cells (suggests prose)
    - Very long average cell length

    Fix: the cell loop used ``enumerate`` but never read the indices;
    it now iterates the rows/cells directly (behavior unchanged).
    """
    # Degenerate grids get an all-zero profile that cannot pass filtering.
    if not grid or len(grid) < 2:
        return GridProfile(0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0)

    n_rows = len(grid)
    n_cols = max(len(row) for row in grid)
    if n_cols < 2:
        return GridProfile(n_rows, n_cols, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0)

    non_empty = 0
    short_tokens = 0
    numeric = 0
    sentences = 0
    lengths: List[int] = []
    header_rows = 1  # Default assumption

    # Classify every non-empty cell by content type.
    for row in grid:
        for cell in row:
            s = cell.strip()
            if not s:
                continue
            non_empty += 1
            lengths.append(len(s))

            if _cell_is_short_token(s):
                short_tokens += 1
            if _cell_is_numeric(s):
                numeric += 1
            if _cell_is_sentence(s):
                sentences += 1

    avg_len = (sum(lengths) / len(lengths)) if lengths else 0.0
    max_len = max(lengths) if lengths else 0

    # Calculate cell density (fraction of cells with content).
    total_cells = n_rows * n_cols
    density = non_empty / total_cells if total_cells > 0 else 0.0

    # Build score
    score = 0.0

    # Base score from dimensions
    score += 1.0 * n_rows
    score += 0.8 * n_cols

    if non_empty > 0:
        # Reward tabular content types
        score += 3.0 * (short_tokens / non_empty)
        score += 2.0 * (numeric / non_empty)

        # Penalize sentence-heavy content (more nuanced)
        sentence_ratio = sentences / non_empty
        if sentence_ratio > 0.8:
            score -= 4.0 * sentence_ratio
        elif sentence_ratio > 0.4:
            score -= 2.0 * sentence_ratio

    # Penalize very long cells (suggests paragraphs)
    if avg_len > 120:
        score -= 5.0

    # Bonus for substantial tables
    if n_rows >= 4 and n_cols >= 3:
        score += 2.0

    # Bonus for consistent column structure
    col_lengths = [len(row) for row in grid]
    if len(set(col_lengths)) == 1:  # All rows same length
        score += 1.5

    # Bonus for good density
    if density >= 0.6:
        score += 1.0

    return GridProfile(
        n_rows=n_rows,
        n_cols=n_cols,
        non_empty_cells=non_empty,
        short_token_cells=short_tokens,
        numeric_cells=numeric,
        sentence_cells=sentences,
        avg_len=avg_len,
        max_len=max_len,
        header_rows=header_rows,
        score=score,
        density=density,
    )
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
def _grid_passes_profile(prof: GridProfile) -> bool:
|
|
737
|
+
"""Filter grids based on profile thresholds.
|
|
738
|
+
|
|
739
|
+
A grid passes if it:
|
|
740
|
+
- Has sufficient dimensions (≥2x2)
|
|
741
|
+
- Has non-empty content
|
|
742
|
+
- Has reasonable density (≥25%)
|
|
743
|
+
- Isn't too prose-heavy
|
|
744
|
+
- Has adequate structural signals
|
|
745
|
+
- Meets minimum score threshold
|
|
746
|
+
"""
|
|
747
|
+
if prof.n_rows < 2 or prof.n_cols < 2:
|
|
748
|
+
return False
|
|
749
|
+
|
|
750
|
+
if prof.non_empty_cells == 0:
|
|
751
|
+
return False
|
|
752
|
+
|
|
753
|
+
# Require minimum cell density
|
|
754
|
+
if prof.density < 0.25:
|
|
755
|
+
return False
|
|
756
|
+
|
|
757
|
+
# Sentence-heavy content check (more lenient with good structure)
|
|
758
|
+
if prof.sentence_cells >= 0.6 * prof.non_empty_cells:
|
|
759
|
+
# Allow if we have strong structural signals
|
|
760
|
+
has_structure = (
|
|
761
|
+
prof.numeric_cells > 0 or
|
|
762
|
+
prof.short_token_cells >= 0.1 * prof.non_empty_cells or
|
|
763
|
+
(prof.n_rows >= 3 and prof.n_cols >= 3)
|
|
764
|
+
)
|
|
765
|
+
if not has_structure:
|
|
766
|
+
return False
|
|
767
|
+
|
|
768
|
+
# Tables should have some tokens or numbers
|
|
769
|
+
if prof.short_token_cells < 0.15 * prof.non_empty_cells and prof.numeric_cells == 0:
|
|
770
|
+
# More lenient for larger tables
|
|
771
|
+
if prof.n_rows < 3 or prof.n_cols < 3:
|
|
772
|
+
return False
|
|
773
|
+
|
|
774
|
+
# Score threshold
|
|
775
|
+
if prof.score < 1.0:
|
|
776
|
+
return False
|
|
777
|
+
|
|
778
|
+
return True
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
# ---------------------------------------------------------------------------
|
|
782
|
+
# Misc heuristics
|
|
783
|
+
# ---------------------------------------------------------------------------
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def _is_list_like_line(text: str) -> bool:
|
|
787
|
+
"""Check if a line starts with a list marker.
|
|
788
|
+
|
|
789
|
+
Recognized markers:
|
|
790
|
+
- Bullets: -, •, ◦, ◦, *
|
|
791
|
+
- Numbered: 1. or 1)
|
|
792
|
+
- Lettered: A. or a)
|
|
793
|
+
"""
|
|
794
|
+
s = text.lstrip()
|
|
795
|
+
if not s:
|
|
796
|
+
return False
|
|
797
|
+
|
|
798
|
+
# Bullet markers
|
|
799
|
+
if s[0] in ('-', '•', '◦', '◦', '*') and (len(s) == 1 or s[1].isspace()):
|
|
800
|
+
return True
|
|
801
|
+
|
|
802
|
+
# Numbered or lettered lists
|
|
803
|
+
if re.match(r'^(\d+|[A-Za-z])(\.|\))\s+', s):
|
|
804
|
+
return True
|
|
805
|
+
|
|
806
|
+
return False
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
# Characters common in source code; used to estimate per-line symbol density.
_CODE_SYMBOLS = set('{}[]();<>/=*+-')
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def _is_code_like_block(lines: Iterable[str]) -> bool:
|
|
813
|
+
"""Check if a block looks like code rather than a table.
|
|
814
|
+
|
|
815
|
+
Code indicators:
|
|
816
|
+
- High density of programming symbols
|
|
817
|
+
- Keywords like def, class, for, while, if
|
|
818
|
+
- Type annotations (->)
|
|
819
|
+
"""
|
|
820
|
+
texts = [ln.strip() for ln in lines if ln.strip()]
|
|
821
|
+
if not texts:
|
|
822
|
+
return False
|
|
823
|
+
|
|
824
|
+
suspicious = 0
|
|
825
|
+
for t in texts:
|
|
826
|
+
lower = t.lower()
|
|
827
|
+
|
|
828
|
+
# Programming keywords
|
|
829
|
+
if lower.startswith(('def ', 'class ', 'for ', 'while ', 'if ')):
|
|
830
|
+
suspicious += 1
|
|
831
|
+
continue
|
|
832
|
+
|
|
833
|
+
# Type annotations
|
|
834
|
+
if ' -> ' in t:
|
|
835
|
+
suspicious += 1
|
|
836
|
+
continue
|
|
837
|
+
|
|
838
|
+
# Symbol density
|
|
839
|
+
non_space = [c for c in t if not c.isspace()]
|
|
840
|
+
if not non_space:
|
|
841
|
+
continue
|
|
842
|
+
|
|
843
|
+
code_ratio = sum(c in _CODE_SYMBOLS for c in non_space) / float(len(non_space))
|
|
844
|
+
if code_ratio >= 0.35:
|
|
845
|
+
suspicious += 1
|
|
846
|
+
|
|
847
|
+
return suspicious >= max(2, len(texts) // 2)
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
def _most_common_int(vals: List[int]) -> Tuple[int, int]:
|
|
851
|
+
"""Find the most common integer in a list.
|
|
852
|
+
|
|
853
|
+
Returns:
|
|
854
|
+
Tuple of (most_common_value, frequency)
|
|
855
|
+
"""
|
|
856
|
+
if not vals:
|
|
857
|
+
return 0, 0
|
|
858
|
+
|
|
859
|
+
counts: Dict[int, int] = {}
|
|
860
|
+
for v in vals:
|
|
861
|
+
counts[v] = counts.get(v, 0) + 1
|
|
862
|
+
|
|
863
|
+
best = max(counts, key=lambda x: counts[x])
|
|
864
|
+
return best, counts[best]
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
# Public API surface of this module.
__all__ = [
    "TableDetection",
    "GridProfile",
    "detect_tables_on_page",
]
|