thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,871 @@
1
"""Table detection and extraction for pdfmd.

This module detects and extracts tabular data from PDF text blocks, supporting
three detection strategies:

1. **Bordered tables**: Tables with explicit | or ¦ delimiters (highest priority)
2. **ASCII tables**: Tables with whitespace-separated columns (most common)
3. **Vertical tables**: Multi-block tables where each block is a row

When candidates are merged, the priority is bordered > vertical > ASCII.

The module uses heuristic scoring to distinguish tables from prose, lists,
and code blocks, with careful attention to:
- Column alignment consistency
- Cell content types (numeric, tokens, sentences)
- Structural patterns and density

Detection results are returned as TableDetection objects containing the
extracted grid and metadata for rendering.
"""

# The docstring must be the first statement of the module to become
# ``__doc__``; only a docstring, comments and blank lines may precede a
# ``from __future__`` import, so the import goes right after it.
from __future__ import annotations
21
+
22
+ from dataclasses import dataclass
23
+ from typing import Iterable, List, Optional, Tuple, Dict
24
+ import re
25
+
26
+ from .models import PageText, Block, Line
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Core dataclasses
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
@dataclass
class TableDetection:
    """One table found on a page, together with its extracted cells.

    Attributes:
        block_index: Index in PageText.blocks of the table's first block.
        grid: Cell strings, one inner list per table row.
        score: Heuristic confidence assigned by the detector.
        n_blocks: How many consecutive blocks the table covers.
        detection_type: Strategy that produced the table: "bordered",
            "ascii", or "vertical".
    """

    block_index: int
    grid: List[List[str]]
    score: float = 0.0
    n_blocks: int = 1
    detection_type: str = "ascii"

    @property
    def n_rows(self) -> int:
        """Row count of ``grid``."""
        rows = self.grid
        return len(rows)

    @property
    def n_cols(self) -> int:
        """Width of the widest row, or 0 when the grid has no rows."""
        return max(map(len, self.grid), default=0)
60
+
61
+
62
@dataclass
class GridProfile:
    """Statistical profile of a candidate table grid.

    Used to score and filter table candidates based on content characteristics.
    Instances are produced by ``_profile_grid`` and consumed by
    ``_grid_passes_profile``.
    """
    n_rows: int  # Number of rows in the profiled grid
    n_cols: int  # Width of the widest row
    non_empty_cells: int  # Cells that contain non-whitespace text
    short_token_cells: int  # Non-empty cells accepted by _cell_is_short_token
    numeric_cells: int  # Non-empty cells accepted by _cell_is_numeric
    sentence_cells: int  # Non-empty cells accepted by _cell_is_sentence
    avg_len: float  # Mean stripped length of the non-empty cells
    max_len: int  # Longest stripped length among the non-empty cells
    header_rows: int  # Assumed header row count (_profile_grid always sets 1)
    score: float  # Heuristic table-likeness score; higher is more table-like
    density: float = 0.0  # Fraction of non-empty cells
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Public API
83
+ # ---------------------------------------------------------------------------
84
+
85
+
86
def detect_tables_on_page(page: PageText, debug: bool = False) -> List[TableDetection]:
    """Find every table on one page by combining three detection strategies.

    Candidates are gathered per strategy and then merged block by block.
    When several candidates start at the same block the preference is
    bordered, then vertical, then ASCII; a candidate is dropped when any
    block it spans was already claimed by an earlier detection.

    Args:
        page: PageText object whose ``blocks`` are analysed.
        debug: If True, report candidate and detection counts through
            ``.utils.log`` (silently skipped when that import fails).

    Returns:
        List of TableDetection objects in ascending ``block_index`` order.
    """
    blocks = page.blocks
    total = len(blocks)

    def single_block_pass(extract, kind, bonus, taken):
        """Run one single-block strategy over every block not in ``taken``."""
        found: Dict[int, TableDetection] = {}
        for pos, blk in enumerate(blocks):
            if pos in taken or _block_is_obviously_non_table(blk):
                continue
            cells = extract(blk)
            if cells is None:
                continue
            profile = _profile_grid(cells)
            if not _grid_passes_profile(profile):
                continue
            found[pos] = TableDetection(
                block_index=pos,
                grid=cells,
                score=profile.score + bonus if bonus else profile.score,
                n_blocks=1,
                detection_type=kind,
            )
        return found

    # Strategy 1: explicit delimiters earn a fixed bonus on top of the
    # profile score.
    bordered_candidates = single_block_pass(
        _detect_bordered_table, "bordered", 2.0, ()
    )

    # Strategy 2: whitespace-aligned columns, skipping blocks that already
    # qualified as bordered tables.
    ascii_candidates = single_block_pass(
        _detect_ascii_table_in_block, "ascii", 0.0, bordered_candidates
    )

    # Strategy 3: runs of consecutive blocks that each form one row.
    vertical_candidates: Dict[int, TableDetection] = {}
    cursor = 0
    while cursor < total:
        if cursor in bordered_candidates or cursor in ascii_candidates:
            cursor += 1
            continue

        run = _detect_vertical_run(page, cursor)
        if run is None:
            cursor += 1
            continue

        first, stop, cells = run
        profile = _profile_grid(cells)
        if _grid_passes_profile(profile):
            vertical_candidates[first] = TableDetection(
                block_index=first,
                grid=cells,
                score=profile.score,
                n_blocks=stop - first,
                detection_type="vertical",
            )
        # Whether or not the run qualified, resume scanning after it.
        cursor = stop

    # Merge, walking blocks in order so the result is sorted by block index.
    detections: List[TableDetection] = []
    claimed = set()
    for pos in range(total):
        chosen = (
            bordered_candidates.get(pos)
            or vertical_candidates.get(pos)
            or ascii_candidates.get(pos)
        )
        if chosen is None:
            continue

        span = range(pos, pos + chosen.n_blocks)
        if any(i in claimed for i in span):
            continue  # overlaps a table that was accepted earlier

        claimed.update(span)
        detections.append(chosen)

    if debug:
        try:
            from .utils import log
            log(f"[tables] Page has {len(page.blocks)} blocks")
            log(f"[tables] Candidates: {len(bordered_candidates)} bordered, "
                f"{len(ascii_candidates)} ASCII, {len(vertical_candidates)} vertical")
            log(f"[tables] Final detections: {len(detections)}")
            for number, det in enumerate(detections, 1):
                log(f"[tables] {number}. {det.detection_type}: {det.n_rows}x{det.n_cols}, "
                    f"score={det.score:.2f}, blocks={det.n_blocks}")
        except ImportError:
            pass  # utils not available, skip debug output

    return detections
229
+
230
+
231
+ # ---------------------------------------------------------------------------
232
+ # Line helpers
233
+ # ---------------------------------------------------------------------------
234
+
235
+
236
+ def _line_text(line: Line) -> str:
237
+ """Join all span texts in a line."""
238
+ return "".join(sp.text for sp in line.spans)
239
+
240
+
241
def _block_line_texts(block: Block) -> List[str]:
    """Return the text of each non-blank line in ``block``.

    Lines that are empty or whitespace-only are dropped; trailing newline
    characters are removed from the lines that are kept.
    """
    raw_lines = (_line_text(ln) for ln in block.lines)
    return [raw.rstrip("\n") for raw in raw_lines if raw.strip()]
249
+
250
+
251
+ # ---------------------------------------------------------------------------
252
+ # Bordered table detection (Strategy 1)
253
+ # ---------------------------------------------------------------------------
254
+
255
+
256
def _detect_bordered_table(block: Block) -> Optional[List[List[str]]]:
    """Extract a table whose columns are separated by | or ¦ characters.

    Examples:
        | Name | Age | City     |
        |------|-----|----------|
        | Alice| 30  | New York |

        Name | Age | City
        Alice| 30  | New York

    Only lines containing a delimiter are considered. Separator rows (made
    solely of whitespace, pipes, colons and dashes) are skipped, and short
    rows are padded with empty cells so the result is rectangular.

    Returns:
        Grid of stripped cell strings with at least two rows of at least
        two cells, or None if the block does not qualify.
    """
    texts = _block_line_texts(block)
    if len(texts) < 2:
        return None

    delimited = [t for t in texts if '|' in t or '¦' in t]
    if len(delimited) < 2:
        return None

    # At least one line must carry two delimiters.
    if max(t.count('|') + t.count('¦') for t in delimited) < 2:
        return None

    rows: List[List[str]] = []
    for raw in delimited:
        unified = raw.replace('¦', '|')

        # Separator lines like |---|---| or |:---|---:| carry no data.
        if re.match(r'^[\s|:\-]+$', unified):
            continue

        parts = [piece.strip() for piece in unified.split('|')]

        # A leading/trailing pipe yields an empty first/last piece.
        if parts and not parts[0]:
            parts = parts[1:]
        if parts and not parts[-1]:
            parts = parts[:-1]

        if len(parts) >= 2:
            rows.append(parts)

    if len(rows) < 2:
        return None

    width = max(map(len, rows))
    return [row + [''] * (width - len(row)) for row in rows]
319
+
320
+
321
+ # ---------------------------------------------------------------------------
322
+ # Vertical multi-block detection (Strategy 3)
323
+ # ---------------------------------------------------------------------------
324
+
325
+
326
def _block_can_start_vertical(block: Block) -> bool:
    """Decide whether ``block`` may serve as the first row of a vertical table.

    In a vertical table every row is its own block and each line of the
    block is one cell. A block qualifies when it has 2 to 6 non-blank
    lines, none of them list-like, is not code-like, and is not a short
    block (3 lines or fewer) whose lines average more than 80 characters.
    """
    texts = _block_line_texts(block)
    count = len(texts)
    if not 2 <= count <= 6:
        return False

    if any(map(_is_list_like_line, texts)) or _is_code_like_block(texts):
        return False

    mean_len = sum(len(t.strip()) for t in texts) / count
    looks_like_paragraph = count <= 3 and mean_len > 80
    return not looks_like_paragraph
347
+
348
+
349
def _detect_vertical_run(
    page: PageText, start_idx: int
) -> Optional[Tuple[int, int, List[List[str]]]]:
    """Detect a vertical multi-block table starting at start_idx.

    The block at ``start_idx`` supplies the column count (its number of
    non-blank lines). The run is extended over the following blocks for as
    long as each has exactly that many non-blank lines, contains no
    list-like line, and is not code-like.

    Args:
        page: Page whose ``blocks`` are scanned.
        start_idx: Index of the block that would be the first table row.

    Returns:
        Tuple of (start_idx, end_idx, grid) if a run of at least three
        blocks is found, None otherwise. end_idx is exclusive (one past the
        last block in the table); grid holds one row of stripped line texts
        per block.
    """
    all_blocks = page.blocks
    total = len(all_blocks)
    if start_idx >= total:
        return None

    first = all_blocks[start_idx]
    if not _block_can_start_vertical(first):
        return None

    first_texts = _block_line_texts(first)
    col_count = len(first_texts)
    if col_count < 2:
        return None

    # Rows are collected while scanning. Every accepted block has exactly
    # col_count lines, so no padding or truncation is needed afterwards.
    grid: List[List[str]] = [[t.strip() for t in first_texts]]
    end_idx = start_idx + 1

    while end_idx < total:
        texts = _block_line_texts(all_blocks[end_idx])

        if len(texts) != col_count:
            break
        if any(_is_list_like_line(t) for t in texts):
            break
        if _is_code_like_block(texts):
            break

        grid.append([t.strip() for t in texts])
        end_idx += 1

    # Need ≥3 blocks to avoid 2-block paragraph pairs
    if len(grid) < 3:
        return None

    return start_idx, end_idx, grid
405
+
406
+
407
+ # ---------------------------------------------------------------------------
408
+ # ASCII single-block detection (Strategy 2)
409
+ # ---------------------------------------------------------------------------
410
+
411
+
412
+ _CELL_SPLIT_RE_CONSERVATIVE = re.compile(r"[ \t]{3,}")
413
+ _CELL_SPLIT_RE_RELAXED = re.compile(r"[ \t]{2,}")
414
+
415
+
416
+ def _split_cells(text: str) -> List[str]:
417
+ """Split text into cells based on whitespace.
418
+
419
+ Tries 3+ spaces first (conservative), falls back to 2+ spaces.
420
+ This helps distinguish tables from prose with occasional double spaces.
421
+ """
422
+ s = text.rstrip()
423
+ if not s:
424
+ return [""]
425
+
426
+ # Try conservative split first
427
+ cells = _CELL_SPLIT_RE_CONSERVATIVE.split(s)
428
+ if len(cells) >= 2:
429
+ return cells
430
+
431
+ # Fall back to relaxed split
432
+ return _CELL_SPLIT_RE_RELAXED.split(s)
433
+
434
+
435
def _block_is_obviously_non_table(block: Block) -> bool:
    """Cheap pre-filter that rejects blocks which cannot be tables.

    A block is rejected when any of these hold:
    - it has fewer than two non-blank lines;
    - it has at most three lines, all of 40 characters or fewer, and none
      of them splits into two or more cells;
    - the number of list-like lines reaches max(2, 80% of the lines);
    - at least 90% of the lines begin with a bullet character.
    """
    texts = _block_line_texts(block)
    line_count = len(texts)
    if line_count < 2:
        return True

    is_short_block = line_count <= 3 and all(len(t.strip()) <= 40 for t in texts)
    if is_short_block and not any(len(_split_cells(t)) >= 2 for t in texts):
        return True

    list_lines = sum(1 for t in texts if _is_list_like_line(t))
    if list_lines >= max(2, int(0.8 * line_count)):
        return True

    # Unlike _is_list_like_line, this only looks at the first visible
    # character and does not require whitespace after it.
    bullet_chars = set('•◦◦-*')
    bulleted = sum(1 for t in texts if t.lstrip()[:1] in bullet_chars)
    return bulleted >= line_count * 0.9
465
+
466
+
467
def _detect_ascii_table_in_block(block: Block) -> Optional[List[List[str]]]:
    """Extract a whitespace-aligned table from a single block.

    Each line is split into cells; lines with two or more cells count as
    rows. The span from the first to the last row is kept, and the most
    frequent row width in that span becomes the target width. It must be at
    least 2 and occur at least max(2, 60% of the rows) times. Narrower
    lines are padded with empty cells; wider ones have their surplus cells
    merged into the last column.

    Returns:
        Grid of stripped cells with at least two non-empty rows, or None.
    """
    texts = _block_line_texts(block)
    if len(texts) < 2 or _is_code_like_block(texts):
        return None

    split_lines: List[List[str]] = [_split_cells(t) for t in texts]
    row_positions = [i for i, cells in enumerate(split_lines) if len(cells) >= 2]
    if len(row_positions) < 2:
        return None

    # Trim leading/trailing non-row lines; interior ones stay in the span.
    core_lines = split_lines[row_positions[0] : row_positions[-1] + 1]

    widths = [len(cells) for cells in core_lines if len(cells) >= 2]
    target_cols, freq = _most_common_int(widths)
    if target_cols < 2 or freq < max(2, int(0.6 * len(widths))):
        return None

    grid: List[List[str]] = []
    for cells in core_lines:
        width = len(cells)
        if width < target_cols:
            cells = cells + [''] * (target_cols - width)
        elif width > target_cols:
            head = cells[: target_cols - 1]
            overflow = ' '.join(cells[target_cols - 1 :]).strip()
            overflow = _strip_repeated_row_tail(overflow, head)
            cells = head + [overflow or '']

        cleaned = [c.strip() for c in cells]
        if any(cleaned):  # rows with no content at all are dropped
            grid.append(cleaned)

    return grid if len(grid) >= 2 else None
518
+
519
+
520
+ def _strip_repeated_row_tail(tail: str, head_cells: List[str]) -> str:
521
+ """Clean up repeated content in overflow cells.
522
+
523
+ Sometimes PDF extraction duplicates header text or repeats patterns
524
+ in the tail. This function attempts to detect and remove such artifacts.
525
+ """
526
+ t = tail.strip()
527
+ if not t:
528
+ return ""
529
+
530
+ # Remove if tail starts with concatenated header text
531
+ joined_head = ' '.join(h.strip() for h in head_cells if h.strip())
532
+ if joined_head and t.startswith(joined_head):
533
+ rest = t[len(joined_head):].strip()
534
+ if not rest:
535
+ return ""
536
+ t = rest
537
+
538
+ # Detect repeated chunks (e.g., "data data data data")
539
+ parts = t.split()
540
+ if len(parts) >= 4:
541
+ chunk = ' '.join(parts[: len(parts) // 2])
542
+ if chunk and t.count(chunk) >= 3:
543
+ return "" # Likely a repetition artifact
544
+
545
+ return t
546
+
547
+
548
+ # ---------------------------------------------------------------------------
549
+ # Grid profiling and scoring
550
+ # ---------------------------------------------------------------------------
551
+
552
+
553
+ _SENTENCE_END_RE = re.compile(r"[.!?…]+$")
554
+
555
+
556
+ def _cell_is_short_token(text: str) -> bool:
557
+ """Check if a cell contains a short token (identifier, number, code).
558
+
559
+ Short tokens are:
560
+ - ≤24 characters
561
+ - No internal spaces
562
+ - Alphanumeric or numeric with punctuation
563
+ """
564
+ s = text.strip()
565
+ if not s:
566
+ return False
567
+ if len(s) > 24:
568
+ return False
569
+ if ' ' in s:
570
+ return False
571
+
572
+ s_clean = s.strip('()[]{}%$€£+-')
573
+ if not s_clean:
574
+ return False
575
+
576
+ # Pure digits or decimals
577
+ if s_clean.isdigit():
578
+ return True
579
+ if s_clean.replace('.', '', 1).isdigit():
580
+ return True
581
+
582
+ # Alphanumeric identifiers
583
+ if s_clean.isalnum():
584
+ return True
585
+
586
+ return False
587
+
588
+
589
+ def _cell_is_numeric(text: str) -> bool:
590
+ """Check if a cell contains numeric data (including percentages)."""
591
+ s = text.strip().replace(',', '')
592
+ if not s:
593
+ return False
594
+
595
+ # Handle percentages and decimals
596
+ s_clean = s.replace('.', '', 1).replace('%', '', 1)
597
+ if s_clean.isdigit():
598
+ return True
599
+
600
+ # Handle negative numbers
601
+ if s_clean.startswith('-') and s_clean[1:].replace('.', '', 1).isdigit():
602
+ return True
603
+
604
+ return False
605
+
606
+
607
def _cell_is_sentence(text: str) -> bool:
    """Check if a cell contains a complete sentence.

    A cell counts as a sentence when, after trimming whitespace, it has
    at least five whitespace-separated words and ends with
    sentence-ending punctuation (as matched by ``_SENTENCE_END_RE``).
    Internal commas or semicolons do not affect the result.

    Args:
        text: Raw cell text.

    Returns:
        True if the cell looks like a full sentence, False otherwise.
    """
    s = text.strip()
    if not s:
        return False

    if len(s.split()) < 5:
        return False

    return _SENTENCE_END_RE.search(s) is not None
631
+
632
+
633
def _profile_grid(grid: List[List[str]]) -> GridProfile:
    """Measure a candidate grid and derive its table-likeness score.

    The score grows with:
    - the number of rows (x1.0) and columns (x0.8);
    - the share of short-token cells (x3.0) and numeric cells (x2.0);
    - a grid of at least 4 rows by 3 columns (+2.0);
    - all rows having the same length (+1.5);
    - a density of 0.6 or more (+1.0).

    The score shrinks with:
    - a sentence share above 0.4 (x2.0) or above 0.8 (x4.0);
    - an average cell length above 120 characters (-5.0).

    Grids with fewer than two rows, or whose widest row has fewer than two
    cells, get an all-zero profile (apart from the dimensions in the
    latter case).
    """
    if not grid or len(grid) < 2:
        return GridProfile(0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0)

    row_total = len(grid)
    col_total = max(map(len, grid))
    if col_total < 2:
        return GridProfile(row_total, col_total, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0)

    # Only cells with visible content take part in the statistics.
    stripped_cells = (cell.strip() for row in grid for cell in row)
    filled = [s for s in stripped_cells if s]
    filled_count = len(filled)

    token_count = sum(1 for s in filled if _cell_is_short_token(s))
    number_count = sum(1 for s in filled if _cell_is_numeric(s))
    sentence_count = sum(1 for s in filled if _cell_is_sentence(s))

    cell_lengths = [len(s) for s in filled]
    mean_len = (sum(cell_lengths) / len(cell_lengths)) if cell_lengths else 0.0
    longest = max(cell_lengths, default=0)

    capacity = row_total * col_total
    fill_ratio = filled_count / capacity if capacity > 0 else 0.0

    # Dimensions form the base of the score.
    score = 0.0
    score += 1.0 * row_total
    score += 0.8 * col_total

    if filled_count > 0:
        score += 3.0 * (token_count / filled_count)
        score += 2.0 * (number_count / filled_count)

        prose_share = sentence_count / filled_count
        if prose_share > 0.8:
            score -= 4.0 * prose_share
        elif prose_share > 0.4:
            score -= 2.0 * prose_share

    if mean_len > 120:  # paragraph-sized cells
        score -= 5.0

    if row_total >= 4 and col_total >= 3:
        score += 2.0

    if len({len(row) for row in grid}) == 1:  # perfectly rectangular
        score += 1.5

    if fill_ratio >= 0.6:
        score += 1.0

    return GridProfile(
        n_rows=row_total,
        n_cols=col_total,
        non_empty_cells=filled_count,
        short_token_cells=token_count,
        numeric_cells=number_count,
        sentence_cells=sentence_count,
        avg_len=mean_len,
        max_len=longest,
        header_rows=1,  # a single header row is always assumed
        score=score,
        density=fill_ratio,
    )
734
+
735
+
736
+ def _grid_passes_profile(prof: GridProfile) -> bool:
737
+ """Filter grids based on profile thresholds.
738
+
739
+ A grid passes if it:
740
+ - Has sufficient dimensions (≥2x2)
741
+ - Has non-empty content
742
+ - Has reasonable density (≥25%)
743
+ - Isn't too prose-heavy
744
+ - Has adequate structural signals
745
+ - Meets minimum score threshold
746
+ """
747
+ if prof.n_rows < 2 or prof.n_cols < 2:
748
+ return False
749
+
750
+ if prof.non_empty_cells == 0:
751
+ return False
752
+
753
+ # Require minimum cell density
754
+ if prof.density < 0.25:
755
+ return False
756
+
757
+ # Sentence-heavy content check (more lenient with good structure)
758
+ if prof.sentence_cells >= 0.6 * prof.non_empty_cells:
759
+ # Allow if we have strong structural signals
760
+ has_structure = (
761
+ prof.numeric_cells > 0 or
762
+ prof.short_token_cells >= 0.1 * prof.non_empty_cells or
763
+ (prof.n_rows >= 3 and prof.n_cols >= 3)
764
+ )
765
+ if not has_structure:
766
+ return False
767
+
768
+ # Tables should have some tokens or numbers
769
+ if prof.short_token_cells < 0.15 * prof.non_empty_cells and prof.numeric_cells == 0:
770
+ # More lenient for larger tables
771
+ if prof.n_rows < 3 or prof.n_cols < 3:
772
+ return False
773
+
774
+ # Score threshold
775
+ if prof.score < 1.0:
776
+ return False
777
+
778
+ return True
779
+
780
+
781
+ # ---------------------------------------------------------------------------
782
+ # Misc heuristics
783
+ # ---------------------------------------------------------------------------
784
+
785
+
786
+ def _is_list_like_line(text: str) -> bool:
787
+ """Check if a line starts with a list marker.
788
+
789
+ Recognized markers:
790
+ - Bullets: -, •, ◦, ◦, *
791
+ - Numbered: 1. or 1)
792
+ - Lettered: A. or a)
793
+ """
794
+ s = text.lstrip()
795
+ if not s:
796
+ return False
797
+
798
+ # Bullet markers
799
+ if s[0] in ('-', '•', '◦', '◦', '*') and (len(s) == 1 or s[1].isspace()):
800
+ return True
801
+
802
+ # Numbered or lettered lists
803
+ if re.match(r'^(\d+|[A-Za-z])(\.|\))\s+', s):
804
+ return True
805
+
806
+ return False
807
+
808
+
809
+ _CODE_SYMBOLS = set('{}[]();<>/=*+-')
810
+
811
+
812
+ def _is_code_like_block(lines: Iterable[str]) -> bool:
813
+ """Check if a block looks like code rather than a table.
814
+
815
+ Code indicators:
816
+ - High density of programming symbols
817
+ - Keywords like def, class, for, while, if
818
+ - Type annotations (->)
819
+ """
820
+ texts = [ln.strip() for ln in lines if ln.strip()]
821
+ if not texts:
822
+ return False
823
+
824
+ suspicious = 0
825
+ for t in texts:
826
+ lower = t.lower()
827
+
828
+ # Programming keywords
829
+ if lower.startswith(('def ', 'class ', 'for ', 'while ', 'if ')):
830
+ suspicious += 1
831
+ continue
832
+
833
+ # Type annotations
834
+ if ' -> ' in t:
835
+ suspicious += 1
836
+ continue
837
+
838
+ # Symbol density
839
+ non_space = [c for c in t if not c.isspace()]
840
+ if not non_space:
841
+ continue
842
+
843
+ code_ratio = sum(c in _CODE_SYMBOLS for c in non_space) / float(len(non_space))
844
+ if code_ratio >= 0.35:
845
+ suspicious += 1
846
+
847
+ return suspicious >= max(2, len(texts) // 2)
848
+
849
+
850
+ def _most_common_int(vals: List[int]) -> Tuple[int, int]:
851
+ """Find the most common integer in a list.
852
+
853
+ Returns:
854
+ Tuple of (most_common_value, frequency)
855
+ """
856
+ if not vals:
857
+ return 0, 0
858
+
859
+ counts: Dict[int, int] = {}
860
+ for v in vals:
861
+ counts[v] = counts.get(v, 0) + 1
862
+
863
+ best = max(counts, key=lambda x: counts[x])
864
+ return best, counts[best]
865
+
866
+
867
# Public API of this module; every underscore-prefixed helper stays private.
__all__ = [
    "TableDetection",
    "GridProfile",
    "detect_tables_on_page",
]