vlmparse 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. vlmparse/clients/docling.py +2 -2
  2. vlmparse/clients/dotsocr.py +11 -2
  3. vlmparse/clients/mineru.py +8 -7
  4. vlmparse/clients/openai_converter.py +1 -0
  5. vlmparse/converter_with_server.py +5 -4
  6. vlmparse/registries.py +2 -4
  7. vlmparse/servers/docker_server.py +1 -1
  8. vlmparse/servers/utils.py +3 -2
  9. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
  10. vlmparse-0.1.5.dist-info/RECORD +36 -0
  11. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  12. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  13. vlmparse/benchpdf2md/create_dataset.py +0 -60
  14. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  15. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  16. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  17. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  18. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  19. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  20. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  21. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  22. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  23. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  24. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  25. vlmparse/benchpdf2md/utils.py +0 -56
  26. vlmparse-0.1.4.dist-info/RECORD +0 -51
  27. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
  28. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
  29. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
  30. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
@@ -1,1763 +0,0 @@
1
- # Adapted from https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py
2
-
3
- import json
4
- import math
5
- import re
6
- import unicodedata
7
- from typing import List, Optional, Set, Tuple
8
-
9
- import numpy as np
10
- from bs4 import BeautifulSoup
11
- from pydantic import BaseModel, ConfigDict, Field, model_validator
12
- from rapidfuzz import fuzz, process
13
- from rapidfuzz.distance import Levenshtein
14
- from typing_extensions import Literal
15
- from unidecode import unidecode
16
-
17
-
18
- class Match:
19
- def __init__(self, start, end, dist):
20
- self.start = start
21
- self.end = end
22
- self.dist = dist
23
-
24
-
25
- def find_near_matches(pattern: str, text: str, max_l_dist: int) -> List[Match]:
26
- if not pattern or not text:
27
- return []
28
-
29
- matches = []
30
- pattern_len = len(pattern)
31
-
32
- for window_size in [pattern_len, pattern_len - 1, pattern_len + 1]:
33
- if window_size <= 0 or window_size > len(text):
34
- continue
35
-
36
- chunks = [
37
- (text[i : i + window_size], i) for i in range(len(text) - window_size + 1)
38
- ]
39
- if not chunks:
40
- continue
41
-
42
- result = process.extractOne(
43
- pattern, [c[0] for c in chunks], scorer=Levenshtein.distance
44
- )
45
-
46
- if result:
47
- matched_text, score, idx = result
48
- dist = int(score)
49
- if dist <= max_l_dist:
50
- start_pos = chunks[idx][1]
51
- matches.append(Match(start_pos, start_pos + window_size, dist))
52
-
53
- return matches
54
-
55
-
56
- class RepeatDetector:
57
- def __init__(self, max_ngram_size: int = 10):
58
- self.max_ngram_size = max_ngram_size
59
- self.data = ""
60
-
61
- def add_letters(self, new_str: str):
62
- self.data += new_str
63
-
64
- def ngram_repeats(self) -> list[int]:
65
- result = [0] * self.max_ngram_size
66
-
67
- if not self.data:
68
- return result
69
-
70
- # Normalize all whitespace to single spaces
71
- text = re.sub(r"\s+", " ", self.data)
72
-
73
- # For each n-gram size
74
- for size in range(1, self.max_ngram_size + 1):
75
- if len(text) < size:
76
- continue
77
-
78
- # Get the last n-gram
79
- target = text[-size:]
80
-
81
- # Count backwards from the end to find repeats
82
- count = 0
83
- pos = len(text) - size # Start position for previous n-gram
84
-
85
- while pos >= 0:
86
- if text[pos : pos + size] == target:
87
- count += 1
88
- pos -= size # Move back by the size of the n-gram
89
- else:
90
- break
91
-
92
- result[size - 1] = count
93
-
94
- return result
95
-
96
-
97
- class TableData(BaseModel):
98
- """Class to hold table data and metadata about headers."""
99
-
100
- model_config = ConfigDict(
101
- arbitrary_types_allowed=True,
102
- validate_assignment=True,
103
- strict=True,
104
- )
105
-
106
- data: np.ndarray # The actual table data
107
- header_rows: Set[int] = Field(
108
- default_factory=set
109
- ) # Indices of rows that are headers
110
- header_cols: Set[int] = Field(
111
- default_factory=set
112
- ) # Indices of columns that are headers
113
- col_headers: dict = Field(
114
- default_factory=dict
115
- ) # Maps column index to header text, handling colspan
116
- row_headers: dict = Field(
117
- default_factory=dict
118
- ) # Maps row index to header text, handling rowspan
119
-
120
- def __repr__(self) -> str:
121
- """Returns a concise representation of the TableData object for debugging."""
122
- return f"TableData(shape={self.data.shape}, header_rows={len(self.header_rows)}, header_cols={len(self.header_cols)})"
123
-
124
- def __str__(self) -> str:
125
- """Returns a pretty string representation of the table with header information."""
126
- output = []
127
-
128
- # Table dimensions
129
- output.append(
130
- f"Table: {self.data.shape[0]} rows × {self.data.shape[1]} columns"
131
- )
132
-
133
- # Header info
134
- output.append(f"Header rows: {sorted(self.header_rows)}")
135
- output.append(f"Header columns: {sorted(self.header_cols)}")
136
-
137
- # Table content with formatting
138
- separator = "+" + "+".join(["-" * 17] * self.data.shape[1]) + "+"
139
-
140
- # Add a header for row indices
141
- output.append(separator)
142
- headers = [""] + [f"Column {i}" for i in range(self.data.shape[1])]
143
- output.append(
144
- "| {:<5} | ".format("Row")
145
- + " | ".join(["{:<15}".format(h) for h in headers[1:]])
146
- + " |"
147
- )
148
- output.append(separator)
149
-
150
- # Format each row
151
- for i in range(min(self.data.shape[0], 15)): # Limit to 15 rows for readability
152
- # Format cells, mark header cells
153
- cells = []
154
- for j in range(self.data.shape[1]):
155
- cell = str(self.data[i, j])
156
- if len(cell) > 15:
157
- cell = cell[:12] + "..."
158
- # Mark header cells with *
159
- if i in self.header_rows or j in self.header_cols:
160
- cell = f"*{cell}*"
161
- cells.append(cell)
162
-
163
- row_str = (
164
- "| {:<5} | ".format(i)
165
- + " | ".join(["{:<15}".format(c) for c in cells])
166
- + " |"
167
- )
168
- output.append(row_str)
169
- output.append(separator)
170
-
171
- # If table is too large, indicate truncation
172
- if self.data.shape[0] > 15:
173
- output.append(f"... {self.data.shape[0] - 15} more rows ...")
174
-
175
- # Column header details if available
176
- if self.col_headers:
177
- output.append("\nColumn header mappings:")
178
- for col, headers in sorted(self.col_headers.items()):
179
- header_strs = [f"({row}, '{text}')" for row, text in headers]
180
- output.append(f" Column {col}: {', '.join(header_strs)}")
181
-
182
- # Row header details if available
183
- if self.row_headers:
184
- output.append("\nRow header mappings:")
185
- for row, headers in sorted(self.row_headers.items()):
186
- header_strs = [f"({col}, '{text}')" for col, text in headers]
187
- output.append(f" Row {row}: {', '.join(header_strs)}")
188
-
189
- return "\n".join(output)
190
-
191
-
192
- TestType = Literal["baseline", "present", "absent", "order", "table", "math"]
193
-
194
-
195
- TestChecked = Literal["verified", "rejected"]
196
-
197
-
198
- class ValidationError(Exception):
199
- """Exception raised for validation errors."""
200
-
201
- pass
202
-
203
-
204
- def normalize_text(md_content: str) -> str:
205
- if md_content is None:
206
- return None
207
-
208
- # Normalize <br> and <br/> to newlines
209
- md_content = re.sub(r"<br/?>", " ", md_content)
210
-
211
- # Normalize whitespace in the md_content
212
- md_content = re.sub(r"\s+", " ", md_content)
213
-
214
- # Remove markdown bold formatting (** or __ for bold)
215
- md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
216
- md_content = re.sub(r"\\\*(.*?)\\\*", r"\1", md_content)
217
- md_content = re.sub(r"__(.*?)__", r"\1", md_content)
218
- md_content = re.sub(r"</?b>", "", md_content) # Remove <b> tags if they exist
219
- md_content = re.sub(r"</?i>", "", md_content) # Remove <i> tags if they exist
220
-
221
- # Remove markdown italics formatting (* or _ for italics)
222
- md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
223
- md_content = re.sub(r"_(.*?)_", r"\1", md_content)
224
-
225
- # Convert down to a consistent unicode form, so é == e + accent, unicode forms
226
- md_content = unicodedata.normalize("NFC", md_content)
227
-
228
- # Characters to replace: keys are fancy characters, values are their ASCII equivalents. The unicode micro sign is mapped to the Greek mu, which comes up often enough to matter
229
- replacements = {
230
- "‘": "'",
231
- "’": "'",
232
- "‚": "'",
233
- "“": '"',
234
- "”": '"',
235
- "„": '"',
236
- "_": "_",
237
- "–": "-",
238
- "—": "-",
239
- "‑": "-",
240
- "‒": "-",
241
- "−": "-",
242
- "\u00b5": "\u03bc",
243
- "º": "°",
244
- "œ": "oe",
245
- r"\*": "",
246
- r"\*\*": "",
247
- "’": "'", # noqa
248
- "« ": "«",
249
- " »": "»",
250
- " .": ".",
251
- " :": ":",
252
- " ,": ",",
253
- "É": "E",
254
- "☑": "[x]",
255
- "☐": "[ ]",
256
- "☒": "[x]",
257
- "✅": "[x]",
258
- "❌": "[x]",
259
- "❎": "[x]",
260
- "✓": "[x]",
261
- "✔": "[x]",
262
- "✗": "[x]",
263
- "✖": "[x]",
264
- "🗹": "[x]",
265
- "[X]": "[x]",
266
- }
267
- for fancy_char, ascii_char in replacements.items():
268
- md_content = md_content.replace(fancy_char, ascii_char)
269
-
270
- return md_content
271
-
272
-
273
- def format_diff_text(reference: str, found: str) -> str:
274
- from rapidfuzz.distance import Indel
275
-
276
- opcodes = Indel.opcodes(reference, found)
277
- result = []
278
- for tag, i1, i2, j1, j2 in opcodes:
279
- if tag == "equal":
280
- result.append(reference[i1:i2])
281
- elif tag == "delete":
282
- result.append(f":red-background[{reference[i1:i2]}]")
283
- elif tag == "insert":
284
- result.append(f":green-background[{found[j1:j2]}]")
285
- elif tag == "replace":
286
- result.append(f":red-background[{reference[i1:i2]}]")
287
- result.append(f":green-background[{found[j1:j2]}]")
288
- return "".join(result)
289
-
290
-
291
- def parse_markdown_tables(md_content: str) -> List[TableData]:
292
- """
293
- Extract and parse all markdown tables from the provided content.
294
- Uses a direct approach to find and parse tables, which is more robust for tables
295
- at the end of files or with irregular formatting.
296
-
297
- Args:
298
- md_content: The markdown content containing tables
299
-
300
- Returns:
301
- A list of TableData objects, each containing the table data and header information
302
- """
303
- # Split the content into lines and process line by line
304
- lines = md_content.strip().split("\n")
305
-
306
- parsed_tables = []
307
- current_table_lines = []
308
- in_table = False
309
-
310
- # Identify potential tables by looking for lines with pipe characters
311
- for _, line in enumerate(lines):
312
- # Check if this line has pipe characters (a table row indicator)
313
- if "|" in line:
314
- # If we weren't in a table before, start a new one
315
- if not in_table:
316
- in_table = True
317
- current_table_lines = [line]
318
- else:
319
- # Continue adding to the current table
320
- current_table_lines.append(line)
321
- else:
322
- # No pipes in this line, so if we were in a table, we've reached its end
323
- if in_table:
324
- # Process the completed table if it has at least 2 rows
325
- if len(current_table_lines) >= 2:
326
- table_data = _process_table_lines(current_table_lines)
327
- if table_data and len(table_data) > 0:
328
- # Convert to numpy array for easier manipulation
329
- max_cols = max(len(row) for row in table_data)
330
- padded_data = [
331
- row + [""] * (max_cols - len(row)) for row in table_data
332
- ]
333
- table_array = np.array(padded_data)
334
-
335
- # In markdown tables, the first row is typically a header row
336
- header_rows = {0} if len(table_array) > 0 else set()
337
-
338
- # Set up col_headers with first row headers for each column
339
- col_headers = {}
340
- if len(table_array) > 0:
341
- for col_idx in range(table_array.shape[1]):
342
- if col_idx < len(table_array[0]):
343
- col_headers[col_idx] = [
344
- (0, table_array[0, col_idx])
345
- ]
346
-
347
- # Set up row_headers with first column headers for each row
348
- row_headers = {}
349
- if table_array.shape[1] > 0:
350
- for row_idx in range(
351
- 1, table_array.shape[0]
352
- ): # Skip header row
353
- row_headers[row_idx] = [
354
- (0, table_array[row_idx, 0])
355
- ] # First column as heading
356
-
357
- # Create TableData object
358
- parsed_tables.append(
359
- TableData(
360
- data=table_array,
361
- header_rows=header_rows,
362
- header_cols={0}
363
- if table_array.shape[1] > 0
364
- else set(), # First column as header
365
- col_headers=col_headers,
366
- row_headers=row_headers,
367
- )
368
- )
369
- in_table = False
370
-
371
- # Process the last table if we're still tracking one at the end of the file
372
- if in_table and len(current_table_lines) >= 2:
373
- table_data = _process_table_lines(current_table_lines)
374
- if table_data and len(table_data) > 0:
375
- # Convert to numpy array
376
- max_cols = max(len(row) for row in table_data)
377
- padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
378
- table_array = np.array(padded_data)
379
-
380
- # In markdown tables, the first row is typically a header row
381
- header_rows = {0} if len(table_array) > 0 else set()
382
-
383
- # Set up col_headers with first row headers for each column
384
- col_headers = {}
385
- if len(table_array) > 0:
386
- for col_idx in range(table_array.shape[1]):
387
- if col_idx < len(table_array[0]):
388
- col_headers[col_idx] = [(0, table_array[0, col_idx])]
389
-
390
- # Set up row_headers with first column headers for each row
391
- row_headers = {}
392
- if table_array.shape[1] > 0:
393
- for row_idx in range(1, table_array.shape[0]): # Skip header row
394
- row_headers[row_idx] = [
395
- (0, table_array[row_idx, 0])
396
- ] # First column as heading
397
-
398
- # Create TableData object
399
- parsed_tables.append(
400
- TableData(
401
- data=table_array,
402
- header_rows=header_rows,
403
- header_cols={0}
404
- if table_array.shape[1] > 0
405
- else set(), # First column as header
406
- col_headers=col_headers,
407
- row_headers=row_headers,
408
- )
409
- )
410
-
411
- return parsed_tables
412
-
413
-
414
- def _process_table_lines(table_lines: List[str]) -> List[List[str]]:
415
- """
416
- Process a list of lines that potentially form a markdown table.
417
-
418
- Args:
419
- table_lines: List of strings, each representing a line in a potential markdown table
420
-
421
- Returns:
422
- A list of rows, each a list of cell values
423
- """
424
- table_data = []
425
- separator_row_index = None
426
-
427
- # First, identify the separator row (the row with dashes)
428
- for i, line in enumerate(table_lines):
429
- # Check if this looks like a separator row (contains mostly dashes)
430
- content_without_pipes = line.replace("|", "").strip()
431
- if content_without_pipes and all(c in "- :" for c in content_without_pipes):
432
- separator_row_index = i
433
- break
434
-
435
- # Process each line, filtering out the separator row
436
- for i, line in enumerate(table_lines):
437
- # Skip the separator row
438
- if i == separator_row_index:
439
- continue
440
-
441
- # Skip lines that are entirely formatting
442
- if line.strip() and all(c in "- :|" for c in line):
443
- continue
444
-
445
- # Process the cells in this row
446
- cells = [cell.strip() for cell in line.split("|")]
447
-
448
- # Remove empty cells at the beginning and end (caused by leading/trailing pipes)
449
- if cells and cells[0] == "":
450
- cells = cells[1:]
451
- if cells and cells[-1] == "":
452
- cells = cells[:-1]
453
-
454
- if cells: # Only add non-empty rows
455
- table_data.append(cells)
456
-
457
- return table_data
458
-
459
-
460
- def parse_html_tables(html_content: str) -> List[TableData]:
461
- """
462
- Extract and parse all HTML tables from the provided content.
463
- Identifies header rows and columns, and maps them properly handling rowspan/colspan.
464
-
465
- Args:
466
- html_content: The HTML content containing tables
467
-
468
- Returns:
469
- A list of TableData objects, each containing the table data and header information
470
- """
471
- soup = BeautifulSoup(html_content, "html.parser")
472
- tables = soup.find_all("table")
473
-
474
- parsed_tables = []
475
-
476
- for table in tables:
477
- rows = table.find_all(["tr"])
478
- table_data = []
479
- header_rows = set()
480
- header_cols = set()
481
- col_headers = {} # Maps column index to all header cells above it
482
- row_headers = {} # Maps row index to all header cells to its left
483
-
484
- # Find rows inside thead tags - these are definitely header rows
485
- thead = table.find("thead")
486
- if thead:
487
- thead_rows = thead.find_all("tr")
488
- for tr in thead_rows:
489
- header_rows.add(rows.index(tr))
490
-
491
- # Initialize a grid to track filled cells due to rowspan/colspan
492
- cell_grid = {}
493
- col_span_info = {} # Tracks which columns contain headers
494
- row_span_info = {} # Tracks which rows contain headers
495
-
496
- # First pass: process each row to build the raw table data and identify headers
497
- for row_idx, row in enumerate(rows):
498
- cells = row.find_all(["th", "td"])
499
- row_data = []
500
- col_idx = 0
501
-
502
- # If there are th elements in this row, it's likely a header row
503
- if row.find("th"):
504
- header_rows.add(row_idx)
505
-
506
- for cell in cells:
507
- # Skip positions already filled by rowspans from above
508
- while (row_idx, col_idx) in cell_grid:
509
- row_data.append(cell_grid[(row_idx, col_idx)])
510
- col_idx += 1
511
-
512
- # Replace <br> and <br/> tags with newlines before getting text
513
- for br in cell.find_all("br"):
514
- br.replace_with("\n")
515
- cell_text = cell.get_text().strip()
516
-
517
- # Handle rowspan/colspan
518
- rowspan = int(cell.get("rowspan", 1))
519
- colspan = int(cell.get("colspan", 1))
520
-
521
- # Add the cell to the row data
522
- row_data.append(cell_text)
523
-
524
- # Fill the grid for this cell and its rowspan/colspan
525
- for i in range(rowspan):
526
- for j in range(colspan):
527
- if i == 0 and j == 0:
528
- continue # Skip the main cell position
529
- # For rowspan cells, preserve the text in all spanned rows
530
- if j == 0 and i > 0: # Only for cells directly below
531
- cell_grid[(row_idx + i, col_idx + j)] = cell_text
532
- else:
533
- cell_grid[(row_idx + i, col_idx + j)] = (
534
- "" # Mark other spans as empty
535
- )
536
-
537
- # If this is a header cell (th), mark it and its span
538
- if cell.name == "th":
539
- # Mark columns as header columns
540
- for j in range(colspan):
541
- header_cols.add(col_idx + j)
542
-
543
- # For rowspan, mark spanned rows as part of header
544
- for i in range(1, rowspan):
545
- if row_idx + i < len(rows):
546
- header_rows.add(row_idx + i)
547
-
548
- # Record this header for all spanned columns
549
- for j in range(colspan):
550
- curr_col = col_idx + j
551
- if curr_col not in col_headers:
552
- col_headers[curr_col] = []
553
- col_headers[curr_col].append((row_idx, cell_text))
554
-
555
- # Store which columns are covered by this header
556
- if cell_text and colspan > 1:
557
- if cell_text not in col_span_info:
558
- col_span_info[cell_text] = set()
559
- col_span_info[cell_text].add(curr_col)
560
-
561
- # Store which rows are covered by this header for rowspan
562
- if cell_text and rowspan > 1:
563
- if cell_text not in row_span_info:
564
- row_span_info[cell_text] = set()
565
- for i in range(rowspan):
566
- row_span_info[cell_text].add(row_idx + i)
567
-
568
- # Also handle row headers from data cells that have rowspan
569
- if cell.name == "td" and rowspan > 1 and col_idx in header_cols:
570
- for i in range(1, rowspan):
571
- if row_idx + i < len(rows):
572
- if row_idx + i not in row_headers:
573
- row_headers[row_idx + i] = []
574
- row_headers[row_idx + i].append((col_idx, cell_text))
575
-
576
- col_idx += colspan
577
-
578
- # Pad the row if needed to handle different row lengths
579
- table_data.append(row_data)
580
-
581
- # Second pass: expand headers to cells that should inherit them
582
- # First handle column headers
583
- for header_text, columns in col_span_info.items():
584
- for col in columns:
585
- # Add this header to all columns it spans over
586
- for row_idx in range(len(table_data)):
587
- if row_idx not in header_rows: # Only apply to data rows
588
- for j in range(
589
- col,
590
- len(table_data[row_idx])
591
- if row_idx < len(table_data)
592
- else 0,
593
- ):
594
- # Add header info to data cells in these columns
595
- if j not in col_headers:
596
- col_headers[j] = []
597
- if not any(h[1] == header_text for h in col_headers[j]):
598
- header_row = min(
599
- [r for r, t in col_headers.get(col, [(0, "")])]
600
- )
601
- col_headers[j].append((header_row, header_text))
602
-
603
- # Handle row headers
604
- for header_text, rows in row_span_info.items():
605
- for row in rows:
606
- if row < len(table_data):
607
- # Find first header column
608
- header_col = min(header_cols) if header_cols else 0
609
- if row not in row_headers:
610
- row_headers[row] = []
611
- if not any(h[1] == header_text for h in row_headers.get(row, [])):
612
- row_headers[row].append((header_col, header_text))
613
-
614
- # Process regular row headers - each cell in a header column becomes a header for its row
615
- for col_idx in header_cols:
616
- for row_idx, row in enumerate(table_data):
617
- if col_idx < len(row) and row[col_idx].strip():
618
- if row_idx not in row_headers:
619
- row_headers[row_idx] = []
620
- if not any(
621
- h[1] == row[col_idx] for h in row_headers.get(row_idx, [])
622
- ):
623
- row_headers[row_idx].append((col_idx, row[col_idx]))
624
-
625
- # Calculate max columns for padding
626
- max_cols = max(len(row) for row in table_data) if table_data else 0
627
-
628
- # Ensure all rows have the same number of columns
629
- if table_data:
630
- padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
631
- table_array = np.array(padded_data)
632
-
633
- # Create TableData object with the table and header information
634
- parsed_tables.append(
635
- TableData(
636
- data=table_array,
637
- header_rows=header_rows,
638
- header_cols=header_cols,
639
- col_headers=col_headers,
640
- row_headers=row_headers,
641
- )
642
- )
643
-
644
- return parsed_tables
645
-
646
-
647
- class PageMetadata(BaseModel):
648
- doc_type: Literal["long_text", "multi_page", "large_table"] | None = None
649
- original_doc_path: str
650
- pdf: str
651
- page: int = Field(ge=0)
652
-
653
-
654
- class BasePDFTest(BaseModel):
655
- """
656
- Base class for all PDF test types.
657
-
658
- Attributes:
659
- pdf: The PDF filename.
660
- page: The page number for the test.
661
- id: Unique identifier for the test.
662
- type: The type of test.
663
- max_diffs: Maximum number of character differences tolerated by fuzzy matching.
664
- """
665
-
666
- pdf: str = Field(min_length=1)
667
- page: int
668
- id: str = Field(min_length=1)
669
- type: TestType
670
- max_diffs: int = Field(ge=0, default=0)
671
- alphanum: bool = False
672
- """Filter only on alphanumeric characters + dots and commas"""
673
- unidecode: bool = False
674
- """Convert text to ASCII using unidecode"""
675
- ignore_space_and_newlines: bool = False
676
- """Ignore space and newlines in the text deprecated, use ignore_space and ignore_newlines instead"""
677
- ignore_space: bool = False
678
- """Ignore space in the text"""
679
- ignore_newlines: bool = True
680
- """Ignore newlines in the text"""
681
- ignore_chars: str = ""
682
- """Characters to ignore in the text"""
683
- ignore_str: list[str] = []
684
- """Strings to ignore in the text"""
685
- checked: Optional[TestChecked] | bool = None
686
- url: Optional[str] = None
687
- category: Optional[str] = None
688
- """subcategory of the test to identify what the test is supposed to measure"""
689
- display_diffs: bool = True
690
- """Whether to display diffs in the explanation"""
691
-
692
- def normalise(self, text: str) -> str:
693
- text = normalize_text(text)
694
- if self.unidecode:
695
- text = unidecode(text, errors="preserve")
696
-
697
- if self.alphanum:
698
- text = re.sub(r"[^a-zA-Z0-9\.,:;\+\(\)\'\"]", "", text).lower()
699
- # text = text.replace(",", ".")
700
- # text = text.replace(";", ":")
701
-
702
- # if self.ignore_space_and_newlines:
703
- # text = re.sub(r"\s+", "", text)
704
-
705
- if self.ignore_space:
706
- text = re.sub(r"[^\S\r\n]+", "", text)
707
- if self.ignore_newlines:
708
- text = re.sub(r"\n+", "", text)
709
-
710
- if self.ignore_chars:
711
- text = re.sub(f"[{self.ignore_chars}]", "", text)
712
-
713
- if self.ignore_str:
714
- for _str in self.ignore_str:
715
- text = text.replace(_str, "")
716
-
717
- return text
718
-
719
- def get_diff(self, reference: str, candidate: str) -> str:
720
- if self.display_diffs:
721
- matches = find_near_matches(
722
- reference, candidate, max_l_dist=len(reference) // 2
723
- )
724
- if matches:
725
- best_match = min(matches, key=lambda m: m.dist)
726
- best_match_text = candidate[best_match.start : best_match.end]
727
- diff_display = format_diff_text(reference, best_match_text)
728
- return diff_display
729
-
730
- def run(self, md_content: str) -> Tuple[bool, str, float]:
731
- """
732
- Run the test on the provided markdown content.
733
-
734
- Args:
735
- md_content: The content of the .md file.
736
-
737
- Returns:
738
- A tuple (passed, explanation, score) where 'passed' is True if the test passes,
739
- and 'explanation' provides details when the test fails.
740
- """
741
- raise NotImplementedError("Subclasses must implement the run method")
742
-
743
- def __repr__(self):
744
- from devtools import PrettyFormat
745
-
746
- pformat = PrettyFormat()
747
- return pformat(self, highlight=False)
748
-
749
-
750
- class TextPresenceTest(BasePDFTest):
751
- """
752
- Test to verify the presence or absence of specific text in a PDF.
753
-
754
- Attributes:
755
- text: The text string to search for.
756
- """
757
-
758
- text: str = Field()
759
- case_sensitive: bool = True
760
- first_n: Optional[int] = None
761
- last_n: Optional[int] = None
762
- type: Literal["present", "absent"] = Field(default="present")
763
- layout_cat: Literal[
764
- "text", "footer", "header", "footnote", "image", "image_caption"
765
- ] = Field(default="text")
766
-
767
- def run(self, md_content: str) -> Tuple[bool, str, float]:
768
- reference_query = self.normalise(self.text)
769
-
770
- # Normalize whitespace in the md_content
771
- md_content_n = self.normalise(md_content)
772
-
773
- if not self.case_sensitive:
774
- reference_query = reference_query.lower()
775
- md_content_n = md_content_n.lower()
776
-
777
- if self.first_n and self.last_n:
778
- md_content_n = md_content_n[: self.first_n] + md_content_n[-self.last_n :]
779
- elif self.first_n:
780
- md_content_n = md_content_n[: self.first_n]
781
- elif self.last_n:
782
- md_content_n = md_content_n[-self.last_n :]
783
-
784
- # Threshold for fuzzy matching derived from max_diffs
785
- threshold = 1.0 - (
786
- self.max_diffs / (len(reference_query) if len(reference_query) > 0 else 1)
787
- )
788
- best_ratio = fuzz.partial_ratio(reference_query, md_content_n) / 100.0
789
-
790
- if self.type == "present":
791
- if best_ratio >= threshold:
792
- return True, "", best_ratio
793
- else:
794
- best_match_text = ""
795
- diff_display = "No match found"
796
- if md_content:
797
- diff_display = self.get_diff(reference_query, md_content_n)
798
- msg = (
799
- f"Expected '{reference_query[:40]}' with threshold {threshold} "
800
- f"but best match ratio was {best_ratio:.3f}\n"
801
- f"Diff:\n\n{diff_display}"
802
- )
803
- return False, msg, best_ratio
804
- else: # ABSENT
805
- if best_ratio < threshold:
806
- return True, "", 1 - best_ratio
807
- else:
808
- reference = reference_query # normalize_text(self.text)
809
-
810
- best_match_text = ""
811
- diff_display = "No match found"
812
- if md_content:
813
- matches = find_near_matches(
814
- reference, md_content, max_l_dist=len(reference) // 2
815
- )
816
- if matches:
817
- best_match = min(matches, key=lambda m: m.dist)
818
- best_match_text = md_content[best_match.start : best_match.end]
819
- diff_display = format_diff_text(reference, best_match_text)
820
- msg = (
821
- f"Expected absence of '{reference[:40]}' with threshold {threshold} "
822
- f"but best match ratio was {best_ratio:.3f}\n"
823
- f"Diff:\n\n{diff_display}"
824
- )
825
- return False, msg, 1 - best_ratio
826
-
827
-
828
- class TextOrderTest(BasePDFTest):
829
- """
830
- Test to verify that one text appears before another in a PDF.
831
-
832
- Attributes:
833
- before: The text expected to appear first.
834
- after: The text expected to appear after the 'before' text.
835
- """
836
-
837
- before: str
838
- after: str
839
- type: Literal["order"] = Field(default="order")
840
-
841
- @model_validator(mode="after")
842
- def validate_max_diffs(self):
843
- if (
844
- self.max_diffs > len(self.before) // 2
845
- or self.max_diffs > len(self.after) // 2
846
- ):
847
- raise ValidationError(
848
- "Max diffs is too large for this test, greater than 50% of the search string"
849
- )
850
- return self
851
-
852
- def run(self, md_content: str) -> Tuple[bool, str, float]:
853
- md_content = self.normalise(md_content)
854
- before = self.normalise(self.before)
855
- after = self.normalise(self.after)
856
-
857
- before_matches = find_near_matches(
858
- before, md_content, max_l_dist=self.max_diffs
859
- )
860
- after_matches = find_near_matches(after, md_content, max_l_dist=self.max_diffs)
861
-
862
- if not before_matches:
863
- return (
864
- False,
865
- f"'before' text '{before[:40]}...' not found with max_l_dist {self.max_diffs}",
866
- 0.0,
867
- )
868
- if not after_matches:
869
- return (
870
- False,
871
- f"'after' text '{after[:40]}...' not found with max_l_dist {self.max_diffs}",
872
- 0.0,
873
- )
874
-
875
- for before_match in before_matches:
876
- for after_match in after_matches:
877
- if before_match.start < after_match.start:
878
- return (
879
- True,
880
- "",
881
- min(before_match.dist, after_match.dist)
882
- / max(len(before), len(after)),
883
- )
884
- return (
885
- False,
886
- f"Could not find a location where '{before[:40]}...' appears before "
887
- f"'{after[:40]}...'.",
888
- 0.0,
889
- )
890
-
891
-
892
- class TableTest(BasePDFTest):
893
- cell: str
894
- up: str = ""
895
- down: str = ""
896
- left: str = ""
897
- right: str = ""
898
- type: Literal["table"] = Field(default="table")
899
- top_heading: str = ""
900
- left_heading: str = ""
901
-
902
- def run(self, content: str) -> Tuple[bool, str, float]:
903
- from vlmparse.clients.pipe_utils.html_to_md_conversion import md_tables_to_html
904
-
905
- content = md_tables_to_html(content)
906
- # print(content)
907
-
908
- cell = self.normalise(self.cell)
909
- up = self.normalise(self.up)
910
- down = self.normalise(self.down)
911
- left = self.normalise(self.left)
912
- right = self.normalise(self.right)
913
- top_heading = self.normalise(self.top_heading)
914
- left_heading = self.normalise(self.left_heading)
915
-
916
- threshold = max(
917
- 0.5, 1.0 - (self.max_diffs / (len(cell) if len(cell) > 0 else 1))
918
- )
919
-
920
- soup = BeautifulSoup(content, "html.parser")
921
- tables = soup.find_all("table")
922
-
923
- if not tables:
924
- return False, "No HTML tables found in the content", 0.0
925
-
926
- best_match_score = -1
927
- best_match_reasons = []
928
-
929
- for table in tables:
930
- rows = table.find_all("tr")
931
- cells_info = []
932
- occupied = {}
933
-
934
- for row_idx, row in enumerate(rows):
935
- cells = row.find_all(["th", "td"])
936
- col_idx = 0
937
-
938
- for html_cell in cells:
939
- while (row_idx, col_idx) in occupied:
940
- col_idx += 1
941
- for br in html_cell.find_all("br"):
942
- br.replace_with("\n")
943
- cell_text_orig = html_cell.get_text().strip()
944
- cell_text = self.normalise(cell_text_orig)
945
-
946
- rowspan = int(html_cell.get("rowspan", 1))
947
- colspan = int(html_cell.get("colspan", 1))
948
- is_header = html_cell.name == "th"
949
-
950
- cells_info.append(
951
- {
952
- "row": row_idx,
953
- "col": col_idx,
954
- "rowspan": rowspan,
955
- "colspan": colspan,
956
- "text": cell_text,
957
- "text_orig": cell_text_orig,
958
- "is_header": is_header,
959
- }
960
- )
961
-
962
- for i in range(rowspan):
963
- for j in range(colspan):
964
- occupied[(row_idx + i, col_idx + j)] = True
965
-
966
- col_idx += colspan
967
-
968
- if not cells_info:
969
- continue
970
-
971
- best_cell = None
972
- best_similarity = -1
973
-
974
- for cell_info in cells_info:
975
- cell_content = cell_info["text"]
976
- similarity = fuzz.ratio(cell, cell_content) / 100.0
977
-
978
- if similarity > best_similarity:
979
- best_similarity = similarity
980
- best_cell = cell_info
981
-
982
- if similarity < threshold:
983
- continue
984
-
985
- all_satisfied = True
986
- reasons = []
987
- total_score = similarity
988
-
989
- row_start = cell_info["row"]
990
- row_end = row_start + cell_info["rowspan"]
991
- col_start = cell_info["col"]
992
- col_end = col_start + cell_info["colspan"]
993
-
994
- if up:
995
- up_neighbors = [
996
- c
997
- for c in cells_info
998
- if c != cell_info
999
- and c["row"] + c["rowspan"] == row_start
1000
- and not (
1001
- c["col"] >= col_end or c["col"] + c["colspan"] <= col_start
1002
- )
1003
- ]
1004
- if up_neighbors:
1005
- up_sim = [
1006
- fuzz.ratio(up, n["text"]) / 100.0 for n in up_neighbors
1007
- ]
1008
- best_up_sim = max(up_sim)
1009
- best_up_neighbors = up_neighbors[np.argmax(up_sim)]
1010
- total_score += best_up_sim
1011
- if best_up_sim < max(
1012
- 0.5,
1013
- 1.0 - (self.max_diffs / (len(up) if len(up) > 0 else 1)),
1014
- ):
1015
- all_satisfied = False
1016
- diff_display = self.get_diff(up, best_up_neighbors["text"])
1017
- reasons.append(
1018
- f"Up cell not found (sim: {best_up_sim:.2f})\nDiff:\n\n{diff_display}"
1019
- )
1020
- else:
1021
- all_satisfied = False
1022
- reasons.append("Up cell not found (sim: 0.00)")
1023
-
1024
- if down:
1025
- down_neighbors = [
1026
- c
1027
- for c in cells_info
1028
- if c != cell_info
1029
- and c["row"] == row_end
1030
- and not (
1031
- c["col"] >= col_end or c["col"] + c["colspan"] <= col_start
1032
- )
1033
- ]
1034
- if down_neighbors:
1035
- down_sim = [
1036
- fuzz.ratio(down, n["text"]) / 100.0 for n in down_neighbors
1037
- ]
1038
- best_down_sim = max(down_sim)
1039
- best_down_neighbors = down_neighbors[np.argmax(down_sim)]
1040
-
1041
- total_score += best_down_sim
1042
- if best_down_sim < max(
1043
- 0.5,
1044
- 1.0
1045
- - (self.max_diffs / (len(down) if len(down) > 0 else 1)),
1046
- ):
1047
- all_satisfied = False
1048
- diff_display = self.get_diff(
1049
- down, best_down_neighbors["text"]
1050
- )
1051
- reasons.append(
1052
- f"Down cell not found (sim: {best_down_sim:.2f})\nDiff:\n\n{diff_display}"
1053
- )
1054
- else:
1055
- all_satisfied = False
1056
- reasons.append("Down cell not found (sim: 0.00)")
1057
-
1058
- if left:
1059
- left_neighbors = [
1060
- c
1061
- for c in cells_info
1062
- if c != cell_info
1063
- and c["col"] + c["colspan"] == col_start
1064
- and not (
1065
- c["row"] >= row_end or c["row"] + c["rowspan"] <= row_start
1066
- )
1067
- ]
1068
- if left_neighbors:
1069
- left_sim = [
1070
- fuzz.ratio(left, n["text"]) / 100.0 for n in left_neighbors
1071
- ]
1072
-
1073
- best_left_cell_sim = max(left_sim)
1074
- best_left_cell = left_neighbors[np.argmax(left_sim)]
1075
- total_score += best_left_cell_sim
1076
- if best_left_cell_sim < max(
1077
- 0.5,
1078
- 1.0
1079
- - (self.max_diffs / (len(left) if len(left) > 0 else 1)),
1080
- ):
1081
- all_satisfied = False
1082
- diff_display = self.get_diff(left, best_left_cell["text"])
1083
- reasons.append(
1084
- f"Left cell not found (sim: {best_left_cell_sim:.2f})\nDiff:\n\n{diff_display}"
1085
- )
1086
- else:
1087
- all_satisfied = False
1088
- reasons.append("Left cell not found (sim: 0.00)")
1089
-
1090
- if right:
1091
- right_neighbors = [
1092
- c
1093
- for c in cells_info
1094
- if c != cell_info
1095
- and c["col"] == col_end
1096
- and not (
1097
- c["row"] >= row_end or c["row"] + c["rowspan"] <= row_start
1098
- )
1099
- ]
1100
- if right_neighbors:
1101
- right_sim = [
1102
- fuzz.ratio(right, n["text"]) / 100.0
1103
- for n in right_neighbors
1104
- ]
1105
- best_right_cell_sim = max(right_sim)
1106
- best_right_cell = right_neighbors[np.argmax(right_sim)]
1107
- total_score += best_right_cell_sim
1108
- if best_right_cell_sim < max(
1109
- 0.5,
1110
- 1.0
1111
- - (self.max_diffs / (len(right) if len(right) > 0 else 1)),
1112
- ):
1113
- all_satisfied = False
1114
- diff_display = self.get_diff(right, best_right_cell["text"])
1115
- reasons.append(
1116
- f"Right cell not found (sim: {best_right_cell_sim:.2f})\nDiff:\n\n{diff_display}"
1117
- )
1118
- else:
1119
- all_satisfied = False
1120
- reasons.append("Right cell not found (sim: 0.00)")
1121
-
1122
- if top_heading:
1123
- header_cells = [
1124
- c
1125
- for c in cells_info
1126
- if c["is_header"]
1127
- and not (
1128
- c["col"] >= col_end or c["col"] + c["colspan"] <= col_start
1129
- )
1130
- ]
1131
- if header_cells:
1132
- headers_sim = [
1133
- fuzz.ratio(top_heading, n["text"]) / 100.0
1134
- for n in header_cells
1135
- ]
1136
- best_header_cell_sim = max(headers_sim)
1137
- best_header_cell = header_cells[np.argmax(headers_sim)]
1138
- total_score += best_header_cell_sim
1139
- if best_header_cell_sim < max(
1140
- 0.5,
1141
- 1.0
1142
- - (
1143
- self.max_diffs
1144
- / (len(top_heading) if len(top_heading) > 0 else 1)
1145
- ),
1146
- ):
1147
- all_satisfied = False
1148
- diff_display = self.get_diff(
1149
- top_heading, best_header_cell["text"]
1150
- )
1151
- reasons.append(
1152
- f"Top heading not found (sim: {best_header_cell_sim:.2f})\nDiff:\n\n{diff_display}"
1153
- )
1154
- else:
1155
- all_satisfied = False
1156
- reasons.append("Top heading not found (sim: 0.00)")
1157
-
1158
- if left_heading:
1159
- header_cells = [
1160
- c
1161
- for c in cells_info
1162
- if c["col"] == 0
1163
- and not (
1164
- c["row"] >= row_end or c["row"] + c["rowspan"] <= row_start
1165
- )
1166
- ]
1167
- if header_cells:
1168
- headers_sim = [
1169
- fuzz.ratio(left_heading, n["text"]) / 100.0
1170
- for n in header_cells
1171
- ]
1172
- best_header_cell_sim = max(headers_sim)
1173
- best_header_cell = header_cells[np.argmax(headers_sim)]
1174
- total_score += best_header_cell_sim
1175
- if best_header_cell_sim < max(
1176
- 0.5,
1177
- 1.0
1178
- - (
1179
- self.max_diffs
1180
- / (len(left_heading) if len(left_heading) > 0 else 1)
1181
- ),
1182
- ):
1183
- all_satisfied = False
1184
- diff_display = self.get_diff(
1185
- left_heading, best_header_cell["text"]
1186
- )
1187
- reasons.append(
1188
- f"Left heading not found (sim: {best_header_cell_sim:.2f})\nDiff:\n\n{diff_display}"
1189
- )
1190
- else:
1191
- all_satisfied = False
1192
- reasons.append("Left heading not found (sim: 0.00)")
1193
-
1194
- if all_satisfied:
1195
- return True, "", best_match_score
1196
-
1197
- if total_score > best_match_score:
1198
- best_match_score = total_score
1199
- best_match_reasons = reasons
1200
-
1201
- if best_match_score < 0:
1202
- if best_cell:
1203
- diff_display = self.get_diff(cell, best_cell["text"])
1204
- else:
1205
- diff_display = ""
1206
- return (
1207
- False,
1208
- f"No cell matching '{cell}' found with threshold {threshold}\nDiff:\n\n{diff_display}",
1209
- best_match_score,
1210
- )
1211
- else:
1212
- exp = "\n\n".join(best_match_reasons)
1213
- return (
1214
- False,
1215
- f"Found cells matching '{cell}' but relationships not satisfied: {exp}",
1216
- best_match_score,
1217
- )
1218
-
1219
-
1220
- class TableTestOld(BasePDFTest):
1221
- """
1222
- Test to verify that certain properties of a table hold, namely that given cells appear in the correct positions relative to other cells.
1223
- """
1224
-
1225
- cell: str
1226
- up: str = ""
1227
- down: str = ""
1228
- left: str = ""
1229
- right: str = ""
1230
- type: Literal["table"] = Field(default="table")
1231
- top_heading: str = ""
1232
- left_heading: str = ""
1233
-
1234
- def run(self, content: str) -> Tuple[bool, str]:
1235
- """
1236
- Run the table test on provided content.
1237
-
1238
- Finds all tables (markdown and/or HTML based on content_type) and checks if any cell
1239
- matches the target cell and satisfies the specified relationships.
1240
-
1241
- Args:
1242
- content: The content containing tables (markdown or HTML)
1243
-
1244
- Returns:
1245
- A tuple (passed, explanation) where 'passed' is True if the test passes,
1246
- and 'explanation' provides details when the test fails.
1247
- """
1248
- cell = self.normalise(self.cell)
1249
- up = self.normalise(self.up)
1250
- down = self.normalise(self.down)
1251
- left = self.normalise(self.left)
1252
- right = self.normalise(self.right)
1253
- top_heading = self.normalise(self.top_heading)
1254
- left_heading = self.normalise(self.left_heading)
1255
- # Initialize variables to track tables and results
1256
- tables_to_check = []
1257
- failed_reasons = []
1258
-
1259
- # Threshold for fuzzy matching derived from max_diffs
1260
- threshold = 1.0 - (self.max_diffs / (len(cell) if len(cell) > 0 else 1))
1261
- threshold = max(0.5, threshold)
1262
-
1263
- # Parse tables based on content_type
1264
- md_tables = parse_markdown_tables(content)
1265
- tables_to_check.extend(md_tables)
1266
-
1267
- html_tables = parse_html_tables(content)
1268
- tables_to_check.extend(html_tables)
1269
-
1270
- # If no tables found, return failure
1271
- if not tables_to_check:
1272
- return False, "No tables found in the content"
1273
-
1274
- # Check each table
1275
- for table_data in tables_to_check:
1276
- # Removed debug print statement
1277
- table_array = table_data.data
1278
- header_rows = table_data.header_rows
1279
- header_cols = table_data.header_cols
1280
-
1281
- # Find all cells that match the target cell using fuzzy matching
1282
- matches = []
1283
- for i in range(table_array.shape[0]):
1284
- for j in range(table_array.shape[1]):
1285
- cell_content = self.normalise(table_array[i, j])
1286
- similarity = fuzz.ratio(cell, cell_content) / 100.0
1287
-
1288
- if similarity >= threshold:
1289
- matches.append((i, j))
1290
-
1291
- # If no matches found in this table, continue to the next table
1292
- if not matches:
1293
- continue
1294
-
1295
- # Check the relationships for each matching cell
1296
- for row_idx, col_idx in matches:
1297
- all_relationships_satisfied = True
1298
- current_failed_reasons = []
1299
-
1300
- # Check up relationship
1301
- if up and row_idx > 0:
1302
- up_cell = self.normalise(table_array[row_idx - 1, col_idx])
1303
- up_similarity = fuzz.ratio(up, up_cell) / 100.0
1304
- if up_similarity < max(
1305
- 0.5,
1306
- 1.0 - (self.max_diffs / (len(up) if len(up) > 0 else 1)),
1307
- ):
1308
- all_relationships_satisfied = False
1309
- current_failed_reasons.append(
1310
- f"Cell above '{up_cell}' doesn't match expected '{up}' (similarity: {up_similarity:.2f})"
1311
- )
1312
-
1313
- # Check down relationship
1314
- if down and row_idx < table_array.shape[0] - 1:
1315
- down_cell = self.normalise(table_array[row_idx + 1, col_idx])
1316
- down_similarity = fuzz.ratio(down, down_cell) / 100.0
1317
- if down_similarity < max(
1318
- 0.5,
1319
- 1.0 - (self.max_diffs / (len(down) if len(down) > 0 else 1)),
1320
- ):
1321
- all_relationships_satisfied = False
1322
- current_failed_reasons.append(
1323
- f"Cell below '{down_cell}' doesn't match expected '{down}' (similarity: {down_similarity:.2f})"
1324
- )
1325
-
1326
- # Check left relationship
1327
- if left and col_idx > 0:
1328
- left_cell = self.normalise(table_array[row_idx, col_idx - 1])
1329
- left_similarity = fuzz.ratio(left, left_cell) / 100.0
1330
- if left_similarity < max(
1331
- 0.5,
1332
- 1.0 - (self.max_diffs / (len(left) if len(left) > 0 else 1)),
1333
- ):
1334
- all_relationships_satisfied = False
1335
- current_failed_reasons.append(
1336
- f"Cell to the left '{left_cell}' doesn't match expected '{left}' (similarity: {left_similarity:.2f})"
1337
- )
1338
-
1339
- # Check right relationship
1340
- if right and col_idx < table_array.shape[1] - 1:
1341
- right_cell = self.normalise(table_array[row_idx, col_idx + 1])
1342
- right_similarity = fuzz.ratio(right, right_cell) / 100.0
1343
- if right_similarity < max(
1344
- 0.5,
1345
- 1.0 - (self.max_diffs / (len(right) if len(right) > 0 else 1)),
1346
- ):
1347
- all_relationships_satisfied = False
1348
- current_failed_reasons.append(
1349
- f"Cell to the right '{right_cell}' doesn't match expected '{right}' (similarity: {right_similarity:.2f})"
1350
- )
1351
-
1352
- # Check top heading relationship
1353
- if top_heading:
1354
- # Try to find a match in the column headers
1355
- top_heading_found = False
1356
- best_match = ""
1357
- best_similarity = 0
1358
-
1359
- # Check the col_headers dictionary first (this handles colspan properly)
1360
- if col_idx in table_data.col_headers:
1361
- for _, header_text in table_data.col_headers[col_idx]:
1362
- header_text = self.normalise(header_text)
1363
- similarity = fuzz.ratio(top_heading, header_text) / 100.0
1364
- if similarity > best_similarity:
1365
- best_similarity = similarity
1366
- best_match = header_text
1367
- if best_similarity >= max(
1368
- 0.5,
1369
- 1.0
1370
- - (
1371
- self.max_diffs
1372
- / (
1373
- len(top_heading)
1374
- if len(top_heading) > 0
1375
- else 1
1376
- )
1377
- ),
1378
- ):
1379
- top_heading_found = True
1380
- break
1381
-
1382
- # If no match found in col_headers, fall back to checking header rows
1383
- if not top_heading_found and header_rows:
1384
- for i in sorted(header_rows):
1385
- if i < row_idx and table_array[i, col_idx].strip():
1386
- header_text = self.normalise(table_array[i, col_idx])
1387
- similarity = (
1388
- fuzz.ratio(top_heading, header_text) / 100.0
1389
- )
1390
- if similarity > best_similarity:
1391
- best_similarity = similarity
1392
- best_match = header_text
1393
- if best_similarity >= max(
1394
- 0.5,
1395
- 1.0
1396
- - (
1397
- self.max_diffs
1398
- / (
1399
- len(top_heading)
1400
- if len(top_heading) > 0
1401
- else 1
1402
- )
1403
- ),
1404
- ):
1405
- top_heading_found = True
1406
- break
1407
-
1408
- # If still no match, use any non-empty cell above as a last resort
1409
- if not top_heading_found and not best_match and row_idx > 0:
1410
- for i in range(row_idx):
1411
- if table_array[i, col_idx].strip():
1412
- header_text = self.normalise(table_array[i, col_idx])
1413
- similarity = (
1414
- fuzz.ratio(top_heading, header_text) / 100.0
1415
- )
1416
- if similarity > best_similarity:
1417
- best_similarity = similarity
1418
- best_match = header_text
1419
-
1420
- if not best_match:
1421
- all_relationships_satisfied = False
1422
- current_failed_reasons.append(
1423
- f"No top heading found for cell at ({row_idx}, {col_idx})"
1424
- )
1425
- elif best_similarity < max(
1426
- 0.5,
1427
- 1.0
1428
- - (
1429
- self.max_diffs
1430
- / (len(top_heading) if len(top_heading) > 0 else 1)
1431
- ),
1432
- ):
1433
- all_relationships_satisfied = False
1434
- current_failed_reasons.append(
1435
- f"Top heading '{best_match}' doesn't match expected '{top_heading}' (similarity: {best_similarity:.2f})"
1436
- )
1437
-
1438
- # Check left heading relationship
1439
- if left_heading:
1440
- # Try to find a match in the row headers
1441
- left_heading_found = False
1442
- best_match = ""
1443
- best_similarity = 0
1444
-
1445
- # Check the row_headers dictionary first (this handles rowspan properly)
1446
- if row_idx in table_data.row_headers:
1447
- for _, header_text in table_data.row_headers[row_idx]:
1448
- header_text = self.normalise(header_text)
1449
- similarity = fuzz.ratio(left_heading, header_text) / 100.0
1450
- if similarity > best_similarity:
1451
- best_similarity = similarity
1452
- best_match = header_text
1453
- if best_similarity >= max(
1454
- 0.5,
1455
- 1.0
1456
- - (
1457
- self.max_diffs
1458
- / (
1459
- len(left_heading)
1460
- if len(left_heading) > 0
1461
- else 1
1462
- )
1463
- ),
1464
- ):
1465
- left_heading_found = True
1466
- break
1467
-
1468
- # If no match found in row_headers, fall back to checking header columns
1469
- if not left_heading_found and header_cols:
1470
- for j in sorted(header_cols):
1471
- if j < col_idx and table_array[row_idx, j].strip():
1472
- header_text = self.normalise(table_array[row_idx, j])
1473
- similarity = (
1474
- fuzz.ratio(left_heading, header_text) / 100.0
1475
- )
1476
- if similarity > best_similarity:
1477
- best_similarity = similarity
1478
- best_match = header_text
1479
- if best_similarity >= max(
1480
- 0.5,
1481
- 1.0
1482
- - (
1483
- self.max_diffs
1484
- / (
1485
- len(left_heading)
1486
- if len(left_heading) > 0
1487
- else 1
1488
- )
1489
- ),
1490
- ):
1491
- left_heading_found = True
1492
- break
1493
-
1494
- # If still no match, use any non-empty cell to the left as a last resort
1495
- if not left_heading_found and not best_match and col_idx > 0:
1496
- for j in range(col_idx):
1497
- if table_array[row_idx, j].strip():
1498
- header_text = self.normalise(table_array[row_idx, j])
1499
- similarity = (
1500
- fuzz.ratio(left_heading, header_text) / 100.0
1501
- )
1502
- if similarity > best_similarity:
1503
- best_similarity = similarity
1504
- best_match = header_text
1505
-
1506
- if not best_match:
1507
- all_relationships_satisfied = False
1508
- current_failed_reasons.append(
1509
- f"No left heading found for cell at ({row_idx}, {col_idx})"
1510
- )
1511
- elif best_similarity < max(
1512
- 0.5,
1513
- 1.0
1514
- - (
1515
- self.max_diffs
1516
- / (len(left_heading) if len(left_heading) > 0 else 1)
1517
- ),
1518
- ):
1519
- all_relationships_satisfied = False
1520
- current_failed_reasons.append(
1521
- f"Left heading '{best_match}' doesn't match expected '{left_heading}' (similarity: {best_similarity:.2f})"
1522
- )
1523
-
1524
- # If all relationships are satisfied for this cell, the test passes
1525
- if all_relationships_satisfied:
1526
- return True, ""
1527
- else:
1528
- failed_reasons.extend(current_failed_reasons)
1529
-
1530
- # If we've gone through all tables and all matching cells and none satisfied all relationships
1531
- if not failed_reasons:
1532
- return (
1533
- False,
1534
- f"No cell matching '{cell}' found in any table with threshold {threshold}",
1535
- )
1536
- else:
1537
- return (
1538
- False,
1539
- f"Found cells matching '{cell}' but relationships were not satisfied: {'; '.join(failed_reasons)}",
1540
- )
1541
-
1542
-
1543
- class BaselineTest(BasePDFTest):
1544
- """
1545
- This test makes sure that several baseline quality checks pass for the output generation.
1546
-
1547
- Namely, the output is not blank, not endlessly repeating, and contains characters of the proper
1548
- character sets.
1549
-
1550
- """
1551
-
1552
- max_repeats: int = 30
1553
- check_disallowed_characters: bool = True
1554
- type: Literal["baseline"] = Field(default="baseline")
1555
-
1556
- def run(self, content: str) -> Tuple[bool, str, float]:
1557
- if len("".join(c for c in content if c.isalnum()).strip()) == 0:
1558
- return False, "The text contains no alpha numeric characters", 0.0
1559
-
1560
- # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
1561
- # Honestly, this test doesn't seem to catch anything at the moment, maybe it can be refactored to a "text-quality"
1562
- # test or something, that measures repetition, non-blanks, charsets, etc
1563
- d = RepeatDetector(max_ngram_size=5)
1564
- d.add_letters(content)
1565
- repeats = d.ngram_repeats()
1566
-
1567
- for index, count in enumerate(repeats):
1568
- if count > self.max_repeats:
1569
- return (
1570
- False,
1571
- f"Text ends with {count} repeating {index+1}-grams, invalid",
1572
- 0.0,
1573
- )
1574
-
1575
- pattern = re.compile(
1576
- r"["
1577
- r"\u4e00-\u9FFF" # CJK Unified Ideographs (Chinese characters)
1578
- r"\u3040-\u309F" # Hiragana (Japanese)
1579
- r"\u30A0-\u30FF" # Katakana (Japanese)
1580
- r"\U0001F600-\U0001F64F" # Emoticons (Emoji)
1581
- r"\U0001F300-\U0001F5FF" # Miscellaneous Symbols and Pictographs (Emoji)
1582
- r"\U0001F680-\U0001F6FF" # Transport and Map Symbols (Emoji)
1583
- r"\U0001F1E0-\U0001F1FF" # Regional Indicator Symbols (flags, Emoji)
1584
- r"]",
1585
- flags=re.UNICODE,
1586
- )
1587
-
1588
- matches = pattern.findall(content)
1589
- if self.check_disallowed_characters and matches:
1590
- return False, f"Text contains disallowed characters {matches}", 0.0
1591
-
1592
- return True, "", 1.0
1593
-
1594
-
1595
- def load_tests(jsonl_file: str) -> List[BasePDFTest]:
1596
- """
1597
- Load tests from a JSONL file, one test definition per line.
1598
-
1599
- Args:
1600
- jsonl_file: Path to the JSONL file containing test definitions.
1601
-
1602
- Returns:
1603
- A list of test objects.
1604
- """
1605
-
1606
- def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
1607
- """
1608
- Process a single line from the JSONL file and return a tuple of (line_number, test object).
1609
- Returns None for empty lines.
1610
- """
1611
- line_number, line = line_tuple
1612
- line = line.strip()
1613
- if not line:
1614
- return None
1615
-
1616
- try:
1617
- data = json.loads(line)
1618
- if "resources" in data:
1619
- data.pop("resources")
1620
- if "tags" in data:
1621
- data.pop("tags")
1622
-
1623
- _data = {}
1624
- for k, v in data.items():
1625
- if isinstance(v, float) and math.isnan(v) or v is None:
1626
- continue
1627
- _data[k] = v
1628
- data = _data
1629
- test_type = data.get("type")
1630
-
1631
- if test_type in {"present", "absent"}:
1632
- test = TextPresenceTest(**data)
1633
- elif test_type == "order":
1634
- test = TextOrderTest(**data)
1635
- elif test_type == "table":
1636
- test = TableTest(**data)
1637
- elif test_type == "baseline":
1638
- test = BaselineTest(**data)
1639
- else:
1640
- raise ValidationError(f"Unknown test type: {test_type}")
1641
- return (line_number, test)
1642
- except json.JSONDecodeError as e:
1643
- print(f"Error parsing JSON on line {line_number}: {e}")
1644
- raise
1645
- except (ValidationError, KeyError) as e:
1646
- print(f"Error on line {line_number}: {e}")
1647
- raise
1648
- except Exception as e:
1649
- print(f"Unexpected error on line {line_number}: {e}")
1650
- raise
1651
-
1652
- tests = []
1653
-
1654
- # Read all lines along with their line numbers.
1655
- with open(jsonl_file, "r") as f:
1656
- lines = list(enumerate(f, start=1))
1657
- for line in lines:
1658
- tests.append(process_line(line)[1])
1659
-
1660
- # Check for duplicate test IDs after loading.
1661
- unique_ids = set()
1662
- for test in tests:
1663
- if test.id in unique_ids:
1664
- raise ValidationError(
1665
- f"Test with duplicate id {test.id} found, error loading tests."
1666
- )
1667
- unique_ids.add(test.id)
1668
-
1669
- return tests
1670
-
1671
-
1672
- def load_single_test(row: dict) -> Optional[BasePDFTest]:
1673
- """
1674
- Process a single test record (row dict) and return the corresponding test object.
1675
- Raises if the record is malformed or has an unknown test type.
1676
- """
1677
-
1678
- try:
1679
- _data = {}
1680
- for k, v in row.items():
1681
- if isinstance(v, float) and math.isnan(v) or v is None:
1682
- continue
1683
- _data[k] = v
1684
- data = _data
1685
- if "resources" in data:
1686
- data.pop("resources")
1687
- if "tags" in data:
1688
- data.pop("tags")
1689
- test_type = data.get("type")
1690
-
1691
- if test_type in {"present", "absent"}:
1692
- test = TextPresenceTest(**data)
1693
- elif test_type == "order":
1694
- test = TextOrderTest(**data)
1695
- elif test_type == "table":
1696
- test = TableTest(**data)
1697
- elif test_type == "baseline":
1698
- test = BaselineTest(**data)
1699
- else:
1700
- raise ValidationError(f"Unknown test type: {test_type}")
1701
- return test
1702
- except json.JSONDecodeError as e:
1703
- print(f"Error parsing ds on {row['id']}: {e}")
1704
- raise
1705
- except (ValidationError, KeyError) as e:
1706
- print(f"Error on line {row['id']}: {e}")
1707
- raise
1708
- except Exception as e:
1709
- print(f"Unexpected error on {row['id']}: {e}")
1710
- raise
1711
-
1712
-
1713
- def load_tests_from_ds(ds) -> List[BasePDFTest]:
1714
- """
1715
- Load tests from a dataset of test records.
1716
-
1717
- Args:
1718
- ds: Dataset of test records, e.g. a DataFrame convertible via to_dict(orient="records").
1719
-
1720
- Returns:
1721
- A list of test objects.
1722
- """
1723
-
1724
- tests = []
1725
-
1726
- # Build a test object for each record in the dataset.
1727
- for row in ds.to_dict(orient="records"):
1728
- tests.append(load_single_test(row))
1729
-
1730
- # _data = {}
1731
- # for k, v in row.items():
1732
- # if isinstance(v, float) and math.isnan(v) or v is None or k in ["pdf_path"]:
1733
- # continue
1734
- # _data[k] = v
1735
- # data = _data
1736
- # for k in ["max_diffs", "first_n", "last_n", "page"]:
1737
- # if k in data:
1738
- # data[k] = int(data[k])
1739
- # tests.append(load_single_test(data))
1740
-
1741
- # Check for duplicate test IDs after parallel processing.
1742
- unique_ids = set()
1743
- for test in tests:
1744
- if test.id in unique_ids:
1745
- raise ValidationError(
1746
- f"Test with duplicate id {test.id} found, error loading tests."
1747
- )
1748
- unique_ids.add(test.id)
1749
-
1750
- return tests
1751
-
1752
-
1753
- def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
1754
- """
1755
- Save tests to a JSONL file using model_dump for conversion.
1756
-
1757
- Args:
1758
- tests: A list of test objects.
1759
- jsonl_file: Path to the output JSONL file.
1760
- """
1761
- with open(jsonl_file, "w") as file:
1762
- for test in tests:
1763
- file.write(json.dumps(test.model_dump()) + "\n")