vlmparse-0.1.4-py3-none-any.whl → vlmparse-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/clients/docling.py +2 -2
- vlmparse/clients/dotsocr.py +11 -2
- vlmparse/clients/mineru.py +8 -7
- vlmparse/clients/openai_converter.py +1 -0
- vlmparse/converter_with_server.py +5 -4
- vlmparse/registries.py +2 -4
- vlmparse/servers/docker_server.py +1 -1
- vlmparse/servers/utils.py +3 -2
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
- vlmparse-0.1.5.dist-info/RECORD +36 -0
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse/benchpdf2md/utils.py +0 -56
- vlmparse-0.1.4.dist-info/RECORD +0 -51
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
vlmparse/benchpdf2md/olmocrbench/tests.py
@@ -1,1334 +0,0 @@
import json
import os
import re
import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Set, Tuple, Union

import numpy as np
from bs4 import BeautifulSoup
from fuzzysearch import find_near_matches
from rapidfuzz import fuzz
from tqdm import tqdm

from .katex.render import compare_rendered_equations, render_equation
from .repeatdetect import RepeatDetector

# Tell pytest these are not tests
__test__ = False


@dataclass
class TableData:
    """Class to hold table data and metadata about headers."""

    data: np.ndarray  # The actual table data
    header_rows: Set[int] = field(
        default_factory=set
    )  # Indices of rows that are headers
    header_cols: Set[int] = field(
        default_factory=set
    )  # Indices of columns that are headers
    col_headers: dict = field(
        default_factory=dict
    )  # Maps column index to header text, handling colspan
    row_headers: dict = field(
        default_factory=dict
    )  # Maps row index to header text, handling rowspan

    def __repr__(self) -> str:
        """Returns a concise representation of the TableData object for debugging."""
        return f"TableData(shape={self.data.shape}, header_rows={len(self.header_rows)}, header_cols={len(self.header_cols)})"

    def __str__(self) -> str:
        """Returns a pretty string representation of the table with header information."""
        output = []

        # Table dimensions
        output.append(
            f"Table: {self.data.shape[0]} rows × {self.data.shape[1]} columns"
        )

        # Header info
        output.append(f"Header rows: {sorted(self.header_rows)}")
        output.append(f"Header columns: {sorted(self.header_cols)}")

        # Table content with formatting
        separator = "+" + "+".join(["-" * 17] * self.data.shape[1]) + "+"

        # Add a header for row indices
        output.append(separator)
        headers = [""] + [f"Column {i}" for i in range(self.data.shape[1])]
        output.append(
            "| {:<5} | ".format("Row")
            + " | ".join(["{:<15}".format(h) for h in headers[1:]])
            + " |"
        )
        output.append(separator)

        # Format each row
        for i in range(min(self.data.shape[0], 15)):  # Limit to 15 rows for readability
            # Format cells, mark header cells
            cells = []
            for j in range(self.data.shape[1]):
                cell = str(self.data[i, j])
                if len(cell) > 15:
                    cell = cell[:12] + "..."
                # Mark header cells with *
                if i in self.header_rows or j in self.header_cols:
                    cell = f"*{cell}*"
                cells.append(cell)

            row_str = (
                "| {:<5} | ".format(i)
                + " | ".join(["{:<15}".format(c) for c in cells])
                + " |"
            )
            output.append(row_str)
        output.append(separator)

        # If table is too large, indicate truncation
        if self.data.shape[0] > 15:
            output.append(f"... {self.data.shape[0] - 15} more rows ...")

        # Column header details if available
        if self.col_headers:
            output.append("\nColumn header mappings:")
            for col, headers in sorted(self.col_headers.items()):
                header_strs = [f"({row}, '{text}')" for row, text in headers]
                output.append(f"  Column {col}: {', '.join(header_strs)}")

        # Row header details if available
        if self.row_headers:
            output.append("\nRow header mappings:")
            for row, headers in sorted(self.row_headers.items()):
                header_strs = [f"({col}, '{text}')" for col, text in headers]
                output.append(f"  Row {row}: {', '.join(header_strs)}")

        return "\n".join(output)


class TestType(str, Enum):
    BASELINE = "baseline"
    PRESENT = "present"
    ABSENT = "absent"
    ORDER = "order"
    TABLE = "table"
    MATH = "math"


class TestChecked(str, Enum):
    VERIFIED = "verified"
    REJECTED = "rejected"


class ValidationError(Exception):
    """Exception raised for validation errors."""

    pass


def normalize_text(md_content: str) -> str:
    if md_content is None:
        return None

    # Normalize <br> and <br/> to newlines
    md_content = re.sub(r"<br/?>", " ", md_content)

    # Normalize whitespace in the md_content
    md_content = re.sub(r"\s+", " ", md_content)

    # Remove markdown bold formatting (** or __ for bold)
    md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
    md_content = re.sub(r"__(.*?)__", r"\1", md_content)
    md_content = re.sub(r"</?b>", "", md_content)  # Remove <b> tags if they exist
    md_content = re.sub(r"</?i>", "", md_content)  # Remove <i> tags if they exist

    # Remove markdown italics formatting (* or _ for italics)
    md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
    md_content = re.sub(r"_(.*?)_", r"\1", md_content)

    # Convert down to a consistent unicode form, so é == e + accent, unicode forms
    md_content = unicodedata.normalize("NFC", md_content)

    # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
    replacements = {
        "‘": "'",
        "’": "'",
        "‚": "'",
        "“": '"',
        "”": '"',
        "„": '"',
        "_": "_",
        "–": "-",
        "—": "-",
        "‑": "-",
        "‒": "-",
        "−": "-",
        "\u00b5": "\u03bc",
    }

    # Apply all replacements from the dictionary
    for fancy_char, ascii_char in replacements.items():
        md_content = md_content.replace(fancy_char, ascii_char)

    return md_content


def parse_markdown_tables(md_content: str) -> List[TableData]:
    """
    Extract and parse all markdown tables from the provided content.
    Uses a direct approach to find and parse tables, which is more robust for tables
    at the end of files or with irregular formatting.

    Args:
        md_content: The markdown content containing tables

    Returns:
        A list of TableData objects, each containing the table data and header information
    """
    # Split the content into lines and process line by line
    lines = md_content.strip().split("\n")

    parsed_tables = []
    current_table_lines = []
    in_table = False

    # Identify potential tables by looking for lines with pipe characters
    for _, line in enumerate(lines):
        # Check if this line has pipe characters (a table row indicator)
        if "|" in line:
            # If we weren't in a table before, start a new one
            if not in_table:
                in_table = True
                current_table_lines = [line]
            else:
                # Continue adding to the current table
                current_table_lines.append(line)
        else:
            # No pipes in this line, so if we were in a table, we've reached its end
            if in_table:
                # Process the completed table if it has at least 2 rows
                if len(current_table_lines) >= 2:
                    table_data = _process_table_lines(current_table_lines)
                    if table_data and len(table_data) > 0:
                        # Convert to numpy array for easier manipulation
                        max_cols = max(len(row) for row in table_data)
                        padded_data = [
                            row + [""] * (max_cols - len(row)) for row in table_data
                        ]
                        table_array = np.array(padded_data)

                        # In markdown tables, the first row is typically a header row
                        header_rows = {0} if len(table_array) > 0 else set()

                        # Set up col_headers with first row headers for each column
                        col_headers = {}
                        if len(table_array) > 0:
                            for col_idx in range(table_array.shape[1]):
                                if col_idx < len(table_array[0]):
                                    col_headers[col_idx] = [
                                        (0, table_array[0, col_idx])
                                    ]

                        # Set up row_headers with first column headers for each row
                        row_headers = {}
                        if table_array.shape[1] > 0:
                            for row_idx in range(
                                1, table_array.shape[0]
                            ):  # Skip header row
                                row_headers[row_idx] = [
                                    (0, table_array[row_idx, 0])
                                ]  # First column as heading

                        # Create TableData object
                        parsed_tables.append(
                            TableData(
                                data=table_array,
                                header_rows=header_rows,
                                header_cols={0}
                                if table_array.shape[1] > 0
                                else set(),  # First column as header
                                col_headers=col_headers,
                                row_headers=row_headers,
                            )
                        )
                in_table = False

    # Process the last table if we're still tracking one at the end of the file
    if in_table and len(current_table_lines) >= 2:
        table_data = _process_table_lines(current_table_lines)
        if table_data and len(table_data) > 0:
            # Convert to numpy array
            max_cols = max(len(row) for row in table_data)
            padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
            table_array = np.array(padded_data)

            # In markdown tables, the first row is typically a header row
            header_rows = {0} if len(table_array) > 0 else set()

            # Set up col_headers with first row headers for each column
            col_headers = {}
            if len(table_array) > 0:
                for col_idx in range(table_array.shape[1]):
                    if col_idx < len(table_array[0]):
                        col_headers[col_idx] = [(0, table_array[0, col_idx])]

            # Set up row_headers with first column headers for each row
            row_headers = {}
            if table_array.shape[1] > 0:
                for row_idx in range(1, table_array.shape[0]):  # Skip header row
                    row_headers[row_idx] = [
                        (0, table_array[row_idx, 0])
                    ]  # First column as heading

            # Create TableData object
            parsed_tables.append(
                TableData(
                    data=table_array,
                    header_rows=header_rows,
                    header_cols={0}
                    if table_array.shape[1] > 0
                    else set(),  # First column as header
                    col_headers=col_headers,
                    row_headers=row_headers,
                )
            )

    return parsed_tables


def _process_table_lines(table_lines: List[str]) -> List[List[str]]:
    """
    Process a list of lines that potentially form a markdown table.

    Args:
        table_lines: List of strings, each representing a line in a potential markdown table

    Returns:
        A list of rows, each a list of cell values
    """
    table_data = []
    separator_row_index = None

    # First, identify the separator row (the row with dashes)
    for i, line in enumerate(table_lines):
        # Check if this looks like a separator row (contains mostly dashes)
        content_without_pipes = line.replace("|", "").strip()
        if content_without_pipes and all(c in "- :" for c in content_without_pipes):
            separator_row_index = i
            break

    # Process each line, filtering out the separator row
    for i, line in enumerate(table_lines):
        # Skip the separator row
        if i == separator_row_index:
            continue

        # Skip lines that are entirely formatting
        if line.strip() and all(c in "- :|" for c in line):
            continue

        # Process the cells in this row
        cells = [cell.strip() for cell in line.split("|")]

        # Remove empty cells at the beginning and end (caused by leading/trailing pipes)
        if cells and cells[0] == "":
            cells = cells[1:]
        if cells and cells[-1] == "":
            cells = cells[:-1]

        if cells:  # Only add non-empty rows
            table_data.append(cells)

    return table_data


def parse_html_tables(html_content: str) -> List[TableData]:
    """
    Extract and parse all HTML tables from the provided content.
    Identifies header rows and columns, and maps them properly handling rowspan/colspan.

    Args:
        html_content: The HTML content containing tables

    Returns:
        A list of TableData objects, each containing the table data and header information
    """
    soup = BeautifulSoup(html_content, "html.parser")
    tables = soup.find_all("table")

    parsed_tables = []

    for table in tables:
        rows = table.find_all(["tr"])
        table_data = []
        header_rows = set()
        header_cols = set()
        col_headers = {}  # Maps column index to all header cells above it
        row_headers = {}  # Maps row index to all header cells to its left

        # Find rows inside thead tags - these are definitely header rows
        thead = table.find("thead")
        if thead:
            thead_rows = thead.find_all("tr")
            for tr in thead_rows:
                header_rows.add(rows.index(tr))

        # Initialize a grid to track filled cells due to rowspan/colspan
        cell_grid = {}
        col_span_info = {}  # Tracks which columns contain headers
        row_span_info = {}  # Tracks which rows contain headers

        # First pass: process each row to build the raw table data and identify headers
        for row_idx, row in enumerate(rows):
            cells = row.find_all(["th", "td"])
            row_data = []
            col_idx = 0

            # If there are th elements in this row, it's likely a header row
            if row.find("th"):
                header_rows.add(row_idx)

            for cell in cells:
                # Skip positions already filled by rowspans from above
                while (row_idx, col_idx) in cell_grid:
                    row_data.append(cell_grid[(row_idx, col_idx)])
                    col_idx += 1

                # Replace <br> and <br/> tags with newlines before getting text
                for br in cell.find_all("br"):
                    br.replace_with("\n")
                cell_text = cell.get_text().strip()

                # Handle rowspan/colspan
                rowspan = int(cell.get("rowspan", 1))
                colspan = int(cell.get("colspan", 1))

                # Add the cell to the row data
                row_data.append(cell_text)

                # Fill the grid for this cell and its rowspan/colspan
                for i in range(rowspan):
                    for j in range(colspan):
                        if i == 0 and j == 0:
                            continue  # Skip the main cell position
                        # For rowspan cells, preserve the text in all spanned rows
                        if j == 0 and i > 0:  # Only for cells directly below
                            cell_grid[(row_idx + i, col_idx + j)] = cell_text
                        else:
                            cell_grid[(row_idx + i, col_idx + j)] = (
                                ""  # Mark other spans as empty
                            )

                # If this is a header cell (th), mark it and its span
                if cell.name == "th":
                    # Mark columns as header columns
                    for j in range(colspan):
                        header_cols.add(col_idx + j)

                    # For rowspan, mark spanned rows as part of header
                    for i in range(1, rowspan):
                        if row_idx + i < len(rows):
                            header_rows.add(row_idx + i)

                    # Record this header for all spanned columns
                    for j in range(colspan):
                        curr_col = col_idx + j
                        if curr_col not in col_headers:
                            col_headers[curr_col] = []
                        col_headers[curr_col].append((row_idx, cell_text))

                        # Store which columns are covered by this header
                        if cell_text and colspan > 1:
                            if cell_text not in col_span_info:
                                col_span_info[cell_text] = set()
                            col_span_info[cell_text].add(curr_col)

                    # Store which rows are covered by this header for rowspan
                    if cell_text and rowspan > 1:
                        if cell_text not in row_span_info:
                            row_span_info[cell_text] = set()
                        for i in range(rowspan):
                            row_span_info[cell_text].add(row_idx + i)

                # Also handle row headers from data cells that have rowspan
                if cell.name == "td" and rowspan > 1 and col_idx in header_cols:
                    for i in range(1, rowspan):
                        if row_idx + i < len(rows):
                            if row_idx + i not in row_headers:
                                row_headers[row_idx + i] = []
                            row_headers[row_idx + i].append((col_idx, cell_text))

                col_idx += colspan

            # Pad the row if needed to handle different row lengths
            table_data.append(row_data)

        # Second pass: expand headers to cells that should inherit them
        # First handle column headers
        for header_text, columns in col_span_info.items():
            for col in columns:
                # Add this header to all columns it spans over
                for row_idx in range(len(table_data)):
                    if row_idx not in header_rows:  # Only apply to data rows
                        for j in range(
                            col,
                            len(table_data[row_idx])
                            if row_idx < len(table_data)
                            else 0,
                        ):
                            # Add header info to data cells in these columns
                            if j not in col_headers:
                                col_headers[j] = []
                            if not any(h[1] == header_text for h in col_headers[j]):
                                header_row = min(
                                    [r for r, t in col_headers.get(col, [(0, "")])]
                                )
                                col_headers[j].append((header_row, header_text))

        # Handle row headers
        for header_text, rows in row_span_info.items():
            for row in rows:
                if row < len(table_data):
                    # Find first header column
                    header_col = min(header_cols) if header_cols else 0
                    if row not in row_headers:
                        row_headers[row] = []
                    if not any(h[1] == header_text for h in row_headers.get(row, [])):
                        row_headers[row].append((header_col, header_text))

        # Process regular row headers - each cell in a header column becomes a header for its row
        for col_idx in header_cols:
            for row_idx, row in enumerate(table_data):
                if col_idx < len(row) and row[col_idx].strip():
                    if row_idx not in row_headers:
                        row_headers[row_idx] = []
                    if not any(
                        h[1] == row[col_idx] for h in row_headers.get(row_idx, [])
                    ):
                        row_headers[row_idx].append((col_idx, row[col_idx]))

        # Calculate max columns for padding
        max_cols = max(len(row) for row in table_data) if table_data else 0

        # Ensure all rows have the same number of columns
        if table_data:
            padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
            table_array = np.array(padded_data)

            # Create TableData object with the table and header information
            parsed_tables.append(
                TableData(
                    data=table_array,
                    header_rows=header_rows,
                    header_cols=header_cols,
                    col_headers=col_headers,
                    row_headers=row_headers,
                )
            )

    return parsed_tables


@dataclass(kw_only=True)
class BasePDFTest:
    """
    Base class for all PDF test types.

    Attributes:
        pdf: The PDF filename.
        page: The page number for the test.
        id: Unique identifier for the test.
        type: The type of test.
        threshold: A float between 0 and 1 representing the threshold for fuzzy matching.
    """

    pdf: str
    page: int
    id: str
    type: str
    max_diffs: int = 0
    checked: Optional[TestChecked] = None
    url: Optional[str] = None

    def __post_init__(self):
        if not self.pdf:
            raise ValidationError("PDF filename cannot be empty")
        if not self.id:
            raise ValidationError("Test ID cannot be empty")
        if not isinstance(self.max_diffs, int) or self.max_diffs < 0:
            raise ValidationError("Max diffs must be positive number or 0")
        if self.type not in {t.value for t in TestType}:
            raise ValidationError(f"Invalid test type: {self.type}")

    def run(self, md_content: str) -> Tuple[bool, str]:
        """
        Run the test on the provided markdown content.

        Args:
            md_content: The content of the .md file.

        Returns:
            A tuple (passed, explanation) where 'passed' is True if the test passes,
            and 'explanation' provides details when the test fails.
        """
        raise NotImplementedError("Subclasses must implement the run method")


@dataclass
class TextPresenceTest(BasePDFTest):
    """
    Test to verify the presence or absence of specific text in a PDF.

    Attributes:
        text: The text string to search for.
    """

    text: str
    case_sensitive: bool = True
    first_n: Optional[int] = None
    last_n: Optional[int] = None

    def __post_init__(self):
        super().__post_init__()
        if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}:
            raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}")
        self.text = normalize_text(self.text)
        if not self.text.strip():
            raise ValidationError("Text field cannot be empty")

    def run(self, md_content: str) -> Tuple[bool, str]:
        reference_query = self.text

        # Normalize whitespace in the md_content
        md_content = normalize_text(md_content)

        if not self.case_sensitive:
            reference_query = reference_query.lower()
            md_content = md_content.lower()

        if self.first_n and self.last_n:
            md_content = md_content[: self.first_n] + md_content[-self.last_n :]
        elif self.first_n:
            md_content = md_content[: self.first_n]
        elif self.last_n:
            md_content = md_content[-self.last_n :]

        # Threshold for fuzzy matching derived from max_diffs
        threshold = 1.0 - (
            self.max_diffs / (len(reference_query) if len(reference_query) > 0 else 1)
        )
        best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0

        if self.type == TestType.PRESENT.value:
            if best_ratio >= threshold:
                return True, ""
            else:
                msg = (
                    f"Expected '{reference_query[:40]}...' with threshold {threshold} "
                    f"but best match ratio was {best_ratio:.3f}"
                )
                return False, msg
        else:  # ABSENT
            if best_ratio < threshold:
                return True, ""
            else:
                msg = (
                    f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} "
                    f"but best match ratio was {best_ratio:.3f}"
                )
                return False, msg


@dataclass
class TextOrderTest(BasePDFTest):
    """
    Test to verify that one text appears before another in a PDF.

    Attributes:
        before: The text expected to appear first.
        after: The text expected to appear after the 'before' text.
    """

    before: str
    after: str

    def __post_init__(self):
        super().__post_init__()
        if self.type != TestType.ORDER.value:
            raise ValidationError(f"Invalid type for TextOrderTest: {self.type}")
        self.before = normalize_text(self.before)
        self.after = normalize_text(self.after)
        if not self.before.strip():
            raise ValidationError("Before field cannot be empty")
        if not self.after.strip():
            raise ValidationError("After field cannot be empty")
        if (
            self.max_diffs > len(self.before) // 2
            or self.max_diffs > len(self.after) // 2
        ):
            raise ValidationError(
                "Max diffs is too large for this test, greater than 50% of the search string"
            )

    def run(self, md_content: str) -> Tuple[bool, str]:
        md_content = normalize_text(md_content)

        before_matches = find_near_matches(
            self.before, md_content, max_l_dist=self.max_diffs
        )
        after_matches = find_near_matches(
            self.after, md_content, max_l_dist=self.max_diffs
        )

        if not before_matches:
            return (
                False,
                f"'before' text '{self.before[:40]}...' not found with max_l_dist {self.max_diffs}",
            )
        if not after_matches:
            return (
                False,
                f"'after' text '{self.after[:40]}...' not found with max_l_dist {self.max_diffs}",
            )

        for before_match in before_matches:
            for after_match in after_matches:
                if before_match.start < after_match.start:
                    return True, ""
        return False, (
            f"Could not find a location where '{self.before[:40]}...' appears before "
            f"'{self.after[:40]}...'."
        )


@dataclass
class TableTest(BasePDFTest):
    """
    Test to verify certain properties of a table are held, namely that some cells appear relative to other cells correctly
    """

    # This is the target cell, which must exist in at least one place in the table
    cell: str

    # These properties say that the cell immediately up/down/left/right of the target cell has the string specified
    up: str = ""
    down: str = ""
    left: str = ""
    right: str = ""

    # These properties say that the cell all the way up, or all the way left of the target cell (ex. headings) has the string value specified
    top_heading: str = ""
    left_heading: str = ""

    ignore_markdown_tables: bool = False

    def __post_init__(self):
        super().__post_init__()
        if self.type != TestType.TABLE.value:
            raise ValidationError(f"Invalid type for TableTest: {self.type}")

        # Normalize the search text too
        self.cell = normalize_text(self.cell)
        self.up = normalize_text(self.up)
        self.down = normalize_text(self.down)
        self.left = normalize_text(self.left)
        self.right = normalize_text(self.right)
        self.top_heading = normalize_text(self.top_heading)
        self.left_heading = normalize_text(self.left_heading)

    def run(self, content: str) -> Tuple[bool, str]:
        """
        Run the table test on provided content.

        Finds all tables (markdown and/or HTML based on content_type) and checks if any cell
        matches the target cell and satisfies the specified relationships.

        Args:
            content: The content containing tables (markdown or HTML)

        Returns:
            A tuple (passed, explanation) where 'passed' is True if the test passes,
            and 'explanation' provides details when the test fails.
        """
        # Initialize variables to track tables and results
        tables_to_check = []
        failed_reasons = []

        # Threshold for fuzzy matching derived from max_diffs
        threshold = 1.0 - (
            self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1)
        )
        threshold = max(0.5, threshold)

        # Parse tables based on content_type
        if not self.ignore_markdown_tables:
            md_tables = parse_markdown_tables(content)
            tables_to_check.extend(md_tables)

        html_tables = parse_html_tables(content)
        tables_to_check.extend(html_tables)

        # If no tables found, return failure
        if not tables_to_check:
            return False, "No tables found in the content"

        # Check each table
        for table_data in tables_to_check:
            # Removed debug print statement
            table_array = table_data.data
            header_rows = table_data.header_rows
            header_cols = table_data.header_cols

            # Find all cells that match the target cell using fuzzy matching
            matches = []
            for i in range(table_array.shape[0]):
                for j in range(table_array.shape[1]):
                    cell_content = normalize_text(table_array[i, j])
                    similarity = fuzz.ratio(self.cell, cell_content) / 100.0

                    if similarity >= threshold:
                        matches.append((i, j))

            # If no matches found in this table, continue to the next table
            if not matches:
                continue

            # Check the relationships for each matching cell
            for row_idx, col_idx in matches:
                all_relationships_satisfied = True
                current_failed_reasons = []

                # Check up relationship
                if self.up and row_idx > 0:
                    up_cell = normalize_text(table_array[row_idx - 1, col_idx])
                    up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
                    if up_similarity < max(
                        0.5,
                        1.0
                        - (self.max_diffs / (len(self.up) if len(self.up) > 0 else 1)),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})"
                        )

                # Check down relationship
                if self.down and row_idx < table_array.shape[0] - 1:
                    down_cell = normalize_text(table_array[row_idx + 1, col_idx])
                    down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
                    if down_similarity < max(
                        0.5,
                        1.0
                        - (
                            self.max_diffs
                            / (len(self.down) if len(self.down) > 0 else 1)
                        ),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})"
                        )

                # Check left relationship
                if self.left and col_idx > 0:
                    left_cell = normalize_text(table_array[row_idx, col_idx - 1])
                    left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
                    if left_similarity < max(
                        0.5,
                        1.0
                        - (
                            self.max_diffs
                            / (len(self.left) if len(self.left) > 0 else 1)
                        ),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})"
                        )

                # Check right relationship
                if self.right and col_idx < table_array.shape[1] - 1:
                    right_cell = normalize_text(table_array[row_idx, col_idx + 1])
                    right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
                    if right_similarity < max(
                        0.5,
                        1.0
                        - (
                            self.max_diffs
                            / (len(self.right) if len(self.right) > 0 else 1)
                        ),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})"
                        )

                # Check top heading relationship
                if self.top_heading:
                    # Try to find a match in the column headers
                    top_heading_found = False
                    best_match = ""
                    best_similarity = 0

                    # Check the col_headers dictionary first (this handles colspan properly)
                    if col_idx in table_data.col_headers:
                        for _, header_text in table_data.col_headers[col_idx]:
                            header_text = normalize_text(header_text)
                            similarity = (
                                fuzz.ratio(self.top_heading, header_text) / 100.0
                            )
                            if similarity > best_similarity:
                                best_similarity = similarity
                                best_match = header_text
                            if best_similarity >= max(
                                0.5,
                                1.0
                                - (
                                    self.max_diffs
                                    / (
                                        len(self.top_heading)
                                        if len(self.top_heading) > 0
                                        else 1
                                    )
                                ),
                            ):
                                top_heading_found = True
                                break

                    # If no match found in col_headers, fall back to checking header rows
                    if not top_heading_found and header_rows:
                        for i in sorted(header_rows):
                            if i < row_idx and table_array[i, col_idx].strip():
                                header_text = normalize_text(table_array[i, col_idx])
                                similarity = (
                                    fuzz.ratio(self.top_heading, header_text) / 100.0
                                )
                                if similarity > best_similarity:
                                    best_similarity = similarity
                                    best_match = header_text
                                if best_similarity >= max(
                                    0.5,
                                    1.0
                                    - (
                                        self.max_diffs
                                        / (
                                            len(self.top_heading)
                                            if len(self.top_heading) > 0
                                            else 1
                                        )
                                    ),
                                ):
                                    top_heading_found = True
                                    break

                    # If still no match, use any non-empty cell above as a last resort
                    if not top_heading_found and not best_match and row_idx > 0:
                        for i in range(row_idx):
                            if table_array[i, col_idx].strip():
                                header_text = normalize_text(table_array[i, col_idx])
                                similarity = (
                                    fuzz.ratio(self.top_heading, header_text) / 100.0
                                )
                                if similarity > best_similarity:
                                    best_similarity = similarity
                                    best_match = header_text

                    if not best_match:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"No top heading found for cell at ({row_idx}, {col_idx})"
                        )
                    elif best_similarity < max(
                        0.5,
                        1.0
                        - (
                            self.max_diffs
                            / (
                                len(self.top_heading)
                                if len(self.top_heading) > 0
                                else 1
                            )
                        ),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Top heading '{best_match}' doesn't match expected '{self.top_heading}' (similarity: {best_similarity:.2f})"
                        )

                # Check left heading relationship
                if self.left_heading:
                    # Try to find a match in the row headers
                    left_heading_found = False
                    best_match = ""
                    best_similarity = 0

                    # Check the row_headers dictionary first (this handles rowspan properly)
                    if row_idx in table_data.row_headers:
                        for _, header_text in table_data.row_headers[row_idx]:
                            header_text = normalize_text(header_text)
                            similarity = (
                                fuzz.ratio(self.left_heading, header_text) / 100.0
                            )
                            if similarity > best_similarity:
                                best_similarity = similarity
                                best_match = header_text
                            if best_similarity >= max(
                                0.5,
                                1.0
                                - (
                                    self.max_diffs
                                    / (
                                        len(self.left_heading)
                                        if len(self.left_heading) > 0
                                        else 1
                                    )
                                ),
                            ):
                                left_heading_found = True
                                break

                    # If no match found in row_headers, fall back to checking header columns
                    if not left_heading_found and header_cols:
                        for j in sorted(header_cols):
                            if j < col_idx and table_array[row_idx, j].strip():
                                header_text = normalize_text(table_array[row_idx, j])
                                similarity = (
                                    fuzz.ratio(self.left_heading, header_text) / 100.0
                                )
                                if similarity > best_similarity:
                                    best_similarity = similarity
                                    best_match = header_text
                                if best_similarity >= max(
                                    0.5,
                                    1.0
                                    - (
                                        self.max_diffs
                                        / (
                                            len(self.left_heading)
                                            if len(self.left_heading) > 0
                                            else 1
                                        )
                                    ),
                                ):
                                    left_heading_found = True
                                    break

                    # If still no match, use any non-empty cell to the left as a last resort
                    if not left_heading_found and not best_match and col_idx > 0:
                        for j in range(col_idx):
                            if table_array[row_idx, j].strip():
                                header_text = normalize_text(table_array[row_idx, j])
                                similarity = (
                                    fuzz.ratio(self.left_heading, header_text) / 100.0
                                )
                                if similarity > best_similarity:
                                    best_similarity = similarity
                                    best_match = header_text

                    if not best_match:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"No left heading found for cell at ({row_idx}, {col_idx})"
                        )
                    elif best_similarity < max(
                        0.5,
                        1.0
                        - (
                            self.max_diffs
                            / (
                                len(self.left_heading)
                                if len(self.left_heading) > 0
                                else 1
                            )
                        ),
                    ):
                        all_relationships_satisfied = False
                        current_failed_reasons.append(
                            f"Left heading '{best_match}' doesn't match expected '{self.left_heading}' (similarity: {best_similarity:.2f})"
                        )

                # If all relationships are satisfied for this cell, the test passes
                if all_relationships_satisfied:
                    return True, ""
                else:
                    failed_reasons.extend(current_failed_reasons)

        # If we've gone through all tables and all matching cells and none satisfied all relationships
        if not failed_reasons:
            return (
                False,
                f"No cell matching '{self.cell}' found in any table with threshold {threshold}",
            )
        else:
            return (
                False,
                f"Found cells matching '{self.cell}' but relationships were not satisfied: {'; '.join(failed_reasons)}",
            )


@dataclass
class BaselineTest(BasePDFTest):
    """
    This test makes sure that several baseline quality checks pass for the output generation.

    Namely, the output is not blank, not endlessly repeating, and contains characters of the proper
    character sets.

    """

    max_length: Optional[int] = None  # Used to implement blank page checks
    max_length_skips_image_alt_tags: bool = False

    max_repeats: int = 30
    check_disallowed_characters: bool = True

    def run(self, content: str) -> Tuple[bool, str]:
        base_content_len = len("".join(c for c in content if c.isalnum()).strip())

        # If this a blank page check, then it short circuits the rest of the checks
        if self.max_length is not None:
            if self.max_length_skips_image_alt_tags:
                # Remove markdown image tags like ![...](...) from the text length count
                content_for_length_check = re.sub(r"!\[.*?\]\(.*?\)", "", content)
                base_content_len = len(
                    "".join(c for c in content_for_length_check if c.isalnum()).strip()
                )

            if base_content_len > self.max_length:
                return (
                    False,
                    f"{base_content_len} characters were output for a page we expected to be blank",
                )
            else:
                return True, ""

        if base_content_len == 0:
            return False, "The text contains no alpha numeric characters"

        # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
        # Honestly, this test doesn't seem to catch anything at the moment, maybe it can be refactored to a "text-quality"
        # test or something, that measures repetition, non-blanks, charsets, etc
        d = RepeatDetector(max_ngram_size=5)
        d.add_letters(content)
        repeats = d.ngram_repeats()

        for index, count in enumerate(repeats):
            if count > self.max_repeats:
                return (
                    False,
                    f"Text ends with {count} repeating {index+1}-grams, invalid",
                )

        pattern = re.compile(
            r"["
            r"\u4e00-\u9FFF"  # CJK Unified Ideographs (Chinese characters)
            r"\u3040-\u309F"  # Hiragana (Japanese)
            r"\u30A0-\u30FF"  # Katakana (Japanese)
            r"\U0001F600-\U0001F64F"  # Emoticons (Emoji)
            r"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs (Emoji)
            r"\U0001F680-\U0001F6FF"  # Transport and Map Symbols (Emoji)
            r"\U0001F1E0-\U0001F1FF"  # Regional Indicator Symbols (flags, Emoji)
            r"]",
            flags=re.UNICODE,
        )

        matches = pattern.findall(content)
        if self.check_disallowed_characters and matches:
            return False, f"Text contains disallowed characters {matches}"

        return True, ""


@dataclass
class MathTest(BasePDFTest):
    math: str

    ignore_dollar_delimited: bool = False

    def __post_init__(self):
        super().__post_init__()
        if self.type != TestType.MATH.value:
            raise ValidationError(f"Invalid type for MathTest: {self.type}")
        if len(self.math.strip()) == 0:
            raise ValidationError("Math test must have non-empty math expression")

        self.reference_render = render_equation(self.math)

        if self.reference_render is None:
            raise ValidationError(f"Math equation {self.math} was not able to render")

    def run(self, content: str) -> Tuple[bool, str]:
        # Store both the search pattern and the full pattern to replace
        patterns = [
            (r"\\\((.+?)\\\)", r"\\\((.+?)\\\)"),  # \(...\)
            (r"\\\[(.+?)\\\]", r"\\\[(.+?)\\\]"),  # \[...\]
        ]

        if not self.ignore_dollar_delimited:
            patterns.extend(
                [
                    (r"\$\$(.+?)\$\$", r"\$\$(.+?)\$\$"),  # $$...$$
                    (r"\$(.+?)\$", r"\$(.+?)\$"),  # $...$])
                ]
            )

        equations = []
        modified_content = content

        for search_pattern, replace_pattern in patterns:
            # Find all matches for the current pattern
            matches = re.findall(search_pattern, modified_content, re.DOTALL)
            equations.extend([e.strip() for e in matches])

            # Replace all instances of this pattern with empty strings
            modified_content = re.sub(
                replace_pattern, "", modified_content, flags=re.DOTALL
            )

        # If an equation in the markdown exactly matches our math string, then that's good enough
        # we don't have to do a more expensive comparison
        if any(hyp == self.math for hyp in equations):
            return True, ""

        # If not, then let's render the math equation itself and now compare to each hypothesis
        # But, to speed things up, since rendering equations is hard, we sort the equations on the page
        # by fuzzy similarity to the hypothesis
        equations.sort(key=lambda x: -fuzz.ratio(x, self.math))
        for hypothesis in equations:
            hypothesis_render = render_equation(hypothesis)

            if not hypothesis_render:
                continue

            if compare_rendered_equations(self.reference_render, hypothesis_render):
                return True, ""

        # self.reference_render.save(f"maths/{self.id}_ref.png", format="PNG")
        # best_match_render.save(f"maths/{self.id}_hyp.png", format="PNG")

        return False, f"No match found for {self.math} anywhere in content"


def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
    """
    Load a single test from a JSON line string or JSON object.

    Args:
        data: Either a JSON string to parse or a dictionary containing test data.

    Returns:
        A test object of the appropriate type.

    Raises:
        ValidationError: If the test type is unknown or data is invalid.
        json.JSONDecodeError: If the string cannot be parsed as JSON.
    """
    # Handle JSON string input
    if isinstance(data, str):
        data = data.strip()
        if not data:
            raise ValueError("Empty string provided")
        data = json.loads(data)

    # Process the test data
    test_type = data.get("type")
    if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
        test = TextPresenceTest(**data)
    elif test_type == TestType.ORDER.value:
        test = TextOrderTest(**data)
    elif test_type == TestType.TABLE.value:
        test = TableTest(**data)
    elif test_type == TestType.MATH.value:
        test = MathTest(**data)
    elif test_type == TestType.BASELINE.value:
        test = BaselineTest(**data)
    else:
        raise ValidationError(f"Unknown test type: {test_type}")

    return test


def load_tests(jsonl_file: str) -> List[BasePDFTest]:
    """
    Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.

    Args:
        jsonl_file: Path to the JSONL file containing test definitions.

    Returns:
        A list of test objects.
    """

    def process_line_with_number(
        line_tuple: Tuple[int, str],
    ) -> Optional[Tuple[int, BasePDFTest]]:
        """
        Process a single line from the JSONL file and return a tuple of (line_number, test object).
        Returns None for empty lines.
        """
        line_number, line = line_tuple
        line = line.strip()
        if not line:
            return None

        try:
            test = load_single_test(line)
            return (line_number, test)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON on line {line_number}: {e}")
            raise
        except (ValidationError, KeyError) as e:
            print(f"Error on line {line_number}: {e}")
            raise
        except Exception as e:
            print(f"Unexpected error on line {line_number}: {e}")
            raise

    tests = []

    # Read all lines along with their line numbers.
    with open(jsonl_file, "r") as f:
        lines = list(enumerate(f, start=1))

    # Use a ThreadPoolExecutor to process each line in parallel.
    with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
        # Submit all tasks concurrently.
        futures = {
            executor.submit(process_line_with_number, item): item[0] for item in lines
        }
        # Use tqdm to show progress as futures complete.
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Loading tests"
        ):
            result = future.result()
            if result is not None:
                _, test = result
                tests.append(test)

    # Check for duplicate test IDs after parallel processing.
    unique_ids = set()
    for test in tests:
        if test.id in unique_ids:
            raise ValidationError(
                f"Test with duplicate id {test.id} found, error loading tests."
            )
        unique_ids.add(test.id)

    return tests


def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
    """
    Save tests to a JSONL file using asdict for conversion.

    Args:
        tests: A list of test objects.
        jsonl_file: Path to the output JSONL file.
    """
    with open(jsonl_file, "w") as file:
        for test in tests:
            file.write(json.dumps(asdict(test)) + "\n")