vlmparse 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. vlmparse/clients/docling.py +2 -2
  2. vlmparse/clients/dotsocr.py +11 -2
  3. vlmparse/clients/mineru.py +8 -7
  4. vlmparse/clients/openai_converter.py +1 -0
  5. vlmparse/converter_with_server.py +5 -4
  6. vlmparse/registries.py +2 -4
  7. vlmparse/servers/docker_server.py +1 -1
  8. vlmparse/servers/utils.py +3 -2
  9. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
  10. vlmparse-0.1.5.dist-info/RECORD +36 -0
  11. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  12. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  13. vlmparse/benchpdf2md/create_dataset.py +0 -60
  14. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  15. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  16. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  17. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  18. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  19. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  20. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  21. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  22. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  23. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  24. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  25. vlmparse/benchpdf2md/utils.py +0 -56
  26. vlmparse-0.1.4.dist-info/RECORD +0 -51
  27. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
  28. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
  29. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
  30. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
vlmparse/benchpdf2md/olmocrbench/tests.py
@@ -1,1334 +0,0 @@
- import json
- import os
- import re
- import unicodedata
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from dataclasses import asdict, dataclass, field
- from enum import Enum
- from typing import Dict, List, Optional, Set, Tuple, Union
-
- import numpy as np
- from bs4 import BeautifulSoup
- from fuzzysearch import find_near_matches
- from rapidfuzz import fuzz
- from tqdm import tqdm
-
- from .katex.render import compare_rendered_equations, render_equation
- from .repeatdetect import RepeatDetector
-
- # Tell pytest these are not tests
- __test__ = False
-
-
- @dataclass
- class TableData:
-     """Class to hold table data and metadata about headers."""
-
-     data: np.ndarray  # The actual table data
-     header_rows: Set[int] = field(
-         default_factory=set
-     )  # Indices of rows that are headers
-     header_cols: Set[int] = field(
-         default_factory=set
-     )  # Indices of columns that are headers
-     col_headers: dict = field(
-         default_factory=dict
-     )  # Maps column index to header text, handling colspan
-     row_headers: dict = field(
-         default_factory=dict
-     )  # Maps row index to header text, handling rowspan
-
-     def __repr__(self) -> str:
-         """Returns a concise representation of the TableData object for debugging."""
-         return f"TableData(shape={self.data.shape}, header_rows={len(self.header_rows)}, header_cols={len(self.header_cols)})"
-
-     def __str__(self) -> str:
-         """Returns a pretty string representation of the table with header information."""
-         output = []
-
-         # Table dimensions
-         output.append(
-             f"Table: {self.data.shape[0]} rows × {self.data.shape[1]} columns"
-         )
-
-         # Header info
-         output.append(f"Header rows: {sorted(self.header_rows)}")
-         output.append(f"Header columns: {sorted(self.header_cols)}")
-
-         # Table content with formatting
-         separator = "+" + "+".join(["-" * 17] * self.data.shape[1]) + "+"
-
-         # Add a header for row indices
-         output.append(separator)
-         headers = [""] + [f"Column {i}" for i in range(self.data.shape[1])]
-         output.append(
-             "| {:<5} | ".format("Row")
-             + " | ".join(["{:<15}".format(h) for h in headers[1:]])
-             + " |"
-         )
-         output.append(separator)
-
-         # Format each row
-         for i in range(min(self.data.shape[0], 15)):  # Limit to 15 rows for readability
-             # Format cells, mark header cells
-             cells = []
-             for j in range(self.data.shape[1]):
-                 cell = str(self.data[i, j])
-                 if len(cell) > 15:
-                     cell = cell[:12] + "..."
-                 # Mark header cells with *
-                 if i in self.header_rows or j in self.header_cols:
-                     cell = f"*{cell}*"
-                 cells.append(cell)
-
-             row_str = (
-                 "| {:<5} | ".format(i)
-                 + " | ".join(["{:<15}".format(c) for c in cells])
-                 + " |"
-             )
-             output.append(row_str)
-         output.append(separator)
-
-         # If table is too large, indicate truncation
-         if self.data.shape[0] > 15:
-             output.append(f"... {self.data.shape[0] - 15} more rows ...")
-
-         # Column header details if available
-         if self.col_headers:
-             output.append("\nColumn header mappings:")
-             for col, headers in sorted(self.col_headers.items()):
-                 header_strs = [f"({row}, '{text}')" for row, text in headers]
-                 output.append(f" Column {col}: {', '.join(header_strs)}")
-
-         # Row header details if available
-         if self.row_headers:
-             output.append("\nRow header mappings:")
-             for row, headers in sorted(self.row_headers.items()):
-                 header_strs = [f"({col}, '{text}')" for col, text in headers]
-                 output.append(f" Row {row}: {', '.join(header_strs)}")
-
-         return "\n".join(output)
-
-
- class TestType(str, Enum):
-     BASELINE = "baseline"
-     PRESENT = "present"
-     ABSENT = "absent"
-     ORDER = "order"
-     TABLE = "table"
-     MATH = "math"
-
-
- class TestChecked(str, Enum):
-     VERIFIED = "verified"
-     REJECTED = "rejected"
-
-
- class ValidationError(Exception):
-     """Exception raised for validation errors."""
-
-     pass
-
-
- def normalize_text(md_content: str) -> str:
-     if md_content is None:
-         return None
-
-     # Normalize <br> and <br/> to newlines
-     md_content = re.sub(r"<br/?>", " ", md_content)
-
-     # Normalize whitespace in the md_content
-     md_content = re.sub(r"\s+", " ", md_content)
-
-     # Remove markdown bold formatting (** or __ for bold)
-     md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
-     md_content = re.sub(r"__(.*?)__", r"\1", md_content)
-     md_content = re.sub(r"</?b>", "", md_content)  # Remove <b> tags if they exist
-     md_content = re.sub(r"</?i>", "", md_content)  # Remove <i> tags if they exist
-
-     # Remove markdown italics formatting (* or _ for italics)
-     md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
-     md_content = re.sub(r"_(.*?)_", r"\1", md_content)
-
-     # Convert down to a consistent unicode form, so é == e + accent, unicode forms
-     md_content = unicodedata.normalize("NFC", md_content)
-
-     # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
-     replacements = {
-         "‘": "'",
-         "’": "'",
-         "‚": "'",
-         "“": '"',
-         "”": '"',
-         "„": '"',
-         "_": "_",
-         "–": "-",
-         "—": "-",
-         "‑": "-",
-         "‒": "-",
-         "−": "-",
-         "\u00b5": "\u03bc",
-     }
-
-     # Apply all replacements from the dictionary
-     for fancy_char, ascii_char in replacements.items():
-         md_content = md_content.replace(fancy_char, ascii_char)
-
-     return md_content
-
-
- def parse_markdown_tables(md_content: str) -> List[TableData]:
-     """
-     Extract and parse all markdown tables from the provided content.
-     Uses a direct approach to find and parse tables, which is more robust for tables
-     at the end of files or with irregular formatting.
-
-     Args:
-         md_content: The markdown content containing tables
-
-     Returns:
-         A list of TableData objects, each containing the table data and header information
-     """
-     # Split the content into lines and process line by line
-     lines = md_content.strip().split("\n")
-
-     parsed_tables = []
-     current_table_lines = []
-     in_table = False
-
-     # Identify potential tables by looking for lines with pipe characters
-     for _, line in enumerate(lines):
-         # Check if this line has pipe characters (a table row indicator)
-         if "|" in line:
-             # If we weren't in a table before, start a new one
-             if not in_table:
-                 in_table = True
-                 current_table_lines = [line]
-             else:
-                 # Continue adding to the current table
-                 current_table_lines.append(line)
-         else:
-             # No pipes in this line, so if we were in a table, we've reached its end
-             if in_table:
-                 # Process the completed table if it has at least 2 rows
-                 if len(current_table_lines) >= 2:
-                     table_data = _process_table_lines(current_table_lines)
-                     if table_data and len(table_data) > 0:
-                         # Convert to numpy array for easier manipulation
-                         max_cols = max(len(row) for row in table_data)
-                         padded_data = [
-                             row + [""] * (max_cols - len(row)) for row in table_data
-                         ]
-                         table_array = np.array(padded_data)
-
-                         # In markdown tables, the first row is typically a header row
-                         header_rows = {0} if len(table_array) > 0 else set()
-
-                         # Set up col_headers with first row headers for each column
-                         col_headers = {}
-                         if len(table_array) > 0:
-                             for col_idx in range(table_array.shape[1]):
-                                 if col_idx < len(table_array[0]):
-                                     col_headers[col_idx] = [
-                                         (0, table_array[0, col_idx])
-                                     ]
-
-                         # Set up row_headers with first column headers for each row
-                         row_headers = {}
-                         if table_array.shape[1] > 0:
-                             for row_idx in range(
-                                 1, table_array.shape[0]
-                             ):  # Skip header row
-                                 row_headers[row_idx] = [
-                                     (0, table_array[row_idx, 0])
-                                 ]  # First column as heading
-
-                         # Create TableData object
-                         parsed_tables.append(
-                             TableData(
-                                 data=table_array,
-                                 header_rows=header_rows,
-                                 header_cols={0}
-                                 if table_array.shape[1] > 0
-                                 else set(),  # First column as header
-                                 col_headers=col_headers,
-                                 row_headers=row_headers,
-                             )
-                         )
-                 in_table = False
-
-     # Process the last table if we're still tracking one at the end of the file
-     if in_table and len(current_table_lines) >= 2:
-         table_data = _process_table_lines(current_table_lines)
-         if table_data and len(table_data) > 0:
-             # Convert to numpy array
-             max_cols = max(len(row) for row in table_data)
-             padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
-             table_array = np.array(padded_data)
-
-             # In markdown tables, the first row is typically a header row
-             header_rows = {0} if len(table_array) > 0 else set()
-
-             # Set up col_headers with first row headers for each column
-             col_headers = {}
-             if len(table_array) > 0:
-                 for col_idx in range(table_array.shape[1]):
-                     if col_idx < len(table_array[0]):
-                         col_headers[col_idx] = [(0, table_array[0, col_idx])]
-
-             # Set up row_headers with first column headers for each row
-             row_headers = {}
-             if table_array.shape[1] > 0:
-                 for row_idx in range(1, table_array.shape[0]):  # Skip header row
-                     row_headers[row_idx] = [
-                         (0, table_array[row_idx, 0])
-                     ]  # First column as heading
-
-             # Create TableData object
-             parsed_tables.append(
-                 TableData(
-                     data=table_array,
-                     header_rows=header_rows,
-                     header_cols={0}
-                     if table_array.shape[1] > 0
-                     else set(),  # First column as header
-                     col_headers=col_headers,
-                     row_headers=row_headers,
-                 )
-             )
-
-     return parsed_tables
-
-
- def _process_table_lines(table_lines: List[str]) -> List[List[str]]:
-     """
-     Process a list of lines that potentially form a markdown table.
-
-     Args:
-         table_lines: List of strings, each representing a line in a potential markdown table
-
-     Returns:
-         A list of rows, each a list of cell values
-     """
-     table_data = []
-     separator_row_index = None
-
-     # First, identify the separator row (the row with dashes)
-     for i, line in enumerate(table_lines):
-         # Check if this looks like a separator row (contains mostly dashes)
-         content_without_pipes = line.replace("|", "").strip()
-         if content_without_pipes and all(c in "- :" for c in content_without_pipes):
-             separator_row_index = i
-             break
-
-     # Process each line, filtering out the separator row
-     for i, line in enumerate(table_lines):
-         # Skip the separator row
-         if i == separator_row_index:
-             continue
-
-         # Skip lines that are entirely formatting
-         if line.strip() and all(c in "- :|" for c in line):
-             continue
-
-         # Process the cells in this row
-         cells = [cell.strip() for cell in line.split("|")]
-
-         # Remove empty cells at the beginning and end (caused by leading/trailing pipes)
-         if cells and cells[0] == "":
-             cells = cells[1:]
-         if cells and cells[-1] == "":
-             cells = cells[:-1]
-
-         if cells:  # Only add non-empty rows
-             table_data.append(cells)
-
-     return table_data
-
-
- def parse_html_tables(html_content: str) -> List[TableData]:
-     """
-     Extract and parse all HTML tables from the provided content.
-     Identifies header rows and columns, and maps them properly handling rowspan/colspan.
-
-     Args:
-         html_content: The HTML content containing tables
-
-     Returns:
-         A list of TableData objects, each containing the table data and header information
-     """
-     soup = BeautifulSoup(html_content, "html.parser")
-     tables = soup.find_all("table")
-
-     parsed_tables = []
-
-     for table in tables:
-         rows = table.find_all(["tr"])
-         table_data = []
-         header_rows = set()
-         header_cols = set()
-         col_headers = {}  # Maps column index to all header cells above it
-         row_headers = {}  # Maps row index to all header cells to its left
-
-         # Find rows inside thead tags - these are definitely header rows
-         thead = table.find("thead")
-         if thead:
-             thead_rows = thead.find_all("tr")
-             for tr in thead_rows:
-                 header_rows.add(rows.index(tr))
-
-         # Initialize a grid to track filled cells due to rowspan/colspan
-         cell_grid = {}
-         col_span_info = {}  # Tracks which columns contain headers
-         row_span_info = {}  # Tracks which rows contain headers
-
-         # First pass: process each row to build the raw table data and identify headers
-         for row_idx, row in enumerate(rows):
-             cells = row.find_all(["th", "td"])
-             row_data = []
-             col_idx = 0
-
-             # If there are th elements in this row, it's likely a header row
-             if row.find("th"):
-                 header_rows.add(row_idx)
-
-             for cell in cells:
-                 # Skip positions already filled by rowspans from above
-                 while (row_idx, col_idx) in cell_grid:
-                     row_data.append(cell_grid[(row_idx, col_idx)])
-                     col_idx += 1
-
-                 # Replace <br> and <br/> tags with newlines before getting text
-                 for br in cell.find_all("br"):
-                     br.replace_with("\n")
-                 cell_text = cell.get_text().strip()
-
-                 # Handle rowspan/colspan
-                 rowspan = int(cell.get("rowspan", 1))
-                 colspan = int(cell.get("colspan", 1))
-
-                 # Add the cell to the row data
-                 row_data.append(cell_text)
-
-                 # Fill the grid for this cell and its rowspan/colspan
-                 for i in range(rowspan):
-                     for j in range(colspan):
-                         if i == 0 and j == 0:
-                             continue  # Skip the main cell position
-                         # For rowspan cells, preserve the text in all spanned rows
-                         if j == 0 and i > 0:  # Only for cells directly below
-                             cell_grid[(row_idx + i, col_idx + j)] = cell_text
-                         else:
-                             cell_grid[(row_idx + i, col_idx + j)] = (
-                                 ""  # Mark other spans as empty
-                             )
-
-                 # If this is a header cell (th), mark it and its span
-                 if cell.name == "th":
-                     # Mark columns as header columns
-                     for j in range(colspan):
-                         header_cols.add(col_idx + j)
-
-                     # For rowspan, mark spanned rows as part of header
-                     for i in range(1, rowspan):
-                         if row_idx + i < len(rows):
-                             header_rows.add(row_idx + i)
-
-                     # Record this header for all spanned columns
-                     for j in range(colspan):
-                         curr_col = col_idx + j
-                         if curr_col not in col_headers:
-                             col_headers[curr_col] = []
-                         col_headers[curr_col].append((row_idx, cell_text))
-
-                         # Store which columns are covered by this header
-                         if cell_text and colspan > 1:
-                             if cell_text not in col_span_info:
-                                 col_span_info[cell_text] = set()
-                             col_span_info[cell_text].add(curr_col)
-
-                     # Store which rows are covered by this header for rowspan
-                     if cell_text and rowspan > 1:
-                         if cell_text not in row_span_info:
-                             row_span_info[cell_text] = set()
-                         for i in range(rowspan):
-                             row_span_info[cell_text].add(row_idx + i)
-
-                 # Also handle row headers from data cells that have rowspan
-                 if cell.name == "td" and rowspan > 1 and col_idx in header_cols:
-                     for i in range(1, rowspan):
-                         if row_idx + i < len(rows):
-                             if row_idx + i not in row_headers:
-                                 row_headers[row_idx + i] = []
-                             row_headers[row_idx + i].append((col_idx, cell_text))
-
-                 col_idx += colspan
-
-             # Pad the row if needed to handle different row lengths
-             table_data.append(row_data)
-
-         # Second pass: expand headers to cells that should inherit them
-         # First handle column headers
-         for header_text, columns in col_span_info.items():
-             for col in columns:
-                 # Add this header to all columns it spans over
-                 for row_idx in range(len(table_data)):
-                     if row_idx not in header_rows:  # Only apply to data rows
-                         for j in range(
-                             col,
-                             len(table_data[row_idx])
-                             if row_idx < len(table_data)
-                             else 0,
-                         ):
-                             # Add header info to data cells in these columns
-                             if j not in col_headers:
-                                 col_headers[j] = []
-                             if not any(h[1] == header_text for h in col_headers[j]):
-                                 header_row = min(
-                                     [r for r, t in col_headers.get(col, [(0, "")])]
-                                 )
-                                 col_headers[j].append((header_row, header_text))
-
-         # Handle row headers
-         for header_text, rows in row_span_info.items():
-             for row in rows:
-                 if row < len(table_data):
-                     # Find first header column
-                     header_col = min(header_cols) if header_cols else 0
-                     if row not in row_headers:
-                         row_headers[row] = []
-                     if not any(h[1] == header_text for h in row_headers.get(row, [])):
-                         row_headers[row].append((header_col, header_text))
-
-         # Process regular row headers - each cell in a header column becomes a header for its row
-         for col_idx in header_cols:
-             for row_idx, row in enumerate(table_data):
-                 if col_idx < len(row) and row[col_idx].strip():
-                     if row_idx not in row_headers:
-                         row_headers[row_idx] = []
-                     if not any(
-                         h[1] == row[col_idx] for h in row_headers.get(row_idx, [])
-                     ):
-                         row_headers[row_idx].append((col_idx, row[col_idx]))
-
-         # Calculate max columns for padding
-         max_cols = max(len(row) for row in table_data) if table_data else 0
-
-         # Ensure all rows have the same number of columns
-         if table_data:
-             padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
-             table_array = np.array(padded_data)
-
-             # Create TableData object with the table and header information
-             parsed_tables.append(
-                 TableData(
-                     data=table_array,
-                     header_rows=header_rows,
-                     header_cols=header_cols,
-                     col_headers=col_headers,
-                     row_headers=row_headers,
-                 )
-             )
-
-     return parsed_tables
-
-
- @dataclass(kw_only=True)
- class BasePDFTest:
-     """
-     Base class for all PDF test types.
-
-     Attributes:
-         pdf: The PDF filename.
-         page: The page number for the test.
-         id: Unique identifier for the test.
-         type: The type of test.
-         threshold: A float between 0 and 1 representing the threshold for fuzzy matching.
-     """
-
-     pdf: str
-     page: int
-     id: str
-     type: str
-     max_diffs: int = 0
-     checked: Optional[TestChecked] = None
-     url: Optional[str] = None
-
-     def __post_init__(self):
-         if not self.pdf:
-             raise ValidationError("PDF filename cannot be empty")
-         if not self.id:
-             raise ValidationError("Test ID cannot be empty")
-         if not isinstance(self.max_diffs, int) or self.max_diffs < 0:
-             raise ValidationError("Max diffs must be positive number or 0")
-         if self.type not in {t.value for t in TestType}:
-             raise ValidationError(f"Invalid test type: {self.type}")
-
-     def run(self, md_content: str) -> Tuple[bool, str]:
-         """
-         Run the test on the provided markdown content.
-
-         Args:
-             md_content: The content of the .md file.
-
-         Returns:
-             A tuple (passed, explanation) where 'passed' is True if the test passes,
-             and 'explanation' provides details when the test fails.
-         """
-         raise NotImplementedError("Subclasses must implement the run method")
-
-
- @dataclass
- class TextPresenceTest(BasePDFTest):
-     """
-     Test to verify the presence or absence of specific text in a PDF.
-
-     Attributes:
-         text: The text string to search for.
-     """
-
-     text: str
-     case_sensitive: bool = True
-     first_n: Optional[int] = None
-     last_n: Optional[int] = None
-
-     def __post_init__(self):
-         super().__post_init__()
-         if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}:
-             raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}")
-         self.text = normalize_text(self.text)
-         if not self.text.strip():
-             raise ValidationError("Text field cannot be empty")
-
-     def run(self, md_content: str) -> Tuple[bool, str]:
-         reference_query = self.text
-
-         # Normalize whitespace in the md_content
-         md_content = normalize_text(md_content)
-
-         if not self.case_sensitive:
-             reference_query = reference_query.lower()
-             md_content = md_content.lower()
-
-         if self.first_n and self.last_n:
-             md_content = md_content[: self.first_n] + md_content[-self.last_n :]
-         elif self.first_n:
-             md_content = md_content[: self.first_n]
-         elif self.last_n:
-             md_content = md_content[-self.last_n :]
-
-         # Threshold for fuzzy matching derived from max_diffs
-         threshold = 1.0 - (
-             self.max_diffs / (len(reference_query) if len(reference_query) > 0 else 1)
-         )
-         best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0
-
-         if self.type == TestType.PRESENT.value:
-             if best_ratio >= threshold:
-                 return True, ""
-             else:
-                 msg = (
-                     f"Expected '{reference_query[:40]}...' with threshold {threshold} "
-                     f"but best match ratio was {best_ratio:.3f}"
-                 )
-                 return False, msg
-         else:  # ABSENT
-             if best_ratio < threshold:
-                 return True, ""
-             else:
-                 msg = (
-                     f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} "
-                     f"but best match ratio was {best_ratio:.3f}"
-                 )
-                 return False, msg
-
-
- @dataclass
- class TextOrderTest(BasePDFTest):
-     """
-     Test to verify that one text appears before another in a PDF.
-
-     Attributes:
-         before: The text expected to appear first.
-         after: The text expected to appear after the 'before' text.
-     """
-
-     before: str
-     after: str
-
-     def __post_init__(self):
-         super().__post_init__()
-         if self.type != TestType.ORDER.value:
-             raise ValidationError(f"Invalid type for TextOrderTest: {self.type}")
-         self.before = normalize_text(self.before)
-         self.after = normalize_text(self.after)
-         if not self.before.strip():
-             raise ValidationError("Before field cannot be empty")
-         if not self.after.strip():
-             raise ValidationError("After field cannot be empty")
-         if (
-             self.max_diffs > len(self.before) // 2
-             or self.max_diffs > len(self.after) // 2
-         ):
-             raise ValidationError(
-                 "Max diffs is too large for this test, greater than 50% of the search string"
-             )
-
-     def run(self, md_content: str) -> Tuple[bool, str]:
-         md_content = normalize_text(md_content)
-
-         before_matches = find_near_matches(
-             self.before, md_content, max_l_dist=self.max_diffs
-         )
-         after_matches = find_near_matches(
-             self.after, md_content, max_l_dist=self.max_diffs
-         )
-
-         if not before_matches:
-             return (
-                 False,
-                 f"'before' text '{self.before[:40]}...' not found with max_l_dist {self.max_diffs}",
-             )
-         if not after_matches:
-             return (
-                 False,
-                 f"'after' text '{self.after[:40]}...' not found with max_l_dist {self.max_diffs}",
-             )
-
-         for before_match in before_matches:
-             for after_match in after_matches:
-                 if before_match.start < after_match.start:
-                     return True, ""
-         return False, (
-             f"Could not find a location where '{self.before[:40]}...' appears before "
-             f"'{self.after[:40]}...'."
-         )
-
-
- @dataclass
- class TableTest(BasePDFTest):
-     """
-     Test to verify certain properties of a table are held, namely that some cells appear relative to other cells correctly
-     """
-
-     # This is the target cell, which must exist in at least one place in the table
-     cell: str
-
-     # These properties say that the cell immediately up/down/left/right of the target cell has the string specified
-     up: str = ""
-     down: str = ""
-     left: str = ""
-     right: str = ""
-
-     # These properties say that the cell all the way up, or all the way left of the target cell (ex. headings) has the string value specified
-     top_heading: str = ""
-     left_heading: str = ""
-
-     ignore_markdown_tables: bool = False
-
-     def __post_init__(self):
-         super().__post_init__()
-         if self.type != TestType.TABLE.value:
-             raise ValidationError(f"Invalid type for TableTest: {self.type}")
-
-         # Normalize the search text too
-         self.cell = normalize_text(self.cell)
-         self.up = normalize_text(self.up)
-         self.down = normalize_text(self.down)
-         self.left = normalize_text(self.left)
-         self.right = normalize_text(self.right)
-         self.top_heading = normalize_text(self.top_heading)
-         self.left_heading = normalize_text(self.left_heading)
-
-     def run(self, content: str) -> Tuple[bool, str]:
-         """
-         Run the table test on provided content.
-
-         Finds all tables (markdown and/or HTML based on content_type) and checks if any cell
-         matches the target cell and satisfies the specified relationships.
-
-         Args:
-             content: The content containing tables (markdown or HTML)
-
-         Returns:
-             A tuple (passed, explanation) where 'passed' is True if the test passes,
-             and 'explanation' provides details when the test fails.
-         """
-         # Initialize variables to track tables and results
-         tables_to_check = []
-         failed_reasons = []
-
-         # Threshold for fuzzy matching derived from max_diffs
-         threshold = 1.0 - (
-             self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1)
-         )
-         threshold = max(0.5, threshold)
-
-         # Parse tables based on content_type
-         if not self.ignore_markdown_tables:
-             md_tables = parse_markdown_tables(content)
-             tables_to_check.extend(md_tables)
-
-         html_tables = parse_html_tables(content)
-         tables_to_check.extend(html_tables)
-
-         # If no tables found, return failure
-         if not tables_to_check:
-             return False, "No tables found in the content"
-
-         # Check each table
-         for table_data in tables_to_check:
-             # Removed debug print statement
-             table_array = table_data.data
-             header_rows = table_data.header_rows
-             header_cols = table_data.header_cols
-
-             # Find all cells that match the target cell using fuzzy matching
-             matches = []
-             for i in range(table_array.shape[0]):
-                 for j in range(table_array.shape[1]):
-                     cell_content = normalize_text(table_array[i, j])
-                     similarity = fuzz.ratio(self.cell, cell_content) / 100.0
-
-                     if similarity >= threshold:
-                         matches.append((i, j))
-
-             # If no matches found in this table, continue to the next table
-             if not matches:
-                 continue
-
-             # Check the relationships for each matching cell
-             for row_idx, col_idx in matches:
-                 all_relationships_satisfied = True
-                 current_failed_reasons = []
-
-                 # Check up relationship
-                 if self.up and row_idx > 0:
-                     up_cell = normalize_text(table_array[row_idx - 1, col_idx])
-                     up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
-                     if up_similarity < max(
-                         0.5,
-                         1.0
-                         - (self.max_diffs / (len(self.up) if len(self.up) > 0 else 1)),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})"
-                         )
-
-                 # Check down relationship
-                 if self.down and row_idx < table_array.shape[0] - 1:
-                     down_cell = normalize_text(table_array[row_idx + 1, col_idx])
-                     down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
-                     if down_similarity < max(
-                         0.5,
-                         1.0
-                         - (
-                             self.max_diffs
-                             / (len(self.down) if len(self.down) > 0 else 1)
-                         ),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})"
-                         )
-
-                 # Check left relationship
-                 if self.left and col_idx > 0:
-                     left_cell = normalize_text(table_array[row_idx, col_idx - 1])
-                     left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
-                     if left_similarity < max(
-                         0.5,
-                         1.0
-                         - (
-                             self.max_diffs
-                             / (len(self.left) if len(self.left) > 0 else 1)
-                         ),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})"
-                         )
-
-                 # Check right relationship
-                 if self.right and col_idx < table_array.shape[1] - 1:
-                     right_cell = normalize_text(table_array[row_idx, col_idx + 1])
-                     right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
-                     if right_similarity < max(
-                         0.5,
-                         1.0
-                         - (
-                             self.max_diffs
-                             / (len(self.right) if len(self.right) > 0 else 1)
-                         ),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})"
-                         )
-
-                 # Check top heading relationship
-                 if self.top_heading:
-                     # Try to find a match in the column headers
-                     top_heading_found = False
-                     best_match = ""
-                     best_similarity = 0
-
-                     # Check the col_headers dictionary first (this handles colspan properly)
-                     if col_idx in table_data.col_headers:
-                         for _, header_text in table_data.col_headers[col_idx]:
-                             header_text = normalize_text(header_text)
-                             similarity = (
-                                 fuzz.ratio(self.top_heading, header_text) / 100.0
-                             )
-                             if similarity > best_similarity:
-                                 best_similarity = similarity
-                                 best_match = header_text
-                             if best_similarity >= max(
-                                 0.5,
-                                 1.0
-                                 - (
-                                     self.max_diffs
-                                     / (
-                                         len(self.top_heading)
-                                         if len(self.top_heading) > 0
-                                         else 1
-                                     )
-                                 ),
-                             ):
-                                 top_heading_found = True
-                                 break
-
-                     # If no match found in col_headers, fall back to checking header rows
-                     if not top_heading_found and header_rows:
-                         for i in sorted(header_rows):
-                             if i < row_idx and table_array[i, col_idx].strip():
-                                 header_text = normalize_text(table_array[i, col_idx])
-                                 similarity = (
-                                     fuzz.ratio(self.top_heading, header_text) / 100.0
-                                 )
-                                 if similarity > best_similarity:
-                                     best_similarity = similarity
-                                     best_match = header_text
-                                 if best_similarity >= max(
-                                     0.5,
-                                     1.0
-                                     - (
-                                         self.max_diffs
-                                         / (
-                                             len(self.top_heading)
-                                             if len(self.top_heading) > 0
-                                             else 1
-                                         )
-                                     ),
-                                 ):
-                                     top_heading_found = True
-                                     break
-
-                     # If still no match, use any non-empty cell above as a last resort
-                     if not top_heading_found and not best_match and row_idx > 0:
-                         for i in range(row_idx):
-                             if table_array[i, col_idx].strip():
-                                 header_text = normalize_text(table_array[i, col_idx])
-                                 similarity = (
-                                     fuzz.ratio(self.top_heading, header_text) / 100.0
-                                 )
-                                 if similarity > best_similarity:
-                                     best_similarity = similarity
-                                     best_match = header_text
-
-                     if not best_match:
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"No top heading found for cell at ({row_idx}, {col_idx})"
-                         )
-                     elif best_similarity < max(
-                         0.5,
-                         1.0
-                         - (
-                             self.max_diffs
-                             / (
-                                 len(self.top_heading)
-                                 if len(self.top_heading) > 0
-                                 else 1
-                             )
-                         ),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Top heading '{best_match}' doesn't match expected '{self.top_heading}' (similarity: {best_similarity:.2f})"
-                         )
-
-                 # Check left heading relationship
-                 if self.left_heading:
-                     # Try to find a match in the row headers
-                     left_heading_found = False
-                     best_match = ""
-                     best_similarity = 0
-
-                     # Check the row_headers dictionary first (this handles rowspan properly)
-                     if row_idx in table_data.row_headers:
-                         for _, header_text in table_data.row_headers[row_idx]:
-                             header_text = normalize_text(header_text)
-                             similarity = (
-                                 fuzz.ratio(self.left_heading, header_text) / 100.0
-                             )
-                             if similarity > best_similarity:
-                                 best_similarity = similarity
-                                 best_match = header_text
-                             if best_similarity >= max(
-                                 0.5,
-                                 1.0
-                                 - (
-                                     self.max_diffs
-                                     / (
-                                         len(self.left_heading)
-                                         if len(self.left_heading) > 0
-                                         else 1
-                                     )
-                                 ),
-                             ):
-                                 left_heading_found = True
-                                 break
-
-                     # If no match found in row_headers, fall back to checking header columns
-                     if not left_heading_found and header_cols:
-                         for j in sorted(header_cols):
-                             if j < col_idx and table_array[row_idx, j].strip():
-                                 header_text = normalize_text(table_array[row_idx, j])
-                                 similarity = (
-                                     fuzz.ratio(self.left_heading, header_text) / 100.0
-                                 )
-                                 if similarity > best_similarity:
-                                     best_similarity = similarity
-                                     best_match = header_text
-                                 if best_similarity >= max(
-                                     0.5,
-                                     1.0
-                                     - (
-                                         self.max_diffs
-                                         / (
-                                             len(self.left_heading)
-                                             if len(self.left_heading) > 0
-                                             else 1
-                                         )
-                                     ),
-                                 ):
-                                     left_heading_found = True
-                                     break
-
-                     # If still no match, use any non-empty cell to the left as a last resort
-                     if not left_heading_found and not best_match and col_idx > 0:
-                         for j in range(col_idx):
-                             if table_array[row_idx, j].strip():
-                                 header_text = normalize_text(table_array[row_idx, j])
-                                 similarity = (
-                                     fuzz.ratio(self.left_heading, header_text) / 100.0
-                                 )
-                                 if similarity > best_similarity:
-                                     best_similarity = similarity
-                                     best_match = header_text
-
-                     if not best_match:
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"No left heading found for cell at ({row_idx}, {col_idx})"
-                         )
-                     elif best_similarity < max(
-                         0.5,
-                         1.0
-                         - (
-                             self.max_diffs
-                             / (
-                                 len(self.left_heading)
-                                 if len(self.left_heading) > 0
-                                 else 1
-                             )
-                         ),
-                     ):
-                         all_relationships_satisfied = False
-                         current_failed_reasons.append(
-                             f"Left heading '{best_match}' doesn't match expected '{self.left_heading}' (similarity: {best_similarity:.2f})"
-                         )
-
-                 # If all relationships are satisfied for this cell, the test passes
-                 if all_relationships_satisfied:
-                     return True, ""
-                 else:
-                     failed_reasons.extend(current_failed_reasons)
-
-         # If we've gone through all tables and all matching cells and none satisfied all relationships
-         if not failed_reasons:
-             return (
-                 False,
-                 f"No cell matching '{self.cell}' found in any table with threshold {threshold}",
-             )
-         else:
-             return (
-                 False,
-                 f"Found cells matching '{self.cell}' but relationships were not satisfied: {'; '.join(failed_reasons)}",
-             )
-
-
- @dataclass
- class BaselineTest(BasePDFTest):
-     """
-     This test makes sure that several baseline quality checks pass for the output generation.
-
-     Namely, the output is not blank, not endlessly repeating, and contains characters of the proper
-     character sets.
-
-     """
-
-     max_length: Optional[int] = None  # Used to implement blank page checks
-     max_length_skips_image_alt_tags: bool = False
-
-     max_repeats: int = 30
-     check_disallowed_characters: bool = True
-
-     def run(self, content: str) -> Tuple[bool, str]:
-         base_content_len = len("".join(c for c in content if c.isalnum()).strip())
-
-         # If this is a blank page check, then it short circuits the rest of the checks
-         if self.max_length is not None:
-             if self.max_length_skips_image_alt_tags:
-                 # Remove markdown image tags like ![alt text](image.png) from the text length count
-                 content_for_length_check = re.sub(r"!\[.*?\]\(.*?\)", "", content)
-                 base_content_len = len(
-                     "".join(c for c in content_for_length_check if c.isalnum()).strip()
-                 )
-
-             if base_content_len > self.max_length:
-                 return (
-                     False,
-                     f"{base_content_len} characters were output for a page we expected to be blank",
-                 )
-             else:
-                 return True, ""
-
-         if base_content_len == 0:
-             return False, "The text contains no alpha numeric characters"
-
-         # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
-         # Honestly, this test doesn't seem to catch anything at the moment, maybe it can be refactored to a "text-quality"
-         # test or something, that measures repetition, non-blanks, charsets, etc
-         d = RepeatDetector(max_ngram_size=5)
-         d.add_letters(content)
-         repeats = d.ngram_repeats()
-
-         for index, count in enumerate(repeats):
-             if count > self.max_repeats:
-                 return (
-                     False,
-                     f"Text ends with {count} repeating {index+1}-grams, invalid",
-                 )
-
-         pattern = re.compile(
-             r"["
-             r"\u4e00-\u9FFF"  # CJK Unified Ideographs (Chinese characters)
-             r"\u3040-\u309F"  # Hiragana (Japanese)
-             r"\u30A0-\u30FF"  # Katakana (Japanese)
-             r"\U0001F600-\U0001F64F"  # Emoticons (Emoji)
-             r"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs (Emoji)
-             r"\U0001F680-\U0001F6FF"  # Transport and Map Symbols (Emoji)
-             r"\U0001F1E0-\U0001F1FF"  # Regional Indicator Symbols (flags, Emoji)
-             r"]",
-             flags=re.UNICODE,
-         )
-
-         matches = pattern.findall(content)
-         if self.check_disallowed_characters and matches:
-             return False, f"Text contains disallowed characters {matches}"
-
-         return True, ""
-
-
- @dataclass
- class MathTest(BasePDFTest):
-     math: str
-
-     ignore_dollar_delimited: bool = False
-
-     def __post_init__(self):
-         super().__post_init__()
-         if self.type != TestType.MATH.value:
-             raise ValidationError(f"Invalid type for MathTest: {self.type}")
-         if len(self.math.strip()) == 0:
-             raise ValidationError("Math test must have non-empty math expression")
-
-         self.reference_render = render_equation(self.math)
-
-         if self.reference_render is None:
-             raise ValidationError(f"Math equation {self.math} was not able to render")
-
-     def run(self, content: str) -> Tuple[bool, str]:
-         # Store both the search pattern and the full pattern to replace
-         patterns = [
-             (r"\\\((.+?)\\\)", r"\\\((.+?)\\\)"),  # \(...\)
-             (r"\\\[(.+?)\\\]", r"\\\[(.+?)\\\]"),  # \[...\]
-         ]
-
-         if not self.ignore_dollar_delimited:
-             patterns.extend(
-                 [
-                     (r"\$\$(.+?)\$\$", r"\$\$(.+?)\$\$"),  # $$...$$
-                     (r"\$(.+?)\$", r"\$(.+?)\$"),  # $...$
-                 ]
-             )
-
-         equations = []
-         modified_content = content
-
-         for search_pattern, replace_pattern in patterns:
-             # Find all matches for the current pattern
-             matches = re.findall(search_pattern, modified_content, re.DOTALL)
-             equations.extend([e.strip() for e in matches])
-
-             # Replace all instances of this pattern with empty strings
-             modified_content = re.sub(
-                 replace_pattern, "", modified_content, flags=re.DOTALL
-             )
-
-         # If an equation in the markdown exactly matches our math string, then that's good enough
-         # we don't have to do a more expensive comparison
-         if any(hyp == self.math for hyp in equations):
-             return True, ""
-
-         # If not, then let's render the math equation itself and now compare to each hypothesis
-         # But, to speed things up, since rendering equations is hard, we sort the equations on the page
-         # by fuzzy similarity to the hypothesis
-         equations.sort(key=lambda x: -fuzz.ratio(x, self.math))
-         for hypothesis in equations:
-             hypothesis_render = render_equation(hypothesis)
-
-             if not hypothesis_render:
-                 continue
-
-             if compare_rendered_equations(self.reference_render, hypothesis_render):
-                 return True, ""
-
-         # self.reference_render.save(f"maths/{self.id}_ref.png", format="PNG")
-         # best_match_render.save(f"maths/{self.id}_hyp.png", format="PNG")
-
-         return False, f"No match found for {self.math} anywhere in content"
-
-
- def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
-     """
-     Load a single test from a JSON line string or JSON object.
-
-     Args:
-         data: Either a JSON string to parse or a dictionary containing test data.
-
-     Returns:
-         A test object of the appropriate type.
-
-     Raises:
-         ValidationError: If the test type is unknown or data is invalid.
-         json.JSONDecodeError: If the string cannot be parsed as JSON.
-     """
-     # Handle JSON string input
-     if isinstance(data, str):
-         data = data.strip()
-         if not data:
-             raise ValueError("Empty string provided")
-         data = json.loads(data)
-
-     # Process the test data
-     test_type = data.get("type")
-     if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
-         test = TextPresenceTest(**data)
-     elif test_type == TestType.ORDER.value:
-         test = TextOrderTest(**data)
-     elif test_type == TestType.TABLE.value:
-         test = TableTest(**data)
-     elif test_type == TestType.MATH.value:
-         test = MathTest(**data)
-     elif test_type == TestType.BASELINE.value:
-         test = BaselineTest(**data)
-     else:
-         raise ValidationError(f"Unknown test type: {test_type}")
-
-     return test
-
-
- def load_tests(jsonl_file: str) -> List[BasePDFTest]:
-     """
-     Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
-
-     Args:
-         jsonl_file: Path to the JSONL file containing test definitions.
-
-     Returns:
-         A list of test objects.
-     """
-
-     def process_line_with_number(
-         line_tuple: Tuple[int, str],
-     ) -> Optional[Tuple[int, BasePDFTest]]:
-         """
-         Process a single line from the JSONL file and return a tuple of (line_number, test object).
-         Returns None for empty lines.
-         """
-         line_number, line = line_tuple
-         line = line.strip()
-         if not line:
-             return None
-
-         try:
-             test = load_single_test(line)
-             return (line_number, test)
-         except json.JSONDecodeError as e:
-             print(f"Error parsing JSON on line {line_number}: {e}")
-             raise
-         except (ValidationError, KeyError) as e:
-             print(f"Error on line {line_number}: {e}")
-             raise
-         except Exception as e:
-             print(f"Unexpected error on line {line_number}: {e}")
-             raise
-
-     tests = []
-
-     # Read all lines along with their line numbers.
-     with open(jsonl_file, "r") as f:
-         lines = list(enumerate(f, start=1))
-
-     # Use a ThreadPoolExecutor to process each line in parallel.
-     with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
-         # Submit all tasks concurrently.
-         futures = {
-             executor.submit(process_line_with_number, item): item[0] for item in lines
-         }
-         # Use tqdm to show progress as futures complete.
-         for future in tqdm(
-             as_completed(futures), total=len(futures), desc="Loading tests"
-         ):
-             result = future.result()
-             if result is not None:
-                 _, test = result
-                 tests.append(test)
-
-     # Check for duplicate test IDs after parallel processing.
-     unique_ids = set()
-     for test in tests:
-         if test.id in unique_ids:
-             raise ValidationError(
-                 f"Test with duplicate id {test.id} found, error loading tests."
-             )
-         unique_ids.add(test.id)
-
-     return tests
-
-
- def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
-     """
-     Save tests to a JSONL file using asdict for conversion.
-
-     Args:
-         tests: A list of test objects.
-         jsonl_file: Path to the output JSONL file.
-     """
-     with open(jsonl_file, "w") as file:
-         for test in tests:
-             file.write(json.dumps(asdict(test)) + "\n")