vlmparse 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/METADATA +11 -1
  39. vlmparse-0.1.2.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/top_level.txt +0 -0
File without changes
@@ -0,0 +1,60 @@
1
+ # %%
2
+ """Create a HuggingFace dataset from the benchmark folder structure."""
3
+
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+
10
def create_dataset(
    base_folder: Path,
) -> pd.DataFrame:
    """Load all benchmark tests found under ``base_folder`` into a DataFrame.

    Every directory returned by the recursive search that contains at least one
    ``*.jsonl`` file is treated as one benchmark sample. A sample is loaded when
    it has a ``metadata.json`` file, exactly one ``*.pdf`` file and at least one
    ``tests*.jsonl`` file; otherwise it is reported and skipped.

    Args:
        base_folder: Path to the folder containing benchmark data.

    Returns:
        A DataFrame with one row per test. Each row holds ``pdf_name``,
        ``page``, ``doc_type``, ``original_doc_path`` (taken from the
        metadata), ``pdf_path`` (path of the sample PDF as a string) and every
        field of the test record itself. Test fields override the metadata
        columns when the names collide.

    Raises:
        AssertionError: If a sample directory does not contain exactly one PDF.
        KeyError: If ``metadata.json`` lacks the ``pdf`` or ``page`` key.
    """
    data = []

    for subdir in sorted(Path(base_folder).rglob("**/")):
        # Only directories holding at least one JSONL file are samples.
        if not subdir.is_dir() or not any(subdir.glob("*.jsonl")):
            continue

        metadata_path = subdir / "metadata.json"
        tests_paths = list(subdir.glob("tests*.jsonl"))
        pdf_paths = list(subdir.glob("*.pdf"))
        assert len(pdf_paths) == 1, f"Expected 1 PDF file, got {len(pdf_paths)}"
        pdf_path = pdf_paths[0]

        if not (metadata_path.exists() and pdf_path.exists() and tests_paths):
            print(f"Skipping {subdir.name}: missing files")
            continue

        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        # Load tests. Files are visited in sorted order so that the row order
        # does not depend on the filesystem's directory listing order.
        tests = []
        for tests_path in sorted(subdir.glob("*.jsonl")):
            with open(tests_path, "r", encoding="utf-8") as f:
                for line in f:
                    record = line.strip()
                    if not record:
                        continue
                    tests.append(json.loads(record))

        # Create one row per test.
        for test in tests:
            data.append(
                {
                    "pdf_name": metadata["pdf"],
                    "page": metadata["page"],
                    "doc_type": metadata.get("doc_type"),
                    "original_doc_path": metadata.get("original_doc_path"),
                    "pdf_path": str(pdf_path),
                    **test,  # Unpack all test fields
                }
            )

    return pd.DataFrame(data)
@@ -0,0 +1 @@
1
+ from .render import compare_rendered_equations, render_equation # noqa
@@ -0,0 +1,592 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract inner-most spans and their bounding boxes, and the MathML output,
4
+ from rendered LaTeX equations using Playwright and KaTeX.
5
+ Caching is maintained via a SHA1-based hash stored in a sqlite database.
6
+
7
+ Requirements:
8
+ pip install playwright
9
+ python -m playwright install chromium
10
+
11
+ Place katex.min.css and katex.min.js in the same directory as this script
12
+ """
13
+
14
+ import atexit
15
+ import hashlib
16
+ import json
17
+ import os
18
+ import pathlib
19
+ import re
20
+ import sqlite3
21
+ import threading
22
+ import unittest
23
+ import weakref
24
+ from concurrent.futures import ThreadPoolExecutor
25
+ from dataclasses import dataclass
26
+ from typing import List, Optional
27
+
28
+ from playwright.sync_api import Error as PlaywrightError
29
+ from playwright.sync_api import sync_playwright
30
+
31
+ # --- New SQLite Cache Implementation ---
32
+
33
+
34
class EquationCache:
    """SQLite-backed cache of rendered equations, keyed by equation hash.

    Each operation opens a short-lived connection to ``db_path`` while holding
    ``self.lock``, so calls made through one instance are serialized. The
    connection is always closed, even when the statement fails.
    """

    def __init__(self, db_path: Optional[str] = None):
        """Open (and create if needed) the cache database.

        Args:
            db_path: Path of the SQLite file. When ``None``, the file
                ``~/.cache/olmocr/bench/equations/cache.db`` is used and its
                parent directory is created.
        """
        if db_path is None:
            # Use the same cache directory as before
            cache_dir = (
                pathlib.Path.home() / ".cache" / "olmocr" / "bench" / "equations"
            )
            cache_dir.mkdir(parents=True, exist_ok=True)
            db_path = str(cache_dir / "cache.db")
        self.db_path = db_path
        self.lock = threading.Lock()
        self._init_db()

    def _execute(self, sql: str, params: tuple = (), *, fetch: bool = False):
        """Run one statement on a fresh connection under the instance lock.

        Args:
            sql: Statement to execute.
            params: Values bound to the ``?`` placeholders of ``sql``.
            fetch: When true, return the first result row (or ``None``) and do
                not commit; when false, commit and return ``None``.
        """
        with self.lock:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.execute(sql, params)
                if fetch:
                    return cursor.fetchone()
                conn.commit()
                return None
            finally:
                # Guarantees the handle is released when execute/commit raises.
                conn.close()

    def _init_db(self):
        """Create the ``equations`` table if it does not exist yet."""
        # The 'error' column stores rendering errors.
        self._execute(
            """
            CREATE TABLE IF NOT EXISTS equations (
                eq_hash TEXT PRIMARY KEY,
                mathml TEXT,
                spans TEXT,
                error TEXT
            )
            """
        )

    def load(self, eq_hash: str) -> Optional["RenderedEquation"]:
        """Return the cached equation stored under ``eq_hash``, or ``None``.

        A row whose ``error`` column is set is returned as a
        ``RenderedEquation`` carrying that error and no spans.
        """
        row = self._execute(
            "SELECT mathml, spans, error FROM equations WHERE eq_hash = ?",
            (eq_hash,),
            fetch=True,
        )
        if row is None:
            return None

        mathml, spans_json, error = row
        if error:
            # In error cases, we return an instance with error set and no spans.
            return RenderedEquation(mathml=mathml, spans=[], error=error)

        # A NULL/empty spans column is treated as "no spans" instead of crashing.
        spans_data = json.loads(spans_json) if spans_json else []
        spans = [
            SpanInfo(
                text=s["text"],
                bounding_box=BoundingBox(
                    x=s["boundingBox"]["x"],
                    y=s["boundingBox"]["y"],
                    width=s["boundingBox"]["width"],
                    height=s["boundingBox"]["height"],
                ),
            )
            for s in spans_data
        ]
        return RenderedEquation(mathml=mathml, spans=spans)

    def save(self, eq_hash: str, rendered_eq: "RenderedEquation"):
        """Insert or replace the row for ``eq_hash`` with ``rendered_eq``.

        Spans are serialized as a JSON list of
        ``{"text", "boundingBox": {"x", "y", "width", "height"}}`` objects.
        """
        spans_data = [
            {
                "text": span.text,
                "boundingBox": {
                    "x": span.bounding_box.x,
                    "y": span.bounding_box.y,
                    "width": span.bounding_box.width,
                    "height": span.bounding_box.height,
                },
            }
            for span in rendered_eq.spans
        ]
        self._execute(
            "INSERT OR REPLACE INTO equations (eq_hash, mathml, spans, error) VALUES (?, ?, ?, ?)",
            (eq_hash, rendered_eq.mathml, json.dumps(spans_data), rendered_eq.error),
        )

    def clear(self):
        """Delete every cached equation."""
        self._execute("DELETE FROM equations")
128
+
129
+
130
+ # Global instance of EquationCache
131
+ equation_cache = EquationCache()
132
+
133
+ # --- End SQLite Cache Implementation ---
134
+
135
+
136
@dataclass
class BoundingBox:
    """Rectangle described by the position of one corner and its size.

    The rendering code fills these fields from the ``x``, ``y``, ``width`` and
    ``height`` of a DOM element's ``getBoundingClientRect()`` result.
    """

    x: float
    y: float
    width: float
    height: float
142
+
143
+
144
@dataclass
class SpanInfo:
    """Text of one rendered span together with the box it occupies."""

    text: str
    bounding_box: BoundingBox
148
+
149
+
150
@dataclass
class RenderedEquation:
    """Outcome of rendering one equation.

    ``mathml`` and ``spans`` describe the rendered output. ``error`` carries
    the error message when rendering failed and is ``None`` otherwise.
    """

    mathml: str
    spans: List[SpanInfo]
    # Error message if rendering fails; None for a successful render.
    error: Optional[str] = None
155
+
156
+
157
def get_equation_hash(equation, bg_color="white", text_color="black", font_size=24):
    """
    Return the SHA1 hex digest that identifies an equation and how it is rendered.

    The digest is computed over the UTF-8 encoding of the equation, background
    color, text color and font size joined with ``|`` in that order.
    """
    key_fields = (equation, bg_color, text_color, font_size)
    hasher = hashlib.sha1()
    hasher.update("|".join(format(field) for field in key_fields).encode("utf-8"))
    return hasher.hexdigest()
163
+
164
+
165
# Per-thread storage; each executor thread keeps its own browser owner here.
_thread_local = threading.local()

# Shared pool with a fixed number of worker threads. Rendering work is
# submitted to it, and every worker thread maintains its own Playwright
# instance.
_render_executor = ThreadPoolExecutor(
    thread_name_prefix="playwright-render",
    max_workers=8,
)
173
+
174
+
175
def _cleanup_executor():
    """Shut down the shared render executor without waiting for pending work.

    This module registers the function with ``atexit`` so it runs at
    interpreter exit.
    """
    _render_executor.shutdown(wait=False)
178
+
179
+
180
+ # Register cleanup at exit
181
+ atexit.register(_cleanup_executor)
182
+
183
+
184
def _cleanup_playwright(playwright, browser):
    """Close ``browser`` and then stop ``playwright``, ignoring any failure.

    Both steps are best-effort: an exception raised by one of them is
    swallowed so that the other step still runs and the caller never sees an
    error.
    """
    print("Cleaning up", playwright)
    # Order matters: the browser is closed before the driver is stopped.
    for target, method_name in ((browser, "close"), (playwright, "stop")):
        try:
            getattr(target, method_name)()
        except Exception:
            pass
194
+
195
+
196
class _BrowserOwner:
    """Own one Playwright driver and one Chromium browser and clean them up.

    Cleanup runs through a ``weakref.finalize`` callback, so it happens either
    when ``close_now`` is called or when the owner is finalized, and at most
    once in total.
    """

    def __init__(self):
        """Start Playwright and launch Chromium.

        Raises:
            Exception: Whatever ``chromium.launch()`` raised. The Playwright
                driver started just before is stopped first so it is not
                leaked.
        """
        p = sync_playwright().start()
        try:
            b = p.chromium.launch()
        except Exception:
            # No finalizer exists yet, so stop the driver by hand (best
            # effort) before propagating the launch failure.
            try:
                p.stop()
            except Exception:
                pass
            raise
        self.p = p
        self.browser = b
        self._closed = False
        # Important: don't capture `self` or globals in the finalizer
        self._finalizer = weakref.finalize(self, _cleanup_playwright, p, b)

    def close_now(self):
        """Release the browser and the driver immediately; safe to call twice."""
        if not self._closed:
            self._closed = True
            self._finalizer()  # idempotent; runs at most once
210
+
211
+
212
def _get_thread_local_browser():
    """Return the calling thread's browser owner, creating it on first use."""
    existing = getattr(_thread_local, "owner", None)
    if existing is not None:
        return existing

    # First call on this thread: start a browser and remember it.
    created = _BrowserOwner()
    _thread_local.owner = created
    return created
219
+
220
+
221
def _render_in_executor(
    equation, bg_color, text_color, font_size, use_cache, debug_dom, eq_hash
):
    """
    Render one equation on the calling executor thread.

    The thread's own browser is used to open a fresh 800x400 browser context,
    which is closed again (best effort) once rendering has finished or failed.
    ``use_cache`` and ``eq_hash`` are accepted but not used here.
    """
    browser = _get_thread_local_browser().browser
    render_context = browser.new_context(viewport={"width": 800, "height": 400})
    try:
        result = _do_render(
            render_context, equation, bg_color, text_color, font_size, debug_dom
        )
    finally:
        # A failure while closing must not hide the render outcome.
        try:
            render_context.close()
        except Exception:
            pass
    return result
237
+
238
+
239
def _do_render(context, equation, bg_color, text_color, font_size, debug_dom):
    """
    Internal rendering function that uses a provided browser context.

    Opens a page in ``context``, loads the KaTeX assets located next to this
    file, renders ``equation`` in display mode and collects the result. The
    page is always closed before returning or raising.

    Args:
        context: Browser context in which the page is created.
        equation: LaTeX source passed to ``katex.render``.
        bg_color: CSS value used as the page background color.
        text_color: CSS value used as the page text color.
        font_size: Font size of the equation container, in pixels.
        debug_dom: When true, print the KaTeX DOM and the extracted spans.

    Returns:
        A ``RenderedEquation``. When KaTeX rejects the equation, the KaTeX
        error message is stored in both ``mathml`` and ``error`` and ``spans``
        is empty.

    Raises:
        FileNotFoundError: If ``katex.min.css`` or ``katex.min.js`` is missing.
        RuntimeError: If the ``katex`` global is undefined after loading.
        PlaywrightError: Re-raised when evaluating the render script fails.
    """
    # json.dumps produces a quoted, escaped literal that is embedded directly
    # in the JavaScript source below.
    escaped_equation = json.dumps(equation)

    # Get local paths for KaTeX files.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    katex_css_path = os.path.join(script_dir, "katex.min.css")
    katex_js_path = os.path.join(script_dir, "katex.min.js")

    if not os.path.exists(katex_css_path) or not os.path.exists(katex_js_path):
        raise FileNotFoundError(
            f"KaTeX files not found. Please ensure katex.min.css and katex.min.js are in {script_dir}"
        )

    # Create a new page; the finally clause below closes it on every path.
    page = context.new_page()
    try:
        # Basic HTML structure for rendering.
        page_html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <style>
                body {{
                    display: flex;
                    justify-content: center;
                    align-items: center;
                    height: 100vh;
                    margin: 0;
                    background-color: {bg_color};
                    color: {text_color};
                }}
                #equation-container {{
                    padding: 0;
                    font-size: {font_size}px;
                }}
            </style>
        </head>
        <body>
            <div id="equation-container"></div>
        </body>
        </html>
        """
        page.set_content(page_html)
        page.add_style_tag(path=katex_css_path)
        page.add_script_tag(path=katex_js_path)
        # NOTE(review): timeout=0 is understood to disable Playwright's
        # timeout, so this wait is unbounded; confirm that is intended.
        page.wait_for_load_state("networkidle", timeout=0)

        katex_loaded = page.evaluate("typeof katex !== 'undefined'")
        if not katex_loaded:
            raise RuntimeError(
                "KaTeX library failed to load. Check your katex.min.js file."
            )

        try:
            # The script returns null on success or the KaTeX error message.
            error_message = page.evaluate(
                f"""
                () => {{
                    try {{
                        katex.render({escaped_equation}, document.getElementById("equation-container"), {{
                            displayMode: true,
                            throwOnError: true
                        }});
                        return null;
                    }} catch (error) {{
                        console.error("KaTeX error:", error.message);
                        return error.message;
                    }}
                }}
                """
            )
        except PlaywrightError:
            # Show which equation broke the evaluation before propagating.
            print(escaped_equation)
            raise

        if error_message:
            print(f"Error rendering equation: '{equation}'")
            print(error_message)
            # Return error result
            return RenderedEquation(mathml=error_message, spans=[], error=error_message)

        page.wait_for_selector(".katex", state="attached", timeout=0)

        if debug_dom:
            katex_dom_html = page.evaluate(
                """
                () => {
                    return document.getElementById("equation-container").innerHTML;
                }
                """
            )
            print("\n===== KaTeX DOM HTML =====")
            print(katex_dom_html)

        # Extract inner-most spans with non-whitespace text.
        spans_info = page.evaluate(
            """
            () => {
                const spans = Array.from(document.querySelectorAll('span'));
                const list = [];
                spans.forEach(span => {
                    if (span.children.length === 0 && /\\S/.test(span.textContent)) {
                        const rect = span.getBoundingClientRect();
                        list.push({
                            text: span.textContent.trim(),
                            boundingBox: {
                                x: rect.x,
                                y: rect.y,
                                width: rect.width,
                                height: rect.height
                            }
                        });
                    }
                });
                return list;
            }
            """
        )

        if debug_dom:
            print("\n===== Extracted Span Information =====")
            print(spans_info)

        # Extract MathML output (if available) from the KaTeX output.
        mathml = page.evaluate(
            """
            () => {
                const mathElem = document.querySelector('.katex-mathml math');
                return mathElem ? mathElem.outerHTML : "";
            }
            """
        )

        return RenderedEquation(
            mathml=mathml,
            spans=[
                SpanInfo(
                    text=s["text"],
                    bounding_box=BoundingBox(
                        x=s["boundingBox"]["x"],
                        y=s["boundingBox"]["y"],
                        width=s["boundingBox"]["width"],
                        height=s["boundingBox"]["height"],
                    ),
                )
                for s in spans_info
            ],
        )
    finally:
        page.close()
398
+
399
+
400
def render_equation(
    equation,
    bg_color="white",
    text_color="black",
    font_size=24,
    use_cache=True,
    debug_dom=False,
):
    """
    Render a LaTeX equation with Playwright and KaTeX and return the result.

    The result holds the inner-most span elements with their bounding boxes
    and the MathML generated by KaTeX. Rendering runs on the module's
    fixed-size ThreadPoolExecutor, which prevents resource leaks from
    unbounded thread creation; this call blocks until the render finishes.

    With ``use_cache`` enabled, a cached result is returned when one exists,
    and a freshly rendered result is stored unless it carries an error.
    """
    # Replace the escaped sequences `\_` and `\*` with the bare characters
    # before hashing and rendering.
    for escaped, plain in ((r"\_", "_"), (r"\*", "*")):
        equation = equation.replace(escaped, plain)

    # Cache key covering the equation and the rendering parameters.
    eq_hash = get_equation_hash(equation, bg_color, text_color, font_size)

    if use_cache:
        hit = equation_cache.load(eq_hash)
        if hit is not None:
            return hit

    # Hand the work to the thread pool and wait for it to complete.
    pending = _render_executor.submit(
        _render_in_executor,
        equation,
        bg_color,
        text_color,
        font_size,
        use_cache,
        debug_dom,
        eq_hash,
    )
    rendered_eq = pending.result()

    # Only successful renders are written to the cache.
    should_store = use_cache and rendered_eq and not rendered_eq.error
    if should_store:
        equation_cache.save(eq_hash, rendered_eq)

    return rendered_eq
447
+
448
+
449
def compare_rendered_equations(
    reference: RenderedEquation, hypothesis: RenderedEquation
) -> bool:
    """
    Compare two RenderedEquation objects.

    First, check whether the normalized MathML of the reference is contained
    within that of the hypothesis; if so the equations match.
    If not, perform a neighbor-based matching on the spans: every character of
    the reference spans must be assigned to a distinct hypothesis character
    with the same text whose up/down/left/right neighbors are compatible.

    Returns:
        True when the MathML check or the span matching succeeds, else False.
    """
    from bs4 import BeautifulSoup

    def extract_inner(mathml: str) -> str:
        # Return the children of the <semantics> element, skipping
        # <annotation> children. Without a <semantics> element the whole
        # parsed document is returned; on a parsing error the input is
        # returned unchanged.
        try:
            soup = BeautifulSoup(mathml, "xml")
            semantics = soup.find("semantics")
            if semantics:
                inner_parts = [
                    str(child)
                    for child in semantics.contents
                    if getattr(child, "name", None) != "annotation"
                ]
                return "".join(inner_parts)
            else:
                return str(soup)
        except Exception as e:
            print("Error parsing MathML with BeautifulSoup:", e)
            print(mathml)
            return mathml

    def normalize(s: str) -> str:
        # Drop all whitespace so formatting differences do not matter.
        return re.sub(r"\s+", "", s)

    reference_inner = normalize(extract_inner(reference.mathml))
    hypothesis_inner = normalize(extract_inner(hypothesis.mathml))
    if reference_inner in hypothesis_inner:
        return True

    # NOTE(review): despite the letters, H holds the *reference* spans and R
    # the *hypothesis* spans. The matching below requires every element of H
    # to be found in R.
    H, R = reference.spans, hypothesis.spans
    # Ignore spans whose text is exactly a zero-width space.
    H = [span for span in H if span.text != "\u200b"]
    R = [span for span in R if span.text != "\u200b"]

    def expand_span_info(span_info: SpanInfo) -> list[SpanInfo]:
        # Split a span into one SpanInfo per character, dividing the box
        # width evenly between the characters from left to right.
        total_elems = len(span_info.text)
        return [
            SpanInfo(
                c,
                BoundingBox(
                    span_info.bounding_box.x
                    + (span_info.bounding_box.width * index) / total_elems,
                    span_info.bounding_box.y,
                    span_info.bounding_box.width / total_elems,
                    span_info.bounding_box.height,
                ),
            )
            for index, c in enumerate(span_info.text)
        ]

    H = [span for sublist in H for span in expand_span_info(sublist)]
    R = [span for sublist in R for span in expand_span_info(sublist)]

    # For every H element, the indices of the R elements with the same text.
    # An H element without any candidate makes a match impossible.
    candidate_map = {}
    for i, hspan in enumerate(H):
        candidate_map[i] = [j for j, rsp in enumerate(R) if rsp.text == hspan.text]
        if not candidate_map[i]:
            return False

    def compute_neighbors(spans, tol=5):
        # For each span, find the nearest other span in each of the four
        # directions, comparing box centers. A span counts as up/down only if
        # its center x is within `tol` of this span's, and as left/right only
        # if its center y is within `tol`. "Up" means a smaller y value.
        neighbors = {}
        for i, span in enumerate(spans):
            cx = span.bounding_box.x + span.bounding_box.width / 2
            cy = span.bounding_box.y + span.bounding_box.height / 2
            up = down = left = right = None
            up_dist = down_dist = left_dist = right_dist = None
            for j, other in enumerate(spans):
                if i == j:
                    continue
                ocx = other.bounding_box.x + other.bounding_box.width / 2
                ocy = other.bounding_box.y + other.bounding_box.height / 2
                if ocy < cy and abs(ocx - cx) <= tol:
                    dist = cy - ocy
                    if up is None or dist < up_dist:
                        up = j
                        up_dist = dist
                if ocy > cy and abs(ocx - cx) <= tol:
                    dist = ocy - cy
                    if down is None or dist < down_dist:
                        down = j
                        down_dist = dist
                if ocx < cx and abs(ocy - cy) <= tol:
                    dist = cx - ocx
                    if left is None or dist < left_dist:
                        left = j
                        left_dist = dist
                if ocx > cx and abs(ocy - cy) <= tol:
                    dist = ocx - cx
                    if right is None or dist < right_dist:
                        right = j
                        right_dist = dist
            neighbors[i] = {"up": up, "down": down, "left": left, "right": right}
        return neighbors

    # Named after the list letters: hyp_neighbors belongs to H, ref_neighbors to R.
    hyp_neighbors = compute_neighbors(H)
    ref_neighbors = compute_neighbors(R)

    n = len(H)
    used = [False] * len(R)  # R indices already taken by an assignment
    assignment = {}  # H index -> assigned R index

    def backtrack(i):
        # Try to assign H[i], H[i+1], ... to distinct R elements; returns
        # True as soon as a complete consistent assignment exists.
        if i == n:
            return True
        for cand in candidate_map[i]:
            if used[cand]:
                continue
            assignment[i] = cand
            used[cand] = True
            valid = True
            for direction in ["up", "down", "left", "right"]:
                hyp_nb = hyp_neighbors[i].get(direction)
                ref_nb = ref_neighbors[cand].get(direction)
                # Only directions in which the H element has a neighbor
                # constrain the candidate.
                if hyp_nb is not None:
                    expected_text = H[hyp_nb].text
                    if ref_nb is None:
                        valid = False
                        break
                    if hyp_nb in assignment:
                        # Neighbor already placed: it must be the candidate's
                        # neighbor in the same direction.
                        if assignment[hyp_nb] != ref_nb:
                            valid = False
                            break
                    else:
                        # Neighbor not placed yet: the texts must at least agree.
                        if R[ref_nb].text != expected_text:
                            valid = False
                            break
            if valid:
                if backtrack(i + 1):
                    return True
            # Undo the tentative assignment before trying the next candidate.
            used[cand] = False
            del assignment[i]
        return False

    return backtrack(0)
589
+
590
+
591
+ if __name__ == "__main__":
592
+ unittest.main()