vlmparse 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/METADATA +12 -1
  39. vlmparse-0.1.3.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,296 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import tempfile
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import fire
9
+ import pandas as pd
10
+ from huggingface_hub import snapshot_download
11
+ from joblib import Parallel, delayed
12
+ from loguru import logger
13
+ from tqdm import tqdm
14
+
15
+ from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import (
16
+ BaselineTest,
17
+ load_single_test,
18
+ )
19
+ from vlmparse.benchpdf2md.create_dataset import create_dataset
20
+ from vlmparse.benchpdf2md.utils import bootstrap_and_format_results
21
+ from vlmparse.converter_with_server import ConverterWithServer
22
+ from vlmparse.data_model.document import Document
23
+ from vlmparse.servers.utils import get_model_from_uri
24
+
25
+
26
def process_and_run_benchmark(
    model: str | None = None,
    uri: str | None = None,
    retry: str | None = None,
    concurrency: int = 1,
    debug: bool = False,
    gpu: int = 2,
    regenerate: bool = False,
    in_folder: Path | str | None = None,
    save_folder: Path | str | None = None,
    retrylast: bool = False,
    dry_run: bool = True,
    filter_type: str | list[str] | None = None,
    filter_category: str | list[str] | None = None,
    dpi: int | None = None,
    port: int | None = None,
    with_vllm_server: bool = False,
):
    """Convert the benchmark PDFs with the selected model, then score them.

    Args:
        model: Model name. Inferred from ``uri`` when given; defaults to
            ``gemini-2.5-flash-lite`` when neither is provided.
        uri: URI of an already-running model endpoint.
        retry: Path to a previous run folder; already-converted files
            (present as ``results/*.zip``) are skipped.
        concurrency: Number of concurrent conversion requests.
        debug: When True, skip writing xlsx/parquet/json artifacts.
        gpu: GPU index handed to the conversion server.
        regenerate: Re-run conversion even when ``retry`` is given.
        in_folder: Dataset folder or the HF dataset id
            (env ``IN_FOLDER_FR_BENCHMARK``).
        save_folder: Output root (env ``OUT_FOLDER_FR_BENCHMARK``).
        retrylast: Resume from the most recent run under ``save_folder``.
        dry_run: Warm-up conversion of the first 3 files into a temp folder
            before the timed full run.
        filter_type: Restrict tests to these type(s).
        filter_category: Restrict tests to these category(ies).
        dpi: Rasterization DPI; also suffixes the output folder name.
        port: Server port.
        with_vllm_server: Spawn a local vLLM server.

    Raises:
        ValueError: on unknown filter categories, empty file list, or
            ``retrylast`` with no previous runs.
    """
    if in_folder is None:
        in_folder = os.getenv("IN_FOLDER_FR_BENCHMARK", "pulseia/fr-bench-pdf2md")
    if save_folder is None:
        save_folder = os.getenv("OUT_FOLDER_FR_BENCHMARK", ".")

    # BUG FIX: the HF-dataset check must run on the raw string.  The old code
    # converted to Path first and then compared the Path against the repo id,
    # which is never equal, so the dataset was never downloaded.
    if str(in_folder) == "pulseia/fr-bench-pdf2md":
        in_folder = snapshot_download(
            repo_id="pulseia/fr-bench-pdf2md",
            repo_type="dataset",
        )

    in_folder = Path(in_folder)
    save_folder = Path(save_folder)

    # BUG FIX: `retry` arrives as a CLI string but is later combined with the
    # `/` operator (e.g. `retry / "results"`); normalize it to a Path up front.
    if retry is not None:
        retry = Path(retry)

    if uri is not None:
        model = get_model_from_uri(uri)
    if model is None:
        model = "gemini-2.5-flash-lite"

    logger.info(f"In folder: {in_folder}")

    ds = create_dataset(in_folder)

    if filter_type is not None:
        if isinstance(filter_type, str):
            filter_type = [filter_type]
        ds = ds[ds.type.isin(filter_type)]

    if filter_category is not None:
        # BUG FIX: normalize to a list *before* validating; the old code ran
        # the membership check first, which is meaningless for list input.
        # Also raise ValueError instead of assert (asserts vanish under -O).
        if isinstance(filter_category, str):
            filter_category = [filter_category]
        known = set(ds.category.unique())
        unknown = set(filter_category) - known
        if unknown:
            raise ValueError(
                f"Filter category {sorted(unknown)} not in dataset categories: {sorted(known)}"
            )
        ds = ds[ds.category.isin(filter_category)]

    if retrylast:
        retry = save_folder / (model + "_" + str(dpi) if dpi is not None else model)
        previous_runs = sorted(os.listdir(retry))
        if len(previous_runs) > 0:
            retry = retry / previous_runs[-1]
        else:
            raise ValueError("No previous runs found, do not use the retrylast flag")

    # Computed unconditionally: the scoring-only branch below still needs the
    # page count for the metrics report.
    files = list(sorted(set(ds["pdf_path"])))

    if retry is None or regenerate:
        logger.info(f"Number of files to convert: {len(files)}")
        if retry is not None:
            # Skip files already converted in the previous run.
            already_processed = [
                f.removesuffix(".zip") for f in os.listdir(retry / "results")
            ]
            files = [
                f
                for f in files
                if Path(f).name.removesuffix(".pdf") not in already_processed
            ]

        logger.info(f"Number of files after filtering: {len(files)}")

        if len(files) == 0:
            raise ValueError(
                f"No PDF files found in the input folder: {in_folder}\nDataset paths: {ds['pdf_path'][:5]}"
            )

        model_folder = model if dpi is None else model + "_" + str(dpi)
        save_folder = save_folder / model_folder

        batch_parser = ConverterWithServer(
            model=model,
            uri=uri,
            gpus=str(gpu),
            with_vllm_server=with_vllm_server,
            concurrency=concurrency,
            port=port,
        )

        if dry_run:
            # Warm-up pass: exercises the server/pipeline on a throwaway
            # folder so the timed run below is not skewed by startup cost.
            logger.info("Dry run, converting first 3 files")
            batch_parser.parse(
                files[:3],
                out_folder=tempfile.mkdtemp(),
                mode="document",
                dpi=dpi,
                debug=debug,
                retrylast=retrylast,
            )

        tic = time.perf_counter()
        batch_parser.parse(
            files,
            out_folder=str(save_folder),
            mode="document",
            dpi=dpi,
            debug=debug,
            retrylast=retrylast,
        )
        save_folder = batch_parser.get_out_folder()

        total_time = time.perf_counter() - tic
        logger.info(
            f"Time taken to convert {len(files)} files: {total_time:.2f} seconds"
        )
    else:
        # Scoring-only mode: reuse conversions from the given run folder;
        # no timing information is available in that case.
        save_folder = Path(retry)
        total_time = None

    df = run_pb_benchmark(ds, out_folder=save_folder / "results")

    logger.info(
        f"Number of pages: {ds['pdf_path'].unique().shape[0]}, Number of tests: {len(ds)}"
    )
    for col in ["type", "category"]:
        if col in df.columns:
            by_col_df = bootstrap_and_format_results(df, col, "result")
            logger.info(f"By {col}:\n{by_col_df}")
            if not debug:
                by_col_df.to_excel(save_folder / f"by_{col}.xlsx")

    logger.info("average result:")
    # Baseline tests are excluded from the headline average.
    avg = df.loc[df.type != "baseline"]["result"].mean()
    logger.info(avg)

    if not debug:
        save_folder_test_results = (
            save_folder
            / "test_results"
            / datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")
        )
        save_folder_test_results.mkdir(parents=True, exist_ok=True)
        df.to_parquet(save_folder_test_results / "test_results.parquet")

        with open(save_folder_test_results / "metrics.json", "w") as f:
            metrics = {
                "total_time": total_time,
                "num_pages": len(files),
                "num_tests": len(df),
                "avg_result": avg,
                "avg_doc_latency": df["doc_latency"].mean()
                if "doc_latency" in df.columns
                else None,
                "avg_page_latency": df["page_latency"].mean()
                if "page_latency" in df.columns
                else None,
                "avg_time_per_page": total_time / len(files)
                if total_time is not None
                else None,
                "dpi": dpi,
                "concurrency": concurrency,
                "model": model,
            }
            # Per-category mean results, keyed by category name.
            for k, v in df.groupby("category")["result"].mean().items():
                metrics[k] = v
            json.dump(metrics, f)
208
+
209
+
210
def run_pb_benchmark(
    ds: pd.DataFrame,
    out_folder: Path,
    num_workers: int = -1,
):
    """Score every test in *ds* against the converted documents in *out_folder*.

    Each dataset row is matched (by PDF stem) to a ``*.zip`` document produced
    by the converter.  The row's own test plus a synthetic baseline test are
    then run against the document's markdown.  Rows without a matching zip
    produce failed placeholder results instead.

    Returns a DataFrame with one row per executed test.
    """
    zip_by_stem = {p.stem: p for p in out_folder.rglob("*.zip")}

    # Restrict the lookup to PDFs that actually appear in the dataset and
    # warn once per PDF with no converted document.
    pdf_to_zip = {}
    for pdf_path in ds.pdf_path.unique():
        stem = Path(pdf_path).stem
        if stem in zip_by_stem:
            pdf_to_zip[stem] = zip_by_stem[stem]
        else:
            logger.warning(f"No zip document found for {pdf_path}")

    def worker(row):
        """Run all tests for one dataset row; returns a list of result dicts."""
        zip_path = pdf_to_zip.get(Path(row["pdf_path"]).stem)

        if zip_path is None:
            # No converted document: emit a failed copy of the row plus a
            # failed baseline record so aggregates stay comparable.
            missing_msg = f"No zip document found for {row['pdf_path']}"
            failed_row = row | {
                "result": False,
                "explanation": missing_msg,
                "best_match_score": 0.0,
                "document_processed": False,
            }
            failed_baseline = {
                "result": False,
                "explanation": missing_msg,
                "pdf": row["pdf_path"],
                "page": row["page"],
                "id": f"{Path(row['pdf_path']).stem}-baseline",
                "type": "baseline",
                "category": "baseline",
            }
            return [failed_row, failed_baseline]

        doc = Document.from_zip(zip_path)
        md_text = doc.text
        tests_name = Path(doc.file_path).parent.name

        tests = [
            load_single_test(row),
            BaselineTest(
                pdf=row["pdf_path"],
                page=row["page"],
                id=f"{tests_name}-baseline",
                type="baseline",
                category="baseline",
            ),
        ]

        outcomes = []
        for test in tests:
            passed, explanation, best_match_score = test.run(md_text)
            outcomes.append(
                {
                    "test_id": test.id,
                    "result": passed,
                    "explanation": explanation,
                    "tests_name": tests_name,
                    "pdf_path": str(doc.file_path),
                    "doc_path": str(zip_path),
                    "doc_latency": doc.latency,
                    "page_latency": doc.pages[0].latency,
                    "best_match_score": best_match_score,
                    "document_processed": True,
                }
                | test.model_dump()
            )
        return outcomes

    per_row = Parallel(n_jobs=num_workers)(
        delayed(worker)(row) for row in tqdm(ds.to_dict(orient="records"))
    )

    # Flatten the per-row result lists into one record per executed test.
    flat = [record for batch in per_row for record in batch]
    return pd.DataFrame(flat)
289
+
290
+
291
def main():
    # CLI entry point: python-fire maps the keyword arguments of
    # process_and_run_benchmark onto command-line flags.
    fire.Fire(process_and_run_benchmark)


if __name__ == "__main__":
    main()
@@ -0,0 +1,271 @@
1
+ import argparse
2
+ import subprocess
3
+ import sys
4
+ from glob import glob
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ import streamlit as st
9
+ from huggingface_hub import snapshot_download
10
+ from pypdfium2.internal.bases import uuid
11
+ from streamlit import runtime
12
+
13
+ from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import load_tests, save_tests
14
+ from vlmparse.benchpdf2md.st_visu_benchmark.highligh_text import highlight_text
15
+ from vlmparse.benchpdf2md.st_visu_benchmark.test_form import edit_test_form
16
+ from vlmparse.benchpdf2md.st_visu_benchmark.ui_elements import download_pdf_page
17
+ from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_doc, save_new_test
18
+
19
+
20
@st.cache_data
def load_df(results_file):
    """Load a test-results parquet file, indexed by test id (cached)."""
    frame = pd.read_parquet(results_file)
    return frame.set_index("test_id")
23
+
24
+
25
@st.cache_data
def get_pdf_map(folder: Path) -> dict[str, Path]:
    """Map PDF file names to their paths, searching *folder* recursively (cached)."""
    mapping: dict[str, Path] = {}
    for pdf in Path(folder).rglob("*.pdf"):
        mapping[pdf.name] = pdf
    return mapping
28
+
29
+
30
@st.cache_data
def get_doc_zip_map(folder: Path) -> dict[str, Path]:
    """Map zip file names to their paths, searching *folder* recursively (cached)."""
    mapping: dict[str, Path] = {}
    for archive in Path(folder).rglob("*.zip"):
        mapping[archive.name] = archive
    return mapping
33
+
34
+
35
def run_streamlit(folder: str, dataset_path="pulseia/fr-bench-pdf2md") -> None:
    """Streamlit page for browsing and curating benchmark test results.

    Shows one test at a time (filtered by type/category/failure state in the
    sidebar), the converted markdown with reference strings highlighted, and
    widgets to run, validate, edit, reject, or add tests.

    Args:
        folder: Root folder containing per-model prediction runs
            (``<model>/<date>/test_results/<date>/test_results.parquet``).
        dataset_path: Local dataset folder, or the HF dataset id
            "pulseia/fr-bench-pdf2md" (downloaded on demand).

    NOTE(review): the indentation below was reconstructed from a flattened
    diff; confirm the sidebar/body split against the original file.
    """
    st.set_page_config(layout="wide")
    preds_folder = Path(folder)

    # One parquet per benchmark run: <model>/<run-date>/test_results/<ts>/...
    files = glob(str(preds_folder / "**/**/test_results/**/test_results.parquet"))

    if dataset_path == "pulseia/fr-bench-pdf2md":
        local_folder_path = snapshot_download(
            repo_id="pulseia/fr-bench-pdf2md",
            repo_type="dataset",
        )
        dataset_path = local_folder_path

    tests = glob(str(Path(dataset_path) / "**/*.jsonl"), recursive=True)

    # Test definition files are keyed by their parent folder name,
    # which matches the `tests_name` column of the results parquet.
    map_tests = {Path(t).parent.name: t for t in tests}
    with st.sidebar:
        # (model, run-date, results-timestamp) triples derived from each
        # parquet path, offered as the run selector.
        sel_folders = [
            (
                Path(f).parent.parent.parent.parent.name,
                Path(f).parent.parent.parent.name,
                Path(f).parent.name,
            )
            for f in files
        ]

        if len(sel_folders) == 0:
            st.error(f"No results found in folder {preds_folder}")
            return
        pipe_folder, date1, date2 = st.selectbox("Dir", sel_folders, index=0)
        res_folder = preds_folder / pipe_folder / date1 / "test_results" / date2
        df = load_df(res_folder / "test_results.parquet")

        test_type = st.selectbox("Test type", ["present", "absent", "order", "table"])
        # Older result files may predate the category column; normalize to str
        # so selectbox options and the equality filter below behave uniformly.
        if "category" not in df.columns:
            df["category"] = None
        df["category"] = df["category"].map(str)
        test_category = st.selectbox("Test category", df.category.map(str).unique())

        only_failed = st.checkbox("Only failed", value=False)
        only_not_checked = st.checkbox("Only not checked", value=False)

        display_image = st.checkbox("Display image", value=False)

        # Re-point preds_folder at the converted-document zips of this run.
        preds_folder = preds_folder / pipe_folder / date1 / "results"

        df_sel = df.loc[(df.type == test_type) & (df.category == test_category)]
        if only_failed:
            df_sel = df_sel[~df_sel.result]
        if only_not_checked:
            # `!= True` keeps NaN/missing `checked` values too.
            df_sel = df_sel[df_sel.checked != True]  # noqa: E712

        if df_sel.shape[0] == 0:
            st.markdown("No failed tests found")
            st.stop()
        idx = st.number_input(
            f"Test index (out of {df_sel.shape[0]})",
            value=0,
            min_value=0,
            max_value=df_sel.shape[0] - 1,
            step=1,
        )

        row = df_sel.iloc[idx]

        display_markdown = st.checkbox("Display markdown", value=True)
        show_layout = st.checkbox("Show layout", value=False)
        display_original_text = st.checkbox("Display original text", value=False)
        pdf_map = get_pdf_map(Path(dataset_path))

        # Results store absolute paths; match on the file name only so runs
        # remain viewable after the dataset is moved or re-downloaded.
        pdf_path = pdf_map[row.pdf_path.split("/")[-1]]

        download_pdf_page(pdf_path, page_no=0, file_name=f"{row.tests_name}.pdf")

    doc_path = get_doc_zip_map(preds_folder)[row.doc_path.split("/")[-1]]
    doc = get_doc(doc_path)

    col1_head, col2_head = st.columns(2)
    with col1_head:
        # Placeholder filled later with the Validate/Reject/Suppress buttons.
        pos_buttons = st.container()
        st.markdown(f"Test: {row.id}" + ("✅" if row.checked else ""))
        st.markdown("Success: " + str(row.result))
        st.markdown("Reason: " + row.explanation)

    tests_path = map_tests[row.tests_name]

    # (Re)load the editable test list whenever the selected test file changes.
    if (
        "tests" not in st.session_state
        or st.session_state.get("current_tests_path") != tests_path
    ):
        st.session_state["tests"] = load_tests(tests_path)
        st.session_state["current_tests_path"] = tests_path

    # NOTE(review): dead branch — the block above always leaves
    # "current_tests_path" set, so this condition is never true.
    if "current_tests_path" not in st.session_state:
        st.session_state["current_tests_path"] = tests_path

    if display_original_text:
        res = doc.pages[0].text
    else:

        # Cached by doc_path so reruns don't re-extract the page text.
        @st.cache_data
        def get_doc_page_md(doc_path):
            return doc.pages[0].text

        res = get_doc_page_md(row.doc_path)

    with col2_head:
        _tests = [test for test in st.session_state["tests"] if test.id == row.id]

        if len(_tests) < 1:
            st.error("No test found")
            st.stop()
        elif len(_tests) > 1:
            # Duplicate ids are reported but the first match is still used.
            st.error("Multiple tests found")
        test_obj = _tests[0]

        if st.button("Run test"):
            success, message, best_match_score = test_obj.run(res)
            st.markdown(f"Success: {success}, score: {best_match_score:.3f}")
            st.markdown(message)

        add_presence_test = st.checkbox("Add presence test")
        if add_presence_test:
            # Local import avoids a module-level cycle with the tests module.
            from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import TextPresenceTest

            test_obj_edited = edit_test_form(
                TextPresenceTest(
                    pdf=row.pdf_path,
                    page=0,
                    id=f"presence_test_{uuid.uuid4()}",
                    type="present",
                    text="",
                ),
                "present",
            )
            if test_obj_edited is not None:
                # New test is only appended in memory here; it is persisted
                # by a later save (NOTE(review): confirm this is intended).
                st.session_state["tests"].append(test_obj_edited)
        else:
            test_obj_edited = edit_test_form(
                test_obj,
                test_type,
            )

            if test_obj_edited is not None:
                save_new_test(
                    st.session_state["tests"],
                    test_obj_edited,
                    st.session_state["current_tests_path"],
                )

    col1_button, col2_button, col3_button = pos_buttons.columns(3)

    with col1_button:
        if st.button("✅ Validate"):
            # Mark the test as human-checked and persist the whole list.
            test_obj.checked = True
            save_new_test(
                st.session_state["tests"],
                test_obj,
                st.session_state["current_tests_path"],
            )

    with col2_button:
        if test_type != "baseline":
            if st.button("❌ Reject"):
                # Remove the test from the list and rewrite the test file.
                st.session_state["tests"] = [
                    test for test in st.session_state["tests"] if test.id != row.id
                ]
                save_tests(
                    st.session_state["tests"], st.session_state["current_tests_path"]
                )
    with col3_button:
        if st.button("Supress page (Warning, this is irreversible)"):
            import shutil

            # Deletes the whole page folder from the dataset on disk.
            shutil.rmtree(Path(row.pdf_path).parent)

    def show_text(res):
        # Render the page text with the test's reference strings highlighted.
        if test_obj:
            res = highlight_text(test_obj, res)

        with st.container(height=700):
            if display_markdown:
                st.markdown(res, unsafe_allow_html=True)
            else:
                st.text(res)

    if display_image:
        with col1_head:
            show_text(res)

        with col2_head:

            # Cache key includes run/test identifiers so switching tests
            # refreshes the rendered page image.
            @st.cache_data
            def get_image(pipe_folder, date, test_id, show_layout):
                return doc.pages[0].image

            st.image(get_image(pipe_folder, date1, row.id, show_layout))
    else:
        show_text(res)
237
+
238
+
239
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Streamlit document viewer."""
    cli = argparse.ArgumentParser(description="Document viewer with Streamlit")
    cli.add_argument(
        "folder", type=str, nargs="?", default=".", help="Root folder path"
    )
    cli.add_argument(
        "--ds", type=str, default="pulseia/fr-bench-pdf2md", help="Dataset path"
    )
    return cli.parse_args()
249
+
250
+
251
def main() -> None:
    """Entry point: render in-process when already inside a Streamlit
    runtime, otherwise relaunch this script via ``streamlit run``."""
    args = parse_args()

    if runtime.exists():
        # Already running under Streamlit: render the viewer directly.
        run_streamlit(args.folder, dataset_path=args.ds)
        return

    cmd = [sys.executable, "-m", "streamlit", "run", __file__, "--", args.folder]
    try:
        subprocess.run(cmd, check=True)
    except KeyboardInterrupt:
        print("\nStreamlit app terminated by user.")
    except subprocess.CalledProcessError as e:
        print(f"Error while running Streamlit: {e}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,117 @@
1
+ import streamlit as st
2
+ from rapidfuzz import process
3
+ from rapidfuzz.distance import Levenshtein
4
+
5
+
6
class Match:
    """A fuzzy-match location: the [start, end) span in the searched text
    plus the Levenshtein edit distance of that window vs. the pattern."""

    # Fixed attribute set: saves per-instance memory when many matches exist.
    __slots__ = ("start", "end", "dist")

    def __init__(self, start, end, dist):
        self.start = start  # inclusive start offset in the searched text
        self.end = end  # exclusive end offset
        self.dist = dist  # edit distance of the window vs. the pattern

    def __repr__(self):
        # Debug-friendly representation (the original class had none).
        return f"Match(start={self.start}, end={self.end}, dist={self.dist})"
11
+
12
+
13
def find_near_matches(pattern: str, text: str, max_l_dist: int):
    """Approximate substring search.

    Slides windows of width len(pattern)-1 .. len(pattern)+1 over *text* and
    keeps, per window width, the single best Levenshtein match whose edit
    distance does not exceed *max_l_dist*.

    Returns a list of Match objects (at most one per window width; spans may
    overlap).  Empty pattern or text yields an empty list.
    """
    if not pattern or not text:
        return []

    found = []
    base = len(pattern)

    for width in (base, base - 1, base + 1):
        # Skip degenerate widths and widths longer than the text itself.
        if width <= 0 or width > len(text):
            continue

        offsets = range(len(text) - width + 1)
        windows = [text[pos : pos + width] for pos in offsets]
        if not windows:
            continue

        best = process.extractOne(pattern, windows, scorer=Levenshtein.distance)
        if not best:
            continue

        _, raw_score, best_idx = best
        distance = int(raw_score)
        if distance <= max_l_dist:
            start = offsets[best_idx]
            found.append(Match(start, start + width, distance))

    return found
42
+
43
+
44
@st.cache_data
def highlight_text(test_obj, res):
    """Wrap fuzzy matches of the test's reference strings in colored <span>s.

    Collects every reference field present on *test_obj*, fuzzy-locates each
    in *res*, drops overlapping spans (best edit distance wins), and splices
    HTML highlight spans into the text.  Returns the annotated string.
    """
    # Asymmetry is intentional: "text"/"before"/"after" are collected whenever
    # the attribute exists (even if empty — filtered later), while the
    # table-test fields are collected only when truthy.
    candidates = []
    for attr in ("text", "before", "after"):
        if hasattr(test_obj, attr):
            candidates.append((attr, getattr(test_obj, attr)))
    for attr in ("cell", "up", "down", "left", "right", "top_heading", "left_heading"):
        if getattr(test_obj, attr, None):
            candidates.append((attr, getattr(test_obj, attr)))

    spans = []
    for label, needle in candidates:
        if needle and needle.strip():
            # Allowed edit distance scales with the needle, capped at 20.
            for hit in find_near_matches(needle, res, max_l_dist=min(20, len(needle) // 2)):
                spans.append((hit.start, hit.end, label, hit.dist))

    def _drop_overlaps(all_spans):
        # Greedy selection: best (lowest) edit distance first, earlier start
        # as tiebreak; a span is kept only if it overlaps nothing kept so far.
        kept = []
        for span in sorted(all_spans, key=lambda s: (s[3], s[0])):
            s1, e1 = span[0], span[1]
            if all(e1 <= s2 or e2 <= s1 for s2, e2, _, _ in kept):
                kept.append(span)
        return kept

    spans = _drop_overlaps(spans)
    # Splice from the end of the string backwards so earlier offsets stay valid.
    spans.sort(key=lambda s: s[0], reverse=True)

    palette = [
        "yellow",
        "lightgreen",
        "lightblue",
        "lightcoral",
        "lightyellow",
        "lightpink",
        "lightgray",
        "lavender",
        "peachpuff",
        "palegreen",
    ]
    color_of = {}

    for start, end, label, dist in spans:
        if label not in color_of:
            # One stable color per label, assigned on first use.
            color_of[label] = palette[len(color_of) % len(palette)]
        color = color_of[label]
        res = (
            res[:start]
            + f'<span style="background-color: {color}; font-weight: bold;" title="{label}: edit distance={dist}">{res[start:end]}</span>'
            + res[end:]
        )
    return res