vlmparse-0.1.3-py3-none-any.whl → vlmparse-0.1.5-py3-none-any.whl
This diff shows the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
- vlmparse/build_doc.py +10 -4
- vlmparse/clients/deepseekocr.py +155 -4
- vlmparse/clients/docling.py +2 -2
- vlmparse/clients/dotsocr.py +11 -2
- vlmparse/clients/mineru.py +8 -7
- vlmparse/clients/openai_converter.py +1 -0
- vlmparse/constants.py +2 -0
- vlmparse/converter.py +19 -5
- vlmparse/converter_with_server.py +5 -4
- vlmparse/registries.py +2 -4
- vlmparse/servers/docker_server.py +1 -1
- vlmparse/servers/utils.py +3 -2
- vlmparse/utils.py +2 -2
- {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
- vlmparse-0.1.5.dist-info/RECORD +36 -0
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse/benchpdf2md/utils.py +0 -56
- vlmparse-0.1.3.dist-info/RECORD +0 -50
- {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
--- a/vlmparse/benchpdf2md/run_benchmark.py
+++ /dev/null
@@ -1,296 +0,0 @@
-import datetime
-import json
-import os
-import tempfile
-import time
-from pathlib import Path
-
-import fire
-import pandas as pd
-from huggingface_hub import snapshot_download
-from joblib import Parallel, delayed
-from loguru import logger
-from tqdm import tqdm
-
-from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import (
-    BaselineTest,
-    load_single_test,
-)
-from vlmparse.benchpdf2md.create_dataset import create_dataset
-from vlmparse.benchpdf2md.utils import bootstrap_and_format_results
-from vlmparse.converter_with_server import ConverterWithServer
-from vlmparse.data_model.document import Document
-from vlmparse.servers.utils import get_model_from_uri
-
-
-def process_and_run_benchmark(
-    model: str | None = None,
-    uri: str | None = None,
-    retry: str | None = None,
-    concurrency: int = 1,
-    debug: bool = False,
-    gpu: int = 2,
-    regenerate: bool = False,
-    in_folder: Path | str | None = None,
-    save_folder: Path | str | None = None,
-    retrylast: bool = False,
-    dry_run: bool = True,
-    filter_type: str | list[str] | None = None,
-    filter_category: str | list[str] | None = None,
-    dpi: int | None = None,
-    port: int | None = None,
-    with_vllm_server: bool = False,
-):
-    if in_folder is None:
-        in_folder = os.getenv("IN_FOLDER_FR_BENCHMARK", "pulseia/fr-bench-pdf2md")
-    if save_folder is None:
-        save_folder = os.getenv("OUT_FOLDER_FR_BENCHMARK", ".")
-
-    in_folder = Path(in_folder)
-    save_folder = Path(save_folder)
-
-    if uri is not None:
-        model = get_model_from_uri(uri)
-
-    if model is None:
-        model = "gemini-2.5-flash-lite"
-
-    save_folder = Path(save_folder)
-
-    if in_folder == "pulseia/fr-bench-pdf2md":
-        local_folder_path = snapshot_download(
-            repo_id=in_folder,
-            repo_type="dataset",
-        )
-        in_folder = local_folder_path
-    logger.info(f"In folder: {in_folder}")
-
-    ds = create_dataset(in_folder)
-
-    if filter_type is not None:
-        if isinstance(filter_type, str):
-            filter_type = [filter_type]
-        ds = ds[ds.type.isin(filter_type)]
-
-    if filter_category is not None:
-        assert (
-            filter_category in ds.category.unique()
-        ), f"Filter category {filter_category} not in dataset categories: {ds.category.unique()}"
-        if isinstance(filter_category, str):
-            filter_category = [filter_category]
-        ds = ds[ds.category.isin(filter_category)]
-
-    try:
-        if retrylast:
-            retry = save_folder / (model + "_" + str(dpi) if dpi is not None else model)
-            previous_runs = sorted(os.listdir(retry))
-            if len(previous_runs) > 0:
-                retry = retry / previous_runs[-1]
-            else:
-                raise ValueError(
-                    "No previous runs found, do not use the retrylast flag"
-                )
-        files = list(sorted(set(ds["pdf_path"])))
-        if retry is None or regenerate:
-            files = list(sorted(set(ds["pdf_path"])))
-            logger.info(f"Number of files to convert: {len(files)}")
-            if retry is not None:
-                already_processed = [
-                    f.removesuffix(".zip") for f in os.listdir(retry / "results")
-                ]
-                files = [
-                    f
-                    for f in files
-                    if Path(f).name.removesuffix(".pdf") not in already_processed
-                ]
-
-                logger.info(f"Number of files after filtering: {len(files)}")
-
-            if len(files) == 0:
-                raise ValueError(
-                    f"No PDF files found in the input folder: {in_folder}\nDataset paths: {ds['pdf_path'][:5]}"
-                )
-            model_folder = model
-            if dpi is not None:
-                model_folder = model + "_" + str(dpi)
-            save_folder = save_folder / model_folder
-
-            batch_parser = ConverterWithServer(
-                model=model,
-                uri=uri,
-                gpus=str(gpu),
-                with_vllm_server=with_vllm_server,
-                concurrency=concurrency,
-                port=port,
-            )
-
-            if dry_run:
-                logger.info("Dry run, converting first 3 files")
-                batch_parser.parse(
-                    files[:3],
-                    out_folder=tempfile.mkdtemp(),
-                    mode="document",
-                    dpi=dpi,
-                    debug=debug,
-                    retrylast=retrylast,
-                )
-
-            tic = time.perf_counter()
-            batch_parser.parse(
-                files,
-                out_folder=str(save_folder),
-                mode="document",
-                dpi=dpi,
-                debug=debug,
-                retrylast=retrylast,
-            )
-            save_folder = batch_parser.get_out_folder()
-
-            total_time = time.perf_counter() - tic
-            logger.info(
-                f"Time taken to convert {len(files)} files: {total_time:.2f} seconds"
-            )
-
-        else:
-            save_folder = Path(retry)
-            total_time = None
-
-        df = run_pb_benchmark(ds, out_folder=save_folder / "results")
-
-        logger.info(
-            f"Number of pages: {ds['pdf_path'].unique().shape[0]}, Number of tests: {len(ds)}"
-        )
-        for col in ["type", "category"]:
-            if col in df.columns:
-                by_col_df = bootstrap_and_format_results(df, col, "result")
-                logger.info(f"By {col}:\n{by_col_df}")
-                if not debug:
-                    by_col_df.to_excel(save_folder / f"by_{col}.xlsx")
-
-        logger.info("average result:")
-        avg = df.loc[df.type != "baseline"]["result"].mean()
-        logger.info(avg)
-
-        if not debug:
-            save_folder_test_results = (
-                save_folder
-                / "test_results"
-                / datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")
-            )
-            save_folder_test_results.mkdir(parents=True, exist_ok=True)
-            df.to_parquet(save_folder_test_results / "test_results.parquet")
-
-            with open(save_folder_test_results / "metrics.json", "w") as f:
-                metrics = {
-                    "total_time": total_time,
-                    "num_pages": len(files),
-                    "num_tests": len(df),
-                    "avg_result": avg,
-                    "avg_doc_latency": df["doc_latency"].mean()
-                    if "doc_latency" in df.columns
-                    else None,
-                    "avg_page_latency": df["page_latency"].mean()
-                    if "page_latency" in df.columns
-                    else None,
-                    "avg_time_per_page": total_time / len(files)
-                    if total_time is not None
-                    else None,
-                    "dpi": dpi,
-                    "concurrency": concurrency,
-                    "model": model,
-                }
-                for k, v in df.groupby("category")["result"].mean().items():
-                    metrics[f"{k}"] = v
-                json.dump(metrics, f)
-
-    except Exception:
-        raise
-
-
-def run_pb_benchmark(
-    ds: pd.DataFrame,
-    out_folder: Path,
-    num_workers: int = -1,
-):
-    files = list(out_folder.rglob("*.zip"))
-    stem_to_zip_path = {path.stem: path for path in files}
-    pdf_to_zip = {}
-    for pdf_path in ds.pdf_path.unique():
-        if Path(pdf_path).stem not in stem_to_zip_path.keys():
-            logger.warning(f"No zip document found for {pdf_path}")
-            continue
-        pdf_to_zip[Path(pdf_path).stem] = stem_to_zip_path[Path(pdf_path).stem]
-
-    def worker(row):
-        zip_path = pdf_to_zip.get(Path(row["pdf_path"]).stem)
-
-        if zip_path is None:
-            return [
-                row
-                | {
-                    "result": False,
-                    "explanation": f"No zip document found for {row['pdf_path']}",
-                    "best_match_score": 0.0,
-                    "document_processed": False,
-                },
-                dict(
-                    result=False,
-                    explanation=f"No zip document found for {row['pdf_path']}",
-                    pdf=row["pdf_path"],
-                    page=row["page"],
-                    id=f"{Path(row['pdf_path']).stem}-baseline",
-                    type="baseline",
-                    category="baseline",
-                ),
-            ]
-        doc = Document.from_zip(zip_path)
-        md_text = doc.text
-        tests_name = Path(doc.file_path).parent.name
-        tests = [load_single_test(row)]
-
-        tests.append(
-            BaselineTest(
-                pdf=row["pdf_path"],
-                page=row["page"],
-                id=f"{tests_name}-baseline",
-                type="baseline",
-                category="baseline",
-            )
-        )
-
-        results = []
-
-        for test in tests:
-            passed, explanation, best_match_score = test.run(md_text)
-            _dict = {
-                "test_id": test.id,
-                "result": passed,
-                "explanation": explanation,
-                "tests_name": tests_name,
-                "pdf_path": str(doc.file_path),
-                "doc_path": str(zip_path),
-                "doc_latency": doc.latency,
-                "page_latency": doc.pages[0].latency,
-                "best_match_score": best_match_score,
-                "document_processed": True,
-            } | test.model_dump()
-
-            results.append(_dict)
-
-        return results
-
-    results = Parallel(n_jobs=num_workers)(
-        delayed(worker)(row) for row in tqdm(ds.to_dict(orient="records"))
-    )
-
-    df = pd.DataFrame([r for r in results for r in r])
-
-    return df
-
-
-def main():
-    fire.Fire(process_and_run_benchmark)
-
-
-if __name__ == "__main__":
-    main()
--- a/vlmparse/benchpdf2md/st_visu_benchmark/app.py
+++ /dev/null
@@ -1,271 +0,0 @@
-import argparse
-import subprocess
-import sys
-from glob import glob
-from pathlib import Path
-
-import pandas as pd
-import streamlit as st
-from huggingface_hub import snapshot_download
-from pypdfium2.internal.bases import uuid
-from streamlit import runtime
-
-from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import load_tests, save_tests
-from vlmparse.benchpdf2md.st_visu_benchmark.highligh_text import highlight_text
-from vlmparse.benchpdf2md.st_visu_benchmark.test_form import edit_test_form
-from vlmparse.benchpdf2md.st_visu_benchmark.ui_elements import download_pdf_page
-from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_doc, save_new_test
-
-
-@st.cache_data
-def load_df(results_file):
-    return pd.read_parquet(results_file).set_index("test_id")
-
-
-@st.cache_data
-def get_pdf_map(folder: Path) -> dict[str, Path]:
-    return {path.name: path for path in Path(folder).rglob("*.pdf")}
-
-
-@st.cache_data
-def get_doc_zip_map(folder: Path) -> dict[str, Path]:
-    return {path.name: path for path in Path(folder).rglob("*.zip")}
-
-
-def run_streamlit(folder: str, dataset_path="pulseia/fr-bench-pdf2md") -> None:
-    st.set_page_config(layout="wide")
-    # tests_folder = Path(folder) / "tests"
-    preds_folder = Path(folder)
-
-    # tests = glob(str(tests_folder / "**/**/tests.jsonl"))
-    files = glob(str(preds_folder / "**/**/test_results/**/test_results.parquet"))
-
-    # map_tests = {Path(t).parent.name: t for t in tests}
-    if dataset_path == "pulseia/fr-bench-pdf2md":
-        local_folder_path = snapshot_download(
-            repo_id="pulseia/fr-bench-pdf2md",
-            repo_type="dataset",
-        )
-        dataset_path = local_folder_path
-
-    tests = glob(str(Path(dataset_path) / "**/*.jsonl"), recursive=True)
-
-    map_tests = {Path(t).parent.name: t for t in tests}
-    with st.sidebar:
-        sel_folders = [
-            (
-                Path(f).parent.parent.parent.parent.name,
-                Path(f).parent.parent.parent.name,
-                Path(f).parent.name,
-            )
-            for f in files
-        ]
-
-        if len(sel_folders) == 0:
-            st.error(f"No results found in folder {preds_folder}")
-            return
-        pipe_folder, date1, date2 = st.selectbox("Dir", sel_folders, index=0)
-        res_folder = preds_folder / pipe_folder / date1 / "test_results" / date2
-        df = load_df(res_folder / "test_results.parquet")
-
-        test_type = st.selectbox("Test type", ["present", "absent", "order", "table"])
-        if "category" not in df.columns:
-            df["category"] = None
-        df["category"] = df["category"].map(str)
-        test_category = st.selectbox("Test category", df.category.map(str).unique())
-
-        only_failed = st.checkbox("Only failed", value=False)
-        only_not_checked = st.checkbox("Only not checked", value=False)
-
-        display_image = st.checkbox("Display image", value=False)
-
-        preds_folder = preds_folder / pipe_folder / date1 / "results"
-
-        df_sel = df.loc[(df.type == test_type) & (df.category == test_category)]
-        if only_failed:
-            df_sel = df_sel[~df_sel.result]
-        if only_not_checked:
-            df_sel = df_sel[df_sel.checked != True]  # noqa: E712
-
-        if df_sel.shape[0] == 0:
-            st.markdown("No failed tests found")
-            st.stop()
-        idx = st.number_input(
-            f"Test index (out of {df_sel.shape[0]})",
-            value=0,
-            min_value=0,
-            max_value=df_sel.shape[0] - 1,
-            step=1,
-        )
-
-        row = df_sel.iloc[idx]
-
-        display_markdown = st.checkbox("Display markdown", value=True)
-        show_layout = st.checkbox("Show layout", value=False)
-        display_original_text = st.checkbox("Display original text", value=False)
-    pdf_map = get_pdf_map(Path(dataset_path))
-
-    pdf_path = pdf_map[row.pdf_path.split("/")[-1]]
-
-    download_pdf_page(pdf_path, page_no=0, file_name=f"{row.tests_name}.pdf")
-
-    doc_path = get_doc_zip_map(preds_folder)[row.doc_path.split("/")[-1]]
-    doc = get_doc(doc_path)
-
-    col1_head, col2_head = st.columns(2)
-    with col1_head:
-        pos_buttons = st.container()
-        st.markdown(f"Test: {row.id}" + ("✅" if row.checked else ""))
-        st.markdown("Success: " + str(row.result))
-        st.markdown("Reason: " + row.explanation)
-
-    tests_path = map_tests[row.tests_name]
-
-    if (
-        "tests" not in st.session_state
-        or st.session_state.get("current_tests_path") != tests_path
-    ):
-        st.session_state["tests"] = load_tests(tests_path)
-        st.session_state["current_tests_path"] = tests_path
-
-    if "current_tests_path" not in st.session_state:
-        st.session_state["current_tests_path"] = tests_path
-
-    if display_original_text:
-        res = doc.pages[0].text
-    else:
-
-        @st.cache_data
-        def get_doc_page_md(doc_path):
-            return doc.pages[0].text
-
-        res = get_doc_page_md(row.doc_path)
-
-    with col2_head:
-        _tests = [test for test in st.session_state["tests"] if test.id == row.id]
-
-        if len(_tests) < 1:
-            st.error("No test found")
-            st.stop()
-        elif len(_tests) > 1:
-            st.error("Multiple tests found")
-        test_obj = _tests[0]
-
-        if st.button("Run test"):
-            success, message, best_match_score = test_obj.run(res)
-            st.markdown(f"Success: {success}, score: {best_match_score:.3f}")
-            st.markdown(message)
-
-        add_presence_test = st.checkbox("Add presence test")
-        if add_presence_test:
-            from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import TextPresenceTest
-
-            test_obj_edited = edit_test_form(
-                TextPresenceTest(
-                    pdf=row.pdf_path,
-                    page=0,
-                    id=f"presence_test_{uuid.uuid4()}",
-                    type="present",
-                    text="",
-                ),
-                "present",
-            )
-            if test_obj_edited is not None:
-                st.session_state["tests"].append(test_obj_edited)
-        else:
-            test_obj_edited = edit_test_form(
-                test_obj,
-                test_type,
-            )
-
-            if test_obj_edited is not None:
-                save_new_test(
-                    st.session_state["tests"],
-                    test_obj_edited,
-                    st.session_state["current_tests_path"],
-                )
-
-    col1_button, col2_button, col3_button = pos_buttons.columns(3)
-
-    with col1_button:
-        if st.button("✅ Validate"):
-            test_obj.checked = True
-            save_new_test(
-                st.session_state["tests"],
-                test_obj,
-                st.session_state["current_tests_path"],
-            )
-
-    with col2_button:
-        if test_type != "baseline":
-            if st.button("❌ Reject"):
-                st.session_state["tests"] = [
-                    test for test in st.session_state["tests"] if test.id != row.id
-                ]
-                save_tests(
-                    st.session_state["tests"], st.session_state["current_tests_path"]
-                )
-    with col3_button:
-        if st.button("Supress page (Warning, this is irreversible)"):
-            import shutil
-
-            shutil.rmtree(Path(row.pdf_path).parent)
-
-    def show_text(res):
-        if test_obj:
-            res = highlight_text(test_obj, res)
-
-        with st.container(height=700):
-            if display_markdown:
-                st.markdown(res, unsafe_allow_html=True)
-            else:
-                st.text(res)
-
-    if display_image:
-        with col1_head:
-            show_text(res)
-
-        with col2_head:
-
-            @st.cache_data
-            def get_image(pipe_folder, date, test_id, show_layout):
-                return doc.pages[0].image
-
-            st.image(get_image(pipe_folder, date1, row.id, show_layout))
-    else:
-        show_text(res)
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command line arguments."""
-    parser = argparse.ArgumentParser(description="Document viewer with Streamlit")
-    parser.add_argument(
-        "folder", type=str, nargs="?", default=".", help="Root folder path"
-    )
-    parser.add_argument(
-        "--ds", type=str, default="pulseia/fr-bench-pdf2md", help="Dataset path"
-    )
-    return parser.parse_args()
-
-
-def main() -> None:
-    """Main entry point."""
-    args = parse_args()
-    folder = args.folder
-
-    if runtime.exists():
-        run_streamlit(folder, dataset_path=args.ds)
-    else:
-        try:
-            subprocess.run(
-                [sys.executable, "-m", "streamlit", "run", __file__, "--", folder],
-                check=True,
-            )
-        except KeyboardInterrupt:
-            print("\nStreamlit app terminated by user.")
-        except subprocess.CalledProcessError as e:
-            print(f"Error while running Streamlit: {e}")
-
-
-if __name__ == "__main__":
-    main()
--- a/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import streamlit as st
-from rapidfuzz import process
-from rapidfuzz.distance import Levenshtein
-
-
-class Match:
-    def __init__(self, start, end, dist):
-        self.start = start
-        self.end = end
-        self.dist = dist
-
-
-def find_near_matches(pattern: str, text: str, max_l_dist: int):
-    if not pattern or not text:
-        return []
-
-    matches = []
-    pattern_len = len(pattern)
-
-    for window_size in [pattern_len, pattern_len - 1, pattern_len + 1]:
-        if window_size <= 0 or window_size > len(text):
-            continue
-
-        chunks = [
-            (text[i : i + window_size], i) for i in range(len(text) - window_size + 1)
-        ]
-        if not chunks:
-            continue
-
-        result = process.extractOne(
-            pattern, [c[0] for c in chunks], scorer=Levenshtein.distance
-        )
-
-        if result:
-            matched_text, score, idx = result
-            dist = int(score)
-            if dist <= max_l_dist:
-                start_pos = chunks[idx][1]
-                matches.append(Match(start_pos, start_pos + window_size, dist))
-
-    return matches
-
-
-@st.cache_data
-def highlight_text(test_obj, res):
-    texts_to_highlight = []
-    if hasattr(test_obj, "text"):
-        texts_to_highlight.append(("text", test_obj.text))
-    if hasattr(test_obj, "before"):
-        texts_to_highlight.append(("before", test_obj.before))
-    if hasattr(test_obj, "after"):
-        texts_to_highlight.append(("after", test_obj.after))
-    if hasattr(test_obj, "cell") and test_obj.cell:
-        texts_to_highlight.append(("cell", test_obj.cell))
-    if hasattr(test_obj, "up") and test_obj.up:
-        texts_to_highlight.append(("up", test_obj.up))
-    if hasattr(test_obj, "down") and test_obj.down:
-        texts_to_highlight.append(("down", test_obj.down))
-    if hasattr(test_obj, "left") and test_obj.left:
-        texts_to_highlight.append(("left", test_obj.left))
-    if hasattr(test_obj, "right") and test_obj.right:
-        texts_to_highlight.append(("right", test_obj.right))
-    if hasattr(test_obj, "top_heading") and test_obj.top_heading:
-        texts_to_highlight.append(("top_heading", test_obj.top_heading))
-    if hasattr(test_obj, "left_heading") and test_obj.left_heading:
-        texts_to_highlight.append(("left_heading", test_obj.left_heading))
-
-    matches_with_pos = []
-    for label, txt in texts_to_highlight:
-        if txt and txt.strip():
-            fuzzy_matches = find_near_matches(
-                txt, res, max_l_dist=min(20, len(txt) // 2)
-            )
-            for match in fuzzy_matches:
-                matches_with_pos.append((match.start, match.end, label, match.dist))
-
-    def remove_overlaps(matches):
-        matches = sorted(matches, key=lambda x: (x[3], x[0]))
-        result = []
-        for match in matches:
-            s1, e1, _, _ = match
-            overlapping = False
-            for s2, e2, _, _ in result:
-                if not (e1 <= s2 or e2 <= s1):
-                    overlapping = True
-                    break
-            if not overlapping:
-                result.append(match)
-        return result
-
-    matches_with_pos = remove_overlaps(matches_with_pos)
-    matches_with_pos.sort(key=lambda x: x[0], reverse=True)
-
-    colors = [
-        "yellow",
-        "lightgreen",
-        "lightblue",
-        "lightcoral",
-        "lightyellow",
-        "lightpink",
-        "lightgray",
-        "lavender",
-        "peachpuff",
-        "palegreen",
-    ]
-    label_to_color = {}
-
-    for start, end, label, dist in matches_with_pos:
-        if label not in label_to_color:
-            label_to_color[label] = colors[len(label_to_color) % len(colors)]
-        color = label_to_color[label]
-        res = (
-            res[:start]
-            + f'<span style="background-color: {color}; font-weight: bold;" title="{label}: edit distance={dist}">{res[start:end]}</span>'
-            + res[end:]
-        )
-    return res