vlmparse 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/clients/docling.py +2 -2
- vlmparse/clients/dotsocr.py +11 -2
- vlmparse/clients/mineru.py +8 -7
- vlmparse/clients/openai_converter.py +1 -0
- vlmparse/converter_with_server.py +5 -4
- vlmparse/registries.py +2 -4
- vlmparse/servers/docker_server.py +1 -1
- vlmparse/servers/utils.py +3 -2
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
- vlmparse-0.1.5.dist-info/RECORD +36 -0
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse/benchpdf2md/utils.py +0 -56
- vlmparse-0.1.4.dist-info/RECORD +0 -51
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import streamlit as st
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def edit_test_form(test_obj, test_type):
    """Render a Streamlit form for editing the fields of a benchmark test.

    The form always shows the common fields (``max_diffs``, ``unidecode``,
    ``alphanum``, ``ignore_str``, ``ignore_space``).  Extra widgets depend on
    ``test_type``:

    * ``"present"`` / ``"absent"``: text, layout category, case sensitivity
      and the optional ``first_n`` / ``last_n`` limits;
    * ``"order"``: the ``before`` / ``after`` texts;
    * ``"table"``: the cell text and its optional neighbours / headings.

    Any other ``test_type`` only exposes the common fields.

    Args:
        test_obj: Object exposing the attributes read below.  It is mutated
            in place (via ``setattr``) when the form is submitted.
        test_type: Name of the test kind, used to pick the extra widgets.

    Returns:
        The same ``test_obj`` instance, updated if "Save Changes" was pressed.
    """
    st.markdown("### Edit Test Fields")
    with st.form("edit_test_fields"):
        type_fields = {}
        type_fields["max_diffs"] = st.number_input(
            "Max Diffs", value=test_obj.max_diffs, min_value=0, step=1
        )
        type_fields["unidecode"] = st.checkbox("Unidecode", value=test_obj.unidecode)
        type_fields["alphanum"] = st.checkbox("Alphanum", value=test_obj.alphanum)
        ignore_str_raw = st.text_input(
            "Ignore strings (seperarated by spaces)",
            value=" ".join(test_obj.ignore_str),
        )
        type_fields["ignore_space"] = st.checkbox(
            "Ignore space", value=test_obj.ignore_space
        )

        # Whitespace split (no argument) drops the empty entries that
        # ``split(" ")`` produced for repeated, leading or trailing spaces,
        # and already returns [] for an empty input.
        type_fields["ignore_str"] = ignore_str_raw.split()

        if test_type in ("present", "absent"):
            type_fields["text"] = st.text_area(
                "Text", value=test_obj.text, height="content"
            )
            layout_cat_options = [
                "text",
                "footer",
                "header",
                "footnote",
                "image",
                "image_caption",
            ]

            type_fields["layout_cat"] = st.selectbox(
                "Layout Category",
                layout_cat_options,
                index=layout_cat_options.index(test_obj.layout_cat),
            )
            type_fields["case_sensitive"] = st.checkbox(
                "Case Sensitive", value=test_obj.case_sensitive
            )
            # 0 in the widget stands for "no limit" and is stored as None.
            for field, label in (("first_n", "First N"), ("last_n", "Last N")):
                entered = st.number_input(
                    label,
                    value=getattr(test_obj, field) or 0,
                    min_value=0,
                    step=100,
                )
                type_fields[field] = None if entered == 0 else entered
        elif test_type == "order":
            type_fields["before"] = st.text_area(
                "Before", value=test_obj.before, height="content"
            )
            type_fields["after"] = st.text_area(
                "After", value=test_obj.after, height="content"
            )
        elif test_type == "table":
            type_fields["cell"] = st.text_input("Cell", value=test_obj.cell)
            # Optional neighbours/headings: a falsy attribute is shown as "".
            optional_table_fields = (
                ("up", "Up"),
                ("down", "Down"),
                ("left", "Left"),
                ("right", "Right"),
                ("top_heading", "Top Heading"),
                ("left_heading", "Left Heading"),
            )
            for field, label in optional_table_fields:
                type_fields[field] = st.text_input(
                    label, value=getattr(test_obj, field) or ""
                )

        if st.form_submit_button("Save Changes"):
            for field, value in type_fields.items():
                setattr(test_obj, field, value)

    return test_obj
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from typing import Optional
|
|
3
|
-
|
|
4
|
-
import streamlit as st
|
|
5
|
-
|
|
6
|
-
from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def download_pdf_page(
    pdf_path: Path, page_no: int = 0, file_name: Optional[str] = None
):
    """Show a Streamlit download button for one page of a PDF.

    The page bytes come from ``get_pdf_bytes(pdf_path, page_no)``; when that
    call returns a falsy value no button is rendered.

    Args:
        pdf_path: Path of the source PDF.
        page_no: Page index passed to ``get_pdf_bytes``.
        file_name: Name offered for the download; defaults to
            ``"<pdf_path.stem>.pdf"`` when empty or ``None``.
    """
    page_payload = get_pdf_bytes(pdf_path, page_no)
    if not page_payload:
        return

    download_name = file_name or f"{pdf_path.stem}.pdf"
    st.download_button(
        label="📄 Download PDF Page",
        data=page_payload,
        file_name=download_name,
        mime="application/pdf",
        use_container_width=True,
    )
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
import io
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pypdfium2 as pdfium
|
|
5
|
-
import streamlit as st
|
|
6
|
-
|
|
7
|
-
from vlmparse.data_model.document import Document
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@st.cache_data
def get_pdf_bytes(pdf_path, page_no=0):
    """Return one page of a PDF as the bytes of a standalone PDF document.

    Args:
        pdf_path: Location of the source PDF, passed to ``pdfium.PdfDocument``.
        page_no: Index of the page to extract.

    Returns:
        The serialized single-page PDF as ``bytes``, or ``None`` when
        ``page_no`` is greater than or equal to the number of pages.
    """
    pdf_reader = pdfium.PdfDocument(pdf_path)
    # try/finally so both documents are closed even when importing or saving
    # the page raises; the original only closed them on the success path.
    try:
        if page_no >= len(pdf_reader):
            return None

        # Create a new PDF
        new_pdf = pdfium.PdfDocument.new()
        try:
            # Import the chosen page into the new PDF
            new_pdf.import_pages(pdf_reader, pages=[page_no])

            bytes_io = io.BytesIO()
            new_pdf.save(bytes_io)
            return bytes_io.getvalue()
        finally:
            new_pdf.close()
    finally:
        pdf_reader.close()
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@st.cache_data
def get_doc(doc_path: Path):
    """Load a ``Document`` from the zip archive at ``doc_path``.

    The result is cached by Streamlit's ``st.cache_data`` decorator.
    """
    document = Document.from_zip(doc_path)
    return document
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def save_new_test(tests, test_obj_edited, test_path):
    """Persist ``tests`` with the edited test substituted for its original.

    Args:
        tests: Iterable of test objects, each exposing an ``id`` attribute.
        test_obj_edited: The edited test; it replaces every entry of
            ``tests`` that has the same ``id``.
        test_path: Destination passed through to ``save_tests``.
    """
    from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import save_tests

    # The previous loop only rebound its loop variable, so the edited test
    # never made it into the saved collection unless it was already the very
    # same object.  Build the list to save explicitly; the caller's ``tests``
    # is left untouched.
    updated_tests = [
        test_obj_edited if test.id == test_obj_edited.id else test
        for test in tests
    ]
    save_tests(updated_tests, test_path)
    st.success("Test updated successfully!")
|
vlmparse/benchpdf2md/utils.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import base64
|
|
2
|
-
from io import BytesIO
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from PIL import Image
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap=1000):
    """Compute, per group, the mean and a bootstrap estimate of its spread.

    For every group of ``df`` (grouped by ``group_col``), the values of
    ``value_col`` are resampled with replacement ``n_bootstrap`` times using
    ``np.random.choice``; the standard deviation of the resampled means is
    reported next to the plain mean of the group.

    Args:
        df: Input data frame.
        group_col: Column name, or list of column names, to group by.
        value_col: Column holding the values to summarize.
        n_bootstrap: Number of bootstrap resamples drawn per group.

    Returns:
        The grouped result after ``unstack(-1)``, with the ``"mean"`` and
        ``"bootstrap_std"`` statistics as columns.
    """
    keys = group_col if not isinstance(group_col, str) else [group_col]

    def _summarize(series):
        data = series.values
        sample_size = len(data)
        # One row per resample, each row as long as the group itself.
        resamples = np.random.choice(
            data, size=(n_bootstrap, sample_size), replace=True
        )
        resample_means = np.mean(resamples, axis=1)
        stats = {
            "mean": np.mean(data),
            "bootstrap_std": np.std(resample_means),
        }
        return pd.Series(stats)

    per_group = df.groupby(keys)[value_col].apply(_summarize)
    return per_group.unstack(-1)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def format_results_vectorized(result_df, precision=2):
    """Format bootstrap results as ``"<mean> ± <margin>"`` strings.

    The margin is twice the ``"bootstrap_std"`` column; both numbers are
    rounded to ``precision`` decimals before being converted to text.

    Args:
        result_df: Frame with ``"mean"`` and ``"bootstrap_std"`` columns.
        precision: Number of decimals kept by ``np.round``.

    Returns:
        A data frame with a single ``"formatted_result"`` column that shares
        the index of ``result_df``.
    """
    center_text = np.round(result_df["mean"].values, precision).astype(str)
    margin_values = 2 * result_df["bootstrap_std"].values
    margin_text = np.round(margin_values, precision).astype(str)

    with_separator = np.char.add(center_text, " ± ")
    labels = np.char.add(with_separator, margin_text)

    return pd.DataFrame({"formatted_result": labels}, index=result_df.index)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def bootstrap_and_format_results(
    df, group_col, value_col, n_bootstrap=1000, precision=2
):
    """Run the grouped bootstrap and return its formatted summary.

    Chains ``vectorized_bootstrap_grouped_std`` and
    ``format_results_vectorized``; see those functions for the meaning of
    the arguments.
    """
    stats_by_group = vectorized_bootstrap_grouped_std(
        df, group_col, value_col, n_bootstrap
    )
    formatted = format_results_vectorized(stats_by_group, precision)
    return formatted
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def to_base64(image: Image, extension="PNG"):
    """Encode an image as a base64 text string.

    Args:
        image: Object whose ``save(fp, format=...)`` method writes the
            encoded image into a binary buffer.
        extension: Format name forwarded to ``image.save``.

    Returns:
        The base64 encoding of the saved image bytes, decoded as UTF-8 text.
    """
    buffer = BytesIO()
    image.save(buffer, format=extension)
    raw_bytes = buffer.getvalue()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def from_base64(base64_str: str):
    """Decode a base64 string and open the resulting bytes with ``Image.open``.

    Args:
        base64_str: Base64-encoded image data.

    Returns:
        Whatever ``Image.open`` returns for the decoded in-memory buffer.
    """
    decoded = base64.b64decode(base64_str)
    stream = BytesIO(decoded)
    return Image.open(stream)
|
vlmparse-0.1.4.dist-info/RECORD
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
|
|
2
|
-
vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
|
|
3
|
-
vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
|
|
4
|
-
vlmparse/constants.py,sha256=7-47S01n4MI2ebR09bpdOo3_P16d-z-NVGsm6KJP8ls,110
|
|
5
|
-
vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
|
|
6
|
-
vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
|
|
7
|
-
vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
|
|
8
|
-
vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
|
|
9
|
-
vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
|
|
10
|
-
vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
|
|
11
|
-
vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
|
|
12
|
-
vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py,sha256=XISZKlN1CiPBGDHgiDkjFkhGWxPxMFRu1GfStiOprmo,69527
|
|
13
|
-
vlmparse/benchpdf2md/bench_tests/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
vlmparse/benchpdf2md/olmocrbench/repeatdetect.py,sha256=SWCfYgAVmtyNypCsEZnuj6HoNGjFjTOqPBn2fCnEsiQ,5509
|
|
15
|
-
vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py,sha256=XnVlN5Sy7mXkr7nqWbcCgROsoavFBXqn4haSoxVsyLM,8495
|
|
16
|
-
vlmparse/benchpdf2md/olmocrbench/tests.py,sha256=bszavrGQNCB3l60gPrsMgxtcMJvDjwZBBL_0B_K1oFA,53625
|
|
17
|
-
vlmparse/benchpdf2md/olmocrbench/katex/__init__.py,sha256=DD9Knd52Ur9WiatymQGI9B1-yJ7OkL2w5E0quITDkPY,72
|
|
18
|
-
vlmparse/benchpdf2md/olmocrbench/katex/render.py,sha256=l8mYRnz2I-10RE3JoxYrMfZ2UhxECTiRFazuf-dHnqU,18219
|
|
19
|
-
vlmparse/benchpdf2md/st_visu_benchmark/app.py,sha256=ng8o_q20rY88BFQSEi62dBylklV6yyGyrrMiTrl7xtM,8921
|
|
20
|
-
vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py,sha256=vsGtd9nPrtL5uDqALMumVlsllLvVZr2E6_d9TOLQoN4,3915
|
|
21
|
-
vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7rYV_LpslYPzij_gyJEuDP8,3701
|
|
22
|
-
vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
|
|
23
|
-
vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
|
|
24
|
-
vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
|
|
25
|
-
vlmparse/clients/deepseekocr.py,sha256=rQvaOaPPoDiZ0MzXqfqqH9BgUBfjmlfHu3NlMjSDgiQ,6501
|
|
26
|
-
vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
|
|
27
|
-
vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
|
|
28
|
-
vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
|
|
29
|
-
vlmparse/clients/hunyuanocr.py,sha256=Xw0Q1l-3pQzaEgFngnfM8vrSWpnT3I99QvDaGZ8XooM,1712
|
|
30
|
-
vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
|
|
31
|
-
vlmparse/clients/mineru.py,sha256=OL56O6cMGksoVMVDyMYCQvwIvZEjKiAZXjEMEshqXaY,3549
|
|
32
|
-
vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
|
|
33
|
-
vlmparse/clients/olmocr.py,sha256=mQEDpfyLY8a80Zlps5mG0QaWytIgnNQZVEVWKWjPIjk,1849
|
|
34
|
-
vlmparse/clients/openai_converter.py,sha256=QBHomw3_K_BpMVboaMd4rlhrntsqEFT46kxpg9KqBIs,5636
|
|
35
|
-
vlmparse/clients/paddleocrvl.py,sha256=tmaqg3boV4edywiiiNiNiI3dBHi111wz4dFb52OISXw,1376
|
|
36
|
-
vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
|
|
37
|
-
vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
|
|
38
|
-
vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
|
|
39
|
-
vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
|
|
40
|
-
vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
|
|
41
|
-
vlmparse/data_model/document.py,sha256=pdCZvWzRFkez53ZJpNaB4ezUW-OVUlbR3_SBmmgVzGQ,4217
|
|
42
|
-
vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71POJA,6608
|
|
43
|
-
vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
|
|
44
|
-
vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
|
|
45
|
-
vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
|
|
46
|
-
vlmparse-0.1.4.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
|
|
47
|
-
vlmparse-0.1.4.dist-info/METADATA,sha256=72_47P1ER-J8tzlEvE91Xf58u35p5eZZD1VvPbXzrqA,5112
|
|
48
|
-
vlmparse-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
49
|
-
vlmparse-0.1.4.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
|
|
50
|
-
vlmparse-0.1.4.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
|
|
51
|
-
vlmparse-0.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|