vlmparse 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. vlmparse/clients/docling.py +2 -2
  2. vlmparse/clients/dotsocr.py +11 -2
  3. vlmparse/clients/mineru.py +8 -7
  4. vlmparse/clients/openai_converter.py +1 -0
  5. vlmparse/converter_with_server.py +5 -4
  6. vlmparse/registries.py +2 -4
  7. vlmparse/servers/docker_server.py +1 -1
  8. vlmparse/servers/utils.py +3 -2
  9. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
  10. vlmparse-0.1.5.dist-info/RECORD +36 -0
  11. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  12. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  13. vlmparse/benchpdf2md/create_dataset.py +0 -60
  14. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  15. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  16. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  17. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  18. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  19. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  20. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  21. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  22. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  23. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  24. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  25. vlmparse/benchpdf2md/utils.py +0 -56
  26. vlmparse-0.1.4.dist-info/RECORD +0 -51
  27. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
  28. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
  29. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
  30. {vlmparse-0.1.4.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
@@ -1,95 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- def edit_test_form(test_obj, test_type):
5
- st.markdown("### Edit Test Fields")
6
- with st.form("edit_test_fields"):
7
- type_fields = {}
8
- type_fields["max_diffs"] = st.number_input(
9
- "Max Diffs", value=test_obj.max_diffs, min_value=0, step=1
10
- )
11
- type_fields["unidecode"] = st.checkbox("Unidecode", value=test_obj.unidecode)
12
- type_fields["alphanum"] = st.checkbox("Alphanum", value=test_obj.alphanum)
13
- type_fields["ignore_str"] = st.text_input(
14
- "Ignore strings (seperarated by spaces)",
15
- value=" ".join(test_obj.ignore_str),
16
- )
17
- type_fields["ignore_space"] = st.checkbox(
18
- "Ignore space", value=test_obj.ignore_space
19
- )
20
-
21
- type_fields["ignore_str"] = (
22
- type_fields["ignore_str"].split(" ") if type_fields["ignore_str"] else []
23
- )
24
-
25
- if test_type == "present" or test_type == "absent":
26
- type_fields["text"] = st.text_area(
27
- "Text", value=test_obj.text, height="content"
28
- )
29
- layout_cat_options = [
30
- "text",
31
- "footer",
32
- "header",
33
- "footnote",
34
- "image",
35
- "image_caption",
36
- ]
37
-
38
- type_fields["layout_cat"] = st.selectbox(
39
- "Layout Category",
40
- layout_cat_options,
41
- index=layout_cat_options.index(test_obj.layout_cat),
42
- )
43
- type_fields["case_sensitive"] = st.checkbox(
44
- "Case Sensitive", value=test_obj.case_sensitive
45
- )
46
- type_fields["first_n"] = st.number_input(
47
- "First N",
48
- value=test_obj.first_n if test_obj.first_n else 0,
49
- min_value=0,
50
- step=100,
51
- )
52
- type_fields["last_n"] = st.number_input(
53
- "Last N",
54
- value=test_obj.last_n if test_obj.last_n else 0,
55
- min_value=0,
56
- step=100,
57
- )
58
- if type_fields["first_n"] == 0:
59
- type_fields["first_n"] = None
60
- if type_fields["last_n"] == 0:
61
- type_fields["last_n"] = None
62
- elif test_type == "order":
63
- type_fields["before"] = st.text_area(
64
- "Before", value=test_obj.before, height="content"
65
- )
66
- type_fields["after"] = st.text_area(
67
- "After", value=test_obj.after, height="content"
68
- )
69
- elif test_type == "table":
70
- type_fields["cell"] = st.text_input("Cell", value=test_obj.cell)
71
- type_fields["up"] = st.text_input(
72
- "Up", value=test_obj.up if test_obj.up else ""
73
- )
74
- type_fields["down"] = st.text_input(
75
- "Down", value=test_obj.down if test_obj.down else ""
76
- )
77
- type_fields["left"] = st.text_input(
78
- "Left", value=test_obj.left if test_obj.left else ""
79
- )
80
- type_fields["right"] = st.text_input(
81
- "Right", value=test_obj.right if test_obj.right else ""
82
- )
83
- type_fields["top_heading"] = st.text_input(
84
- "Top Heading",
85
- value=test_obj.top_heading if test_obj.top_heading else "",
86
- )
87
- type_fields["left_heading"] = st.text_input(
88
- "Left Heading",
89
- value=test_obj.left_heading if test_obj.left_heading else "",
90
- )
91
- if st.form_submit_button("Save Changes"):
92
- for field, value in type_fields.items():
93
- setattr(test_obj, field, value)
94
-
95
- return test_obj
@@ -1,20 +0,0 @@
1
- from pathlib import Path
2
- from typing import Optional
3
-
4
- import streamlit as st
5
-
6
- from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes
7
-
8
-
9
- def download_pdf_page(
10
- pdf_path: Path, page_no: int = 0, file_name: Optional[str] = None
11
- ):
12
- pdf_bytes = get_pdf_bytes(pdf_path, page_no)
13
- if pdf_bytes:
14
- st.download_button(
15
- label="📄 Download PDF Page",
16
- data=pdf_bytes,
17
- file_name=file_name if file_name else f"{pdf_path.stem}.pdf",
18
- mime="application/pdf",
19
- use_container_width=True,
20
- )
@@ -1,50 +0,0 @@
1
- import io
2
- from pathlib import Path
3
-
4
- import pypdfium2 as pdfium
5
- import streamlit as st
6
-
7
- from vlmparse.data_model.document import Document
8
-
9
-
10
- @st.cache_data
11
- def get_pdf_bytes(pdf_path, page_no=0):
12
- pdf_reader = pdfium.PdfDocument(pdf_path)
13
- if page_no >= len(pdf_reader):
14
- pdf_reader.close()
15
- return None
16
-
17
- # Create a new PDF
18
- new_pdf = pdfium.PdfDocument.new()
19
-
20
- # Import the chosen page into the new PDF
21
- new_pdf.import_pages(pdf_reader, pages=[page_no])
22
-
23
- bytes_io = io.BytesIO()
24
- # Get bytes
25
- new_pdf.save(bytes_io)
26
-
27
- pdf_bytes = bytes_io.getvalue()
28
-
29
- # Clean up
30
- new_pdf.close()
31
- pdf_reader.close()
32
-
33
- return pdf_bytes
34
-
35
-
36
- @st.cache_data
37
- def get_doc(doc_path: Path):
38
- return Document.from_zip(doc_path)
39
-
40
-
41
- def save_new_test(tests, test_obj_edited, test_path):
42
- from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import save_tests
43
-
44
- for test in tests:
45
- if test.id == test_obj_edited.id:
46
- test = test_obj_edited
47
- else:
48
- test = test
49
- save_tests(tests, test_path)
50
- st.success("Test updated successfully!")
@@ -1,56 +0,0 @@
1
- import base64
2
- from io import BytesIO
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from PIL import Image
7
-
8
-
9
- def vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap=1000):
10
- group_col = [group_col] if isinstance(group_col, str) else group_col
11
- grouped = df.groupby(group_col)[value_col]
12
-
13
- def bootstrap_group(group):
14
- values = group.values
15
- n = len(values)
16
- bootstrap_samples = np.random.choice(
17
- values, size=(n_bootstrap, n), replace=True
18
- )
19
- bootstrap_means = np.mean(bootstrap_samples, axis=1)
20
- return pd.Series(
21
- {"mean": np.mean(values), "bootstrap_std": np.std(bootstrap_means)}
22
- )
23
-
24
- result = grouped.apply(bootstrap_group)
25
- return result.unstack(-1)
26
-
27
-
28
- def format_results_vectorized(result_df, precision=2):
29
- means = result_df["mean"].values
30
- margins = 2 * result_df["bootstrap_std"].values
31
-
32
- formatted = np.char.add(
33
- np.char.add(np.round(means, precision).astype(str), " ± "),
34
- np.round(margins, precision).astype(str),
35
- )
36
-
37
- return pd.DataFrame({"formatted_result": formatted}, index=result_df.index)
38
-
39
-
40
- def bootstrap_and_format_results(
41
- df, group_col, value_col, n_bootstrap=1000, precision=2
42
- ):
43
- result_df = vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap)
44
- return format_results_vectorized(result_df, precision)
45
-
46
-
47
- def to_base64(image: Image, extension="PNG"):
48
- img_byte_arr = BytesIO()
49
- image.save(img_byte_arr, format=extension)
50
- img_byte_arr = img_byte_arr.getvalue()
51
- return base64.b64encode(img_byte_arr).decode("utf-8")
52
-
53
-
54
- def from_base64(base64_str: str):
55
- image_data = base64.b64decode(base64_str)
56
- return Image.open(BytesIO(image_data))
@@ -1,51 +0,0 @@
1
- vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
2
- vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
3
- vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
4
- vlmparse/constants.py,sha256=7-47S01n4MI2ebR09bpdOo3_P16d-z-NVGsm6KJP8ls,110
5
- vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
6
- vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
7
- vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
8
- vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
9
- vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
10
- vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
11
- vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
12
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py,sha256=XISZKlN1CiPBGDHgiDkjFkhGWxPxMFRu1GfStiOprmo,69527
13
- vlmparse/benchpdf2md/bench_tests/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py,sha256=SWCfYgAVmtyNypCsEZnuj6HoNGjFjTOqPBn2fCnEsiQ,5509
15
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py,sha256=XnVlN5Sy7mXkr7nqWbcCgROsoavFBXqn4haSoxVsyLM,8495
16
- vlmparse/benchpdf2md/olmocrbench/tests.py,sha256=bszavrGQNCB3l60gPrsMgxtcMJvDjwZBBL_0B_K1oFA,53625
17
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py,sha256=DD9Knd52Ur9WiatymQGI9B1-yJ7OkL2w5E0quITDkPY,72
18
- vlmparse/benchpdf2md/olmocrbench/katex/render.py,sha256=l8mYRnz2I-10RE3JoxYrMfZ2UhxECTiRFazuf-dHnqU,18219
19
- vlmparse/benchpdf2md/st_visu_benchmark/app.py,sha256=ng8o_q20rY88BFQSEi62dBylklV6yyGyrrMiTrl7xtM,8921
20
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py,sha256=vsGtd9nPrtL5uDqALMumVlsllLvVZr2E6_d9TOLQoN4,3915
21
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7rYV_LpslYPzij_gyJEuDP8,3701
22
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
23
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
24
- vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
25
- vlmparse/clients/deepseekocr.py,sha256=rQvaOaPPoDiZ0MzXqfqqH9BgUBfjmlfHu3NlMjSDgiQ,6501
26
- vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
27
- vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
28
- vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
29
- vlmparse/clients/hunyuanocr.py,sha256=Xw0Q1l-3pQzaEgFngnfM8vrSWpnT3I99QvDaGZ8XooM,1712
30
- vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
31
- vlmparse/clients/mineru.py,sha256=OL56O6cMGksoVMVDyMYCQvwIvZEjKiAZXjEMEshqXaY,3549
32
- vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
33
- vlmparse/clients/olmocr.py,sha256=mQEDpfyLY8a80Zlps5mG0QaWytIgnNQZVEVWKWjPIjk,1849
34
- vlmparse/clients/openai_converter.py,sha256=QBHomw3_K_BpMVboaMd4rlhrntsqEFT46kxpg9KqBIs,5636
35
- vlmparse/clients/paddleocrvl.py,sha256=tmaqg3boV4edywiiiNiNiI3dBHi111wz4dFb52OISXw,1376
36
- vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
37
- vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
38
- vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
39
- vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
40
- vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
41
- vlmparse/data_model/document.py,sha256=pdCZvWzRFkez53ZJpNaB4ezUW-OVUlbR3_SBmmgVzGQ,4217
42
- vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71POJA,6608
43
- vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
44
- vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
45
- vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
46
- vlmparse-0.1.4.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
47
- vlmparse-0.1.4.dist-info/METADATA,sha256=72_47P1ER-J8tzlEvE91Xf58u35p5eZZD1VvPbXzrqA,5112
48
- vlmparse-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
- vlmparse-0.1.4.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
50
- vlmparse-0.1.4.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
51
- vlmparse-0.1.4.dist-info/RECORD,,