vlmparse 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. vlmparse/build_doc.py +10 -4
  2. vlmparse/clients/deepseekocr.py +155 -4
  3. vlmparse/clients/docling.py +2 -2
  4. vlmparse/clients/dotsocr.py +11 -2
  5. vlmparse/clients/mineru.py +8 -7
  6. vlmparse/clients/openai_converter.py +1 -0
  7. vlmparse/constants.py +2 -0
  8. vlmparse/converter.py +19 -5
  9. vlmparse/converter_with_server.py +5 -4
  10. vlmparse/registries.py +2 -4
  11. vlmparse/servers/docker_server.py +1 -1
  12. vlmparse/servers/utils.py +3 -2
  13. vlmparse/utils.py +2 -2
  14. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
  15. vlmparse-0.1.5.dist-info/RECORD +36 -0
  16. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  17. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  18. vlmparse/benchpdf2md/create_dataset.py +0 -60
  19. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  20. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  21. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  22. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  23. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  24. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  25. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  26. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  27. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  28. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  29. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  30. vlmparse/benchpdf2md/utils.py +0 -56
  31. vlmparse-0.1.3.dist-info/RECORD +0 -50
  32. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
  33. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
  34. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
  35. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
@@ -1,95 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
def edit_test_form(test_obj, test_type):
    """Render a Streamlit form for editing a test and apply the edits on submit.

    The form always shows the shared fields (max diffs, unidecode, alphanum,
    ignore strings, ignore space) and then the fields specific to
    ``test_type``: ``"present"``/``"absent"``, ``"order"`` or ``"table"``.
    Any other ``test_type`` only gets the shared fields.

    Args:
        test_obj: Test object whose attributes pre-fill the widgets. It is
            mutated in place when the form is submitted.
        test_type: Name of the test kind, used to pick the extra fields.

    Returns:
        The same ``test_obj``, updated only if "Save Changes" was pressed
        during this run.
    """
    st.markdown("### Edit Test Fields")
    with st.form("edit_test_fields"):
        type_fields = {}
        type_fields["max_diffs"] = st.number_input(
            "Max Diffs", value=test_obj.max_diffs, min_value=0, step=1
        )
        type_fields["unidecode"] = st.checkbox("Unidecode", value=test_obj.unidecode)
        type_fields["alphanum"] = st.checkbox("Alphanum", value=test_obj.alphanum)
        ignore_str_raw = st.text_input(
            "Ignore strings (seperarated by spaces)",
            value=" ".join(test_obj.ignore_str),
        )
        type_fields["ignore_space"] = st.checkbox(
            "Ignore space", value=test_obj.ignore_space
        )

        # split() without an argument collapses runs of whitespace, so a
        # double space does not store an empty "ignore" token, and an empty
        # input yields an empty list.
        type_fields["ignore_str"] = ignore_str_raw.split()

        if test_type in ("present", "absent"):
            type_fields["text"] = st.text_area(
                "Text", value=test_obj.text, height="content"
            )
            layout_cat_options = [
                "text",
                "footer",
                "header",
                "footnote",
                "image",
                "image_caption",
            ]
            # Keep an unlisted category selectable instead of raising
            # ValueError from list.index() or silently replacing it.
            if test_obj.layout_cat not in layout_cat_options:
                layout_cat_options.append(test_obj.layout_cat)

            type_fields["layout_cat"] = st.selectbox(
                "Layout Category",
                layout_cat_options,
                index=layout_cat_options.index(test_obj.layout_cat),
            )
            type_fields["case_sensitive"] = st.checkbox(
                "Case Sensitive", value=test_obj.case_sensitive
            )
            type_fields["first_n"] = st.number_input(
                "First N",
                value=test_obj.first_n if test_obj.first_n else 0,
                min_value=0,
                step=100,
            )
            type_fields["last_n"] = st.number_input(
                "Last N",
                value=test_obj.last_n if test_obj.last_n else 0,
                min_value=0,
                step=100,
            )
            # The widgets show 0 for "not set"; store that back as None.
            if type_fields["first_n"] == 0:
                type_fields["first_n"] = None
            if type_fields["last_n"] == 0:
                type_fields["last_n"] = None
        elif test_type == "order":
            type_fields["before"] = st.text_area(
                "Before", value=test_obj.before, height="content"
            )
            type_fields["after"] = st.text_area(
                "After", value=test_obj.after, height="content"
            )
        elif test_type == "table":
            type_fields["cell"] = st.text_input("Cell", value=test_obj.cell)
            type_fields["up"] = st.text_input(
                "Up", value=test_obj.up if test_obj.up else ""
            )
            type_fields["down"] = st.text_input(
                "Down", value=test_obj.down if test_obj.down else ""
            )
            type_fields["left"] = st.text_input(
                "Left", value=test_obj.left if test_obj.left else ""
            )
            type_fields["right"] = st.text_input(
                "Right", value=test_obj.right if test_obj.right else ""
            )
            type_fields["top_heading"] = st.text_input(
                "Top Heading",
                value=test_obj.top_heading if test_obj.top_heading else "",
            )
            type_fields["left_heading"] = st.text_input(
                "Left Heading",
                value=test_obj.left_heading if test_obj.left_heading else "",
            )

        # Widget values are only copied onto the object on submit.
        if st.form_submit_button("Save Changes"):
            for field, value in type_fields.items():
                setattr(test_obj, field, value)

    return test_obj
@@ -1,20 +0,0 @@
1
- from pathlib import Path
2
- from typing import Optional
3
-
4
- import streamlit as st
5
-
6
- from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes
7
-
8
-
9
def download_pdf_page(
    pdf_path: Path, page_no: int = 0, file_name: Optional[str] = None
):
    """Show a Streamlit download button for a single page of a PDF.

    Args:
        pdf_path: Path of the source PDF.
        page_no: Index of the page passed to ``get_pdf_bytes``.
        file_name: Name offered for the download; defaults to
            ``"<pdf stem>.pdf"`` when empty or ``None``.

    Nothing is rendered when ``get_pdf_bytes`` returns no data.
    """
    page_bytes = get_pdf_bytes(pdf_path, page_no)
    if not page_bytes:
        return

    download_name = file_name or f"{pdf_path.stem}.pdf"
    st.download_button(
        label="📄 Download PDF Page",
        data=page_bytes,
        file_name=download_name,
        mime="application/pdf",
        use_container_width=True,
    )
@@ -1,50 +0,0 @@
1
- import io
2
- from pathlib import Path
3
-
4
- import pypdfium2 as pdfium
5
- import streamlit as st
6
-
7
- from vlmparse.data_model.document import Document
8
-
9
-
10
@st.cache_data
def get_pdf_bytes(pdf_path, page_no=0):
    """Return the bytes of a new one-page PDF holding page ``page_no``.

    Args:
        pdf_path: Location of the source PDF, passed to
            ``pdfium.PdfDocument``.
        page_no: Index of the page to extract.

    Returns:
        The serialized single-page PDF as ``bytes``, or ``None`` when
        ``page_no`` is not smaller than the number of pages.

    Both pdfium documents are closed even if importing or saving raises.
    """
    pdf_reader = pdfium.PdfDocument(pdf_path)
    try:
        if page_no >= len(pdf_reader):
            return None

        # Create a new PDF and import only the chosen page into it.
        new_pdf = pdfium.PdfDocument.new()
        try:
            new_pdf.import_pages(pdf_reader, pages=[page_no])

            bytes_io = io.BytesIO()
            new_pdf.save(bytes_io)
            return bytes_io.getvalue()
        finally:
            # Close the copy before the source it imported from,
            # the same order as before.
            new_pdf.close()
    finally:
        pdf_reader.close()
34
-
35
-
36
@st.cache_data
def get_doc(doc_path: Path):
    """Load a ``Document`` from the zip at ``doc_path`` via ``Document.from_zip``.

    The function is wrapped in ``st.cache_data``.
    """
    document = Document.from_zip(doc_path)
    return document
39
-
40
-
41
def save_new_test(tests, test_obj_edited, test_path):
    """Save ``tests`` with the edited test substituted for its original.

    Args:
        tests: Iterable of test objects, each exposing an ``id`` attribute.
        test_obj_edited: Edited test; it replaces every entry with the
            same ``id``.
        test_path: Destination passed through to ``save_tests``.

    The caller's ``tests`` collection is not modified. The substitution is
    done on a new list that is handed to ``save_tests``. Previously the loop
    only rebound its own variable, so the edited object was never actually
    swapped in.
    """
    # Imported here rather than at module level, as in the original.
    from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import save_tests

    updated_tests = [
        test_obj_edited if test.id == test_obj_edited.id else test
        for test in tests
    ]
    save_tests(updated_tests, test_path)
    st.success("Test updated successfully!")
@@ -1,56 +0,0 @@
1
- import base64
2
- from io import BytesIO
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from PIL import Image
7
-
8
-
9
def vectorized_bootstrap_grouped_std(
    df, group_col, value_col, n_bootstrap=1000, random_state=None
):
    """Compute each group's mean and the bootstrap std of that mean.

    Args:
        df: DataFrame holding the data.
        group_col: Column name, or list of column names, to group by.
        value_col: Column whose values are resampled.
        n_bootstrap: Number of bootstrap resamples drawn per group.
        random_state: Optional seed (or anything accepted by
            ``numpy.random.default_rng``) for reproducible resampling.
            When ``None`` the global ``numpy.random`` state is used,
            exactly as before this parameter existed.

    Returns:
        DataFrame indexed by group with columns ``"mean"`` (plain mean of
        the group's values) and ``"bootstrap_std"`` (standard deviation of
        the resampled means).
    """
    group_col = [group_col] if isinstance(group_col, str) else group_col
    grouped = df.groupby(group_col)[value_col]

    # Both the legacy module and a Generator expose a compatible `choice`.
    sampler = (
        np.random if random_state is None else np.random.default_rng(random_state)
    )

    def bootstrap_group(group):
        values = group.values
        n = len(values)
        # One row per resample, each row the same size as the group.
        bootstrap_samples = sampler.choice(
            values, size=(n_bootstrap, n), replace=True
        )
        bootstrap_means = np.mean(bootstrap_samples, axis=1)
        return pd.Series(
            {"mean": np.mean(values), "bootstrap_std": np.std(bootstrap_means)}
        )

    # apply() yields a Series indexed by (group..., statistic); unstacking
    # the last level turns the statistics into columns.
    result = grouped.apply(bootstrap_group)
    return result.unstack(-1)
26
-
27
-
28
def format_results_vectorized(result_df, precision=2):
    """Format bootstrap statistics as ``"<mean> ± <margin>"`` strings.

    Args:
        result_df: DataFrame with ``"mean"`` and ``"bootstrap_std"`` columns.
        precision: Number of decimals both numbers are rounded to.

    Returns:
        DataFrame with a single ``"formatted_result"`` column and the same
        index as ``result_df``. The margin is twice the bootstrap std.
    """
    mean_text = np.round(result_df["mean"].values, precision).astype(str)
    margin_text = np.round(
        2 * result_df["bootstrap_std"].values, precision
    ).astype(str)

    labels = np.char.add(np.char.add(mean_text, " ± "), margin_text)
    return pd.DataFrame({"formatted_result": labels}, index=result_df.index)
38
-
39
-
40
def bootstrap_and_format_results(
    df, group_col, value_col, n_bootstrap=1000, precision=2
):
    """Bootstrap per-group statistics and return them as formatted strings.

    Runs ``vectorized_bootstrap_grouped_std`` on ``df`` and passes its
    result to ``format_results_vectorized`` with the given ``precision``.
    """
    stats = vectorized_bootstrap_grouped_std(
        df, group_col, value_col, n_bootstrap
    )
    formatted = format_results_vectorized(stats, precision)
    return formatted
45
-
46
-
47
def to_base64(image: Image.Image, extension="PNG") -> str:
    """Encode an image as a base64 string.

    Args:
        image: Image to serialize. The annotation is the ``Image.Image``
            class; the bare ``Image`` name here is the module.
        extension: Format name passed to ``image.save``.

    Returns:
        The base64 text of the encoded image bytes, decoded as UTF-8.
    """
    buffer = BytesIO()
    image.save(buffer, format=extension)
    # Separate name for the raw bytes instead of rebinding the buffer.
    encoded_bytes = buffer.getvalue()
    return base64.b64encode(encoded_bytes).decode("utf-8")
52
-
53
-
54
def from_base64(base64_str: str):
    """Decode a base64 string and open the result as an image.

    The decoded bytes are wrapped in a ``BytesIO`` and handed to
    ``Image.open``.
    """
    raw_bytes = base64.b64decode(base64_str)
    stream = BytesIO(raw_bytes)
    return Image.open(stream)
@@ -1,50 +0,0 @@
1
- vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
2
- vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
3
- vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
4
- vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
5
- vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
6
- vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
7
- vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
8
- vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
9
- vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
10
- vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
11
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py,sha256=XISZKlN1CiPBGDHgiDkjFkhGWxPxMFRu1GfStiOprmo,69527
12
- vlmparse/benchpdf2md/bench_tests/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py,sha256=SWCfYgAVmtyNypCsEZnuj6HoNGjFjTOqPBn2fCnEsiQ,5509
14
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py,sha256=XnVlN5Sy7mXkr7nqWbcCgROsoavFBXqn4haSoxVsyLM,8495
15
- vlmparse/benchpdf2md/olmocrbench/tests.py,sha256=bszavrGQNCB3l60gPrsMgxtcMJvDjwZBBL_0B_K1oFA,53625
16
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py,sha256=DD9Knd52Ur9WiatymQGI9B1-yJ7OkL2w5E0quITDkPY,72
17
- vlmparse/benchpdf2md/olmocrbench/katex/render.py,sha256=l8mYRnz2I-10RE3JoxYrMfZ2UhxECTiRFazuf-dHnqU,18219
18
- vlmparse/benchpdf2md/st_visu_benchmark/app.py,sha256=ng8o_q20rY88BFQSEi62dBylklV6yyGyrrMiTrl7xtM,8921
19
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py,sha256=vsGtd9nPrtL5uDqALMumVlsllLvVZr2E6_d9TOLQoN4,3915
20
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7rYV_LpslYPzij_gyJEuDP8,3701
21
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
22
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
23
- vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
24
- vlmparse/clients/deepseekocr.py,sha256=iCG5wI5yPv98hIPgVJX4gkkkH1OekblZjFhh5ORVWAk,1813
25
- vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
26
- vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
27
- vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
28
- vlmparse/clients/hunyuanocr.py,sha256=Xw0Q1l-3pQzaEgFngnfM8vrSWpnT3I99QvDaGZ8XooM,1712
29
- vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
30
- vlmparse/clients/mineru.py,sha256=OL56O6cMGksoVMVDyMYCQvwIvZEjKiAZXjEMEshqXaY,3549
31
- vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
32
- vlmparse/clients/olmocr.py,sha256=mQEDpfyLY8a80Zlps5mG0QaWytIgnNQZVEVWKWjPIjk,1849
33
- vlmparse/clients/openai_converter.py,sha256=QBHomw3_K_BpMVboaMd4rlhrntsqEFT46kxpg9KqBIs,5636
34
- vlmparse/clients/paddleocrvl.py,sha256=tmaqg3boV4edywiiiNiNiI3dBHi111wz4dFb52OISXw,1376
35
- vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
36
- vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
37
- vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
38
- vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
39
- vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
40
- vlmparse/data_model/document.py,sha256=pdCZvWzRFkez53ZJpNaB4ezUW-OVUlbR3_SBmmgVzGQ,4217
41
- vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71POJA,6608
42
- vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
43
- vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
44
- vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
45
- vlmparse-0.1.3.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
46
- vlmparse-0.1.3.dist-info/METADATA,sha256=JkSI4uFnnF59WReyhfRFZZVoe6KLk0ZJrjG0FQkUIPI,5112
47
- vlmparse-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
48
- vlmparse-0.1.3.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
49
- vlmparse-0.1.3.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
50
- vlmparse-0.1.3.dist-info/RECORD,,