vlmparse 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/METADATA +12 -1
  39. vlmparse-0.1.3.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,80 @@
1
+ import argparse
2
+ import subprocess
3
+ import sys
4
+
5
+ import streamlit as st
6
+ from streamlit import runtime
7
+
8
+ from vlmparse.data_model.document import Document
9
+ from vlmparse.st_viewer.fs_nav import file_selector
10
+
11
+ st.set_page_config(layout="wide")
12
+
13
+
14
@st.cache_resource
def get_doc(file_path):
    """Load the document archive at ``file_path``.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can reuse the loaded object instead of calling ``Document.from_zip``
    again for the same path.

    Args:
        file_path: Path of the zip archive to load.

    Returns:
        Whatever ``Document.from_zip`` returns for that path.
    """
    document = Document.from_zip(file_path)
    return document
17
+
18
+
19
def render_sidebar_controls(doc, file_path):
    """Create the viewer widgets and collect their current values.

    Args:
        doc: Loaded document; only ``len(doc.pages)`` is read here.
        file_path: Path of the selected file (not used by the current
            controls).

    Returns:
        A dict with ``"page_no"`` (value of the "Page" number input, which
        ranges from 0 to the last page index) and ``"plot_layouts"`` (state
        of the "Plot layouts" checkbox, ticked by default).
    """
    last_page_index = len(doc.pages) - 1
    # Widgets are created in the same order as they appear in the returned dict.
    page_no = st.number_input("Page", 0, last_page_index, 0)
    plot_layouts = st.checkbox("Plot layouts", value=True)
    return {"page_no": page_no, "plot_layouts": plot_layouts}
25
+
26
+
27
def run_streamlit(folder: str) -> None:
    """Render the document viewer for a file chosen under ``folder``.

    The sidebar hosts the file selector and, once a document is loaded, the
    page/layout controls. The main area shows the selected page's text in
    the left column and its image (optionally with layout boxes) in the
    right column.

    Args:
        folder: Root folder handed to ``file_selector``.
    """
    with st.sidebar:
        file_path = file_selector(folder)

    if not file_path:
        st.info("Please select a file from the sidebar.")
        return

    doc = get_doc(file_path)

    # A document without pages would give the page selector an upper bound
    # of -1 (below its minimum of 0) and leave nothing to index below, so
    # report it explicitly instead of failing while rendering.
    if len(doc.pages) == 0:
        st.warning("The selected document has no pages.")
        return

    with st.sidebar:
        settings = render_sidebar_controls(doc, file_path)

    # Look the page up once; it is needed by both columns.
    page = doc.pages[settings["page_no"]]

    text_col, image_col = st.columns(2)
    with text_col:
        with st.container(height=700):
            st.write(page.text)

    with image_col:
        if settings["plot_layouts"]:
            st.image(page.get_image_with_boxes(layout=True))
        else:
            st.image(page.image)
50
+
51
+
52
def parse_args() -> argparse.Namespace:
    """Read the viewer's command line options from ``sys.argv``.

    Returns:
        A namespace whose ``folder`` attribute is the optional positional
        root folder, defaulting to ``"."`` when it is omitted.
    """
    arg_parser = argparse.ArgumentParser(description="Document viewer with Streamlit")
    folder_options = {
        "type": str,
        "nargs": "?",
        "default": ".",
        "help": "Root folder path",
    }
    arg_parser.add_argument("folder", **folder_options)
    namespace = arg_parser.parse_args()
    return namespace
59
+
60
+
61
def main() -> None:
    """Start the viewer, relaunching through Streamlit when necessary.

    When a Streamlit runtime already exists the app is rendered directly.
    Otherwise this file is re-executed with ``python -m streamlit run``,
    forwarding the folder argument after ``--``.
    """
    folder = parse_args().folder

    # Already running under Streamlit: just draw the app.
    if runtime.exists():
        run_streamlit(folder)
        return

    # Plain Python invocation: hand this same file over to `streamlit run`.
    command = [sys.executable, "-m", "streamlit", "run", __file__, "--", folder]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # Raised because check=True when Streamlit exits with a non-zero status.
        print(f"Error while running Streamlit: {e}")
    except KeyboardInterrupt:
        print("\nStreamlit app terminated by user.")
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.0
3
+ Version: 0.1.3
4
4
  Requires-Python: >=3.12.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
@@ -19,6 +19,7 @@ Requires-Dist: pypdfium2>=4.30.0
19
19
  Requires-Dist: fire>=0.7.1
20
20
  Requires-Dist: lxml>=6.0.2
21
21
  Requires-Dist: tabulate>=0.9.0
22
+ Requires-Dist: beautifulsoup4>=4.14.2
22
23
  Provides-Extra: dev
23
24
  Requires-Dist: jupyter; extra == "dev"
24
25
  Provides-Extra: docling-core
@@ -182,3 +183,13 @@ server.start()
182
183
 
183
184
  server.stop()
184
185
  ```
186
+
187
+
188
+ Converter with automatic server deployment:
189
+
190
+ ```python
191
+ from vlmparse.converter_with_server import ConverterWithServer
192
+
193
+ converter_with_server = ConverterWithServer(model="mineru2.5")
194
+ documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
195
+ ```
@@ -0,0 +1,50 @@
1
+ vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
2
+ vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
3
+ vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
4
+ vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
5
+ vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
6
+ vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
7
+ vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
8
+ vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
9
+ vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
10
+ vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
11
+ vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py,sha256=XISZKlN1CiPBGDHgiDkjFkhGWxPxMFRu1GfStiOprmo,69527
12
+ vlmparse/benchpdf2md/bench_tests/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ vlmparse/benchpdf2md/olmocrbench/repeatdetect.py,sha256=SWCfYgAVmtyNypCsEZnuj6HoNGjFjTOqPBn2fCnEsiQ,5509
14
+ vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py,sha256=XnVlN5Sy7mXkr7nqWbcCgROsoavFBXqn4haSoxVsyLM,8495
15
+ vlmparse/benchpdf2md/olmocrbench/tests.py,sha256=bszavrGQNCB3l60gPrsMgxtcMJvDjwZBBL_0B_K1oFA,53625
16
+ vlmparse/benchpdf2md/olmocrbench/katex/__init__.py,sha256=DD9Knd52Ur9WiatymQGI9B1-yJ7OkL2w5E0quITDkPY,72
17
+ vlmparse/benchpdf2md/olmocrbench/katex/render.py,sha256=l8mYRnz2I-10RE3JoxYrMfZ2UhxECTiRFazuf-dHnqU,18219
18
+ vlmparse/benchpdf2md/st_visu_benchmark/app.py,sha256=ng8o_q20rY88BFQSEi62dBylklV6yyGyrrMiTrl7xtM,8921
19
+ vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py,sha256=vsGtd9nPrtL5uDqALMumVlsllLvVZr2E6_d9TOLQoN4,3915
20
+ vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7rYV_LpslYPzij_gyJEuDP8,3701
21
+ vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
22
+ vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
23
+ vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
24
+ vlmparse/clients/deepseekocr.py,sha256=iCG5wI5yPv98hIPgVJX4gkkkH1OekblZjFhh5ORVWAk,1813
25
+ vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
26
+ vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
27
+ vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
28
+ vlmparse/clients/hunyuanocr.py,sha256=Xw0Q1l-3pQzaEgFngnfM8vrSWpnT3I99QvDaGZ8XooM,1712
29
+ vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
30
+ vlmparse/clients/mineru.py,sha256=OL56O6cMGksoVMVDyMYCQvwIvZEjKiAZXjEMEshqXaY,3549
31
+ vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
32
+ vlmparse/clients/olmocr.py,sha256=mQEDpfyLY8a80Zlps5mG0QaWytIgnNQZVEVWKWjPIjk,1849
33
+ vlmparse/clients/openai_converter.py,sha256=QBHomw3_K_BpMVboaMd4rlhrntsqEFT46kxpg9KqBIs,5636
34
+ vlmparse/clients/paddleocrvl.py,sha256=tmaqg3boV4edywiiiNiNiI3dBHi111wz4dFb52OISXw,1376
35
+ vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
36
+ vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
37
+ vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
38
+ vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
39
+ vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
40
+ vlmparse/data_model/document.py,sha256=pdCZvWzRFkez53ZJpNaB4ezUW-OVUlbR3_SBmmgVzGQ,4217
41
+ vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71POJA,6608
42
+ vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
43
+ vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
44
+ vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
45
+ vlmparse-0.1.3.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
46
+ vlmparse-0.1.3.dist-info/METADATA,sha256=JkSI4uFnnF59WReyhfRFZZVoe6KLk0ZJrjG0FQkUIPI,5112
47
+ vlmparse-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
48
+ vlmparse-0.1.3.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
49
+ vlmparse-0.1.3.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
50
+ vlmparse-0.1.3.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
2
- vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
3
- vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
4
- vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
5
- vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
6
- vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
7
- vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
8
- vlmparse-0.1.0.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
9
- vlmparse-0.1.0.dist-info/METADATA,sha256=4xxtT6rE3pJyqfqbVjl8Llj7C5Az99TeusxXmMHzMMQ,4788
10
- vlmparse-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- vlmparse-0.1.0.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
12
- vlmparse-0.1.0.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
13
- vlmparse-0.1.0.dist-info/RECORD,,