vlmparse 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. vlmparse/build_doc.py +10 -4
  2. vlmparse/clients/deepseekocr.py +155 -4
  3. vlmparse/clients/docling.py +2 -2
  4. vlmparse/clients/dotsocr.py +11 -2
  5. vlmparse/clients/mineru.py +8 -7
  6. vlmparse/clients/openai_converter.py +1 -0
  7. vlmparse/constants.py +2 -0
  8. vlmparse/converter.py +19 -5
  9. vlmparse/converter_with_server.py +5 -4
  10. vlmparse/registries.py +2 -4
  11. vlmparse/servers/docker_server.py +1 -1
  12. vlmparse/servers/utils.py +3 -2
  13. vlmparse/utils.py +2 -2
  14. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/METADATA +17 -3
  15. vlmparse-0.1.5.dist-info/RECORD +36 -0
  16. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  17. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  18. vlmparse/benchpdf2md/create_dataset.py +0 -60
  19. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  20. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  21. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  22. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  23. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  24. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  25. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  26. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  27. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  28. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  29. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  30. vlmparse/benchpdf2md/utils.py +0 -56
  31. vlmparse-0.1.3.dist-info/RECORD +0 -50
  32. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/WHEEL +0 -0
  33. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/entry_points.txt +0 -0
  34. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/licenses/LICENSE +0 -0
  35. {vlmparse-0.1.3.dist-info → vlmparse-0.1.5.dist-info}/top_level.txt +0 -0
--- a/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import random
-import re
-import string
-import time
-import unittest
-
-
-class RepeatDetector:
-    def __init__(self, max_ngram_size: int = 10):
-        self.max_ngram_size = max_ngram_size
-        self.data = ""
-
-    def add_letters(self, new_str: str):
-        self.data += new_str
-
-    def ngram_repeats(self) -> list[int]:
-        result = [0] * self.max_ngram_size
-
-        if not self.data:
-            return result
-
-        # Normalize all whitespace to single spaces
-        text = re.sub(r"\s+", " ", self.data)
-
-        # For each n-gram size
-        for size in range(1, self.max_ngram_size + 1):
-            if len(text) < size:
-                continue
-
-            # Get the last n-gram
-            target = text[-size:]
-
-            # Count backwards from the end to find repeats
-            count = 0
-            pos = len(text) - size  # Start position for previous n-gram
-
-            while pos >= 0:
-                if text[pos : pos + size] == target:
-                    count += 1
-                    pos -= size  # Move back by the size of the n-gram
-                else:
-                    break
-
-            result[size - 1] = count
-
-        return result
-
-
-class RepeatDetectorTest(unittest.TestCase):
-    def test_basicTest1(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("a")
-        self.assertEqual(d.ngram_repeats(), [1, 0, 0])
-
-    def test_basicTest2(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("abab")
-        self.assertEqual(d.ngram_repeats(), [1, 2, 1])
-
-    def test_longer_sequence(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("aabaabaa")
-        self.assertEqual(d.ngram_repeats(), [2, 1, 2])
-
-    def test_no_repeats(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("abc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1])
-
-    def test_empty_data(self):
-        d = RepeatDetector(max_ngram_size=3)
-        self.assertEqual(d.ngram_repeats(), [0, 0, 0])
-
-    def test_max_ngram_greater_than_data_length(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("abc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1, 0, 0])
-
-    def test_large_single_char(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("a" * 10000)
-        self.assertEqual(d.ngram_repeats(), [10000, 5000, 3333, 2500, 2000])
-
-    def test_repeating_pattern(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("abcabcabcabc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 4, 1, 1])
-
-    def test_mixed_characters(self):
-        d = RepeatDetector(max_ngram_size=4)
-        d.add_letters("abcdabcabcdabc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1, 1])
-
-    def test_palindrome(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("racecar")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1, 1, 1])
-
-    def test_repeats_not_at_end(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("abcabcxyz")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1])
-
-    def test_long_repeat_at_end(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("abcabcabcabcabcabcabcabcabcabc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 10, 1, 1])
-
-    def test_large_repeating_pattern(self):
-        d = RepeatDetector(max_ngram_size=4)
-        pattern = "abcd"
-        repeat_count = 1000
-        d.add_letters(pattern * repeat_count)
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1, repeat_count])
-
-    def test_unicode_characters(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("αβγαβγ")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 2])
-
-    def test_random_data(self):
-        random.seed(42)
-        d = RepeatDetector(max_ngram_size=5)
-        data = "".join(random.choices(string.ascii_letters, k=10000))
-        d.add_letters(data)
-        counts = d.ngram_repeats()
-        for count in counts:
-            self.assertTrue(0 <= count <= len(data))
-
-    def test_special_characters(self):
-        d = RepeatDetector(max_ngram_size=4)
-        d.add_letters("@@##@@##")
-        self.assertEqual(d.ngram_repeats(), [2, 1, 1, 2])
-
-    def test_incremental_addition(self):
-        d = RepeatDetector(max_ngram_size=3)
-        d.add_letters("abc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1])
-        d.add_letters("abc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 2])
-        d.add_letters("abc")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 3])
-
-    def test_long_non_repeating_sequence(self):
-        d = RepeatDetector(max_ngram_size=5)
-        d.add_letters("abcdefghijklmnopqrstuvwxyz")
-        self.assertEqual(d.ngram_repeats(), [1, 1, 1, 1, 1])
-
-    def test_alternating_characters(self):
-        d = RepeatDetector(max_ngram_size=4)
-        d.add_letters("ababababab")
-        self.assertEqual(d.ngram_repeats(), [1, 5, 1, 2])
-
-
-class BenchmarkRepeatDetect(unittest.TestCase):
-    def testLargeRandom(self):
-        all_data = []
-
-        for _ in range(1000):
-            all_data.append("".join(random.choices("a", k=10000)))
-
-        start = time.perf_counter()
-
-        for data in all_data:
-            d = RepeatDetector(max_ngram_size=20)
-            d.add_letters(data)
-            print(d.ngram_repeats())
-
-        end = time.perf_counter()
-
-        print(f"testLargeRandom took {end-start:0.0001f} seconds")
-
-
-if __name__ == "__main__":
-    unittest.main()
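For context on what was dropped: RepeatDetector counts how many times the trailing n-gram of the accumulated text repeats back-to-back, which olmOCR-style benchmarks use to catch degenerate repetition loops in model output. A minimal usage sketch against the removed 0.1.3 module, assuming a caller streams decoded text and picks its own cutoff (REPEAT_THRESHOLD below is an illustrative value, not from vlmparse):

# Illustrative caller, not part of the package; runs against the removed
# 0.1.3 module. REPEAT_THRESHOLD is an assumed cutoff, not a vlmparse default.
from vlmparse.benchpdf2md.olmocrbench.repeatdetect import RepeatDetector

REPEAT_THRESHOLD = 30

detector = RepeatDetector(max_ngram_size=10)
for chunk in ("lorem ipsum ", "ipsum ipsum ", "ipsum ipsum "):
    detector.add_letters(chunk)
    # ngram_repeats()[n - 1] is the consecutive repeat count of the trailing n-gram
    if max(detector.ngram_repeats()) >= REPEAT_THRESHOLD:
        print("repetition loop detected; stop decoding")
        break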
--- a/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py
+++ /dev/null
@@ -1,256 +0,0 @@
-import datetime
-import json
-import os
-import time
-from dataclasses import asdict
-from pathlib import Path
-
-import fire
-import pandas as pd
-from huggingface_hub import snapshot_download
-from joblib import Parallel, delayed
-from loguru import logger
-from tqdm import tqdm
-
-from vlmparse.benchpdf2md.utils import bootstrap_and_format_results
-from vlmparse.data_model.document import Document
-from vlmparse.registries import converter_config_registry, docker_config_registry
-
-IN_FOLDER = Path(
-    "/mnt/projects/rag-pretraitement/data/docparser/benchmarks/select_difficult_pdf/validated_tests/tiny_test_tests_first_batch/tests/tiny_text_long_text/"
-)
-
-OUT_FOLDER = Path(
-    os.getenv(
-        "OUT_FOLDER_FR_BENCHMARK",
-        "/mnt/projects/rag-pretraitement/data/docparser/benchmarks/fr-bench-pdf2md-preds",
-    )
-)
-IN_FOLDER = Path(
-    "/data/data/docparser/benchmarks/select_difficult_pdf/validated_tests/tiny_test_tests_first_batch/tests/tiny_text_long_text/"
-)
-
-OUT_FOLDER = Path(
-    os.getenv(
-        "OUT_FOLDER_FR_BENCHMARK",
-        "/data/data/docparser/benchmarks/fr-bench-pdf2md-preds",
-    )
-)
-
-
-def process_and_run_benchmark(
-    model="gemini-2.5-flash-lite",
-    uri: str | None = None,
-    retry: str | None = None,
-    concurrency: int = 1,
-    debug: bool = False,
-    gpu: int = 1,
-    regenerate: bool = False,
-    in_folder: Path | str = "allenai/olmOCR-bench",
-    save_folder: Path | str = OUT_FOLDER,
-    retrylast: bool = False,
-    dry_run: bool = True,
-    filter_type: str | list[str] | None = None,
-):
-    save_folder = Path(save_folder)
-
-    # if not in_folder.exists():
-    #     raise ValueError(f"Input folder does not exist: {in_folder}")
-    # if not in_folder.is_dir():
-    #     raise ValueError(f"Input path is not a directory: {in_folder}")
-
-    # ds = create_dataset(in_folder)
-
-    if in_folder == "allenai/olmOCR-bench":
-        local_folder_path = snapshot_download(
-            repo_id=in_folder,
-            repo_type="dataset",  # Use "model" or "space" for other types
-        )
-        in_folder = local_folder_path
-    logger.info(f"In folder: {in_folder}")
-
-    pdfs = list(Path(in_folder).rglob("*.pdf"))
-
-    try:
-        if retrylast:
-            retry = save_folder / model
-            previous_runs = sorted(os.listdir(retry))
-            if len(previous_runs) > 0:
-                retry = retry / previous_runs[-1]
-            else:
-                raise ValueError(
-                    "No previous runs found, do not use the retrylast flag"
-                )
-        files = list(sorted(set(pdfs)))
-        if retry is None or regenerate:
-            files = list(sorted(set(pdfs)))
-            logger.info(f"Number of files to convert: {len(files)}")
-            if retry is not None:
-                already_processed = [
-                    f.removesuffix(".zip") for f in os.listdir(retry / "results")
-                ]
-                files = [
-                    f
-                    for f in files
-                    if Path(f).name.removesuffix(".pdf") not in already_processed
-                ]
-
-                logger.info(f"Number of files after filtering: {len(files)}")
-
-            if len(files) == 0:
-                raise ValueError(
-                    f"No PDF files found in the input folder: {in_folder}\nDataset paths: {pdfs[:5]}"
-                )
-
-            save_folder = (
-                (
-                    save_folder
-                    / model
-                    / (datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss"))
-                )
-                if not retry
-                else retry
-            )
-
-            if uri is None:
-                docker_config = docker_config_registry.get(model)
-                if docker_config is not None:
-                    docker_config.gpu_device_ids = [str(gpu)]
-                    server = docker_config.get_server(auto_stop=True)
-                    server.start()
-                    client = docker_config.get_client()
-                else:
-                    client = converter_config_registry.get(model).get_client()
-            else:
-                client = converter_config_registry.get(model, uri=uri).get_client()
-            client.num_concurrent_pages = concurrency if not debug else 1
-            client.num_concurrent_files = concurrency if not debug else 1
-            client.debug = debug
-
-            if dry_run:
-                client.save_folder = None
-                logger.info("Dry run, converting first 3 files")
-                client.batch(files[:3])
-
-            client.save_folder = str(save_folder)
-            tic = time.perf_counter()
-            client.batch(files)
-            total_time = time.perf_counter() - tic
-            logger.info(
-                f"Time taken to convert {len(files)} files: {total_time:.2f} seconds"
-            )
-
-        else:
-            save_folder = Path(retry)
-            total_time = None
-
-        tests_files = list(Path(in_folder).rglob("**/*.jsonl"))
-        if filter_type is not None:
-            tests_files = [tf for tf in tests_files if filter_type in tf.name]
-
-        df = run_olmocr_benchmark(tests_files, out_folder=save_folder / "results")
-
-        logger.info(
-            f"Number of pages: {df['pdf_path'].unique().shape[0]}, Number of tests: {len(df)}"
-        )
-        if "type" in df.columns:
-            by_type_df = bootstrap_and_format_results(df, "type", "result")
-            logger.info(f"By type:\n{by_type_df}")
-
-        import pdb
-
-        pdb.set_trace()
-
-        if "tests_name" in df.columns:
-            by_tests_name_df = bootstrap_and_format_results(df, "tests_name", "result")
-            logger.info(f"By tests_name:\n{by_tests_name_df}")
-
-        logger.info("average result:")
-        avg = df.loc[df.type != "baseline"]["result"].mean()
-        logger.info(avg)
-
-        if not debug:
-            save_folder_test_results = (
-                save_folder
-                / "test_results"
-                / datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")
-            )
-            save_folder_test_results.mkdir(parents=True, exist_ok=True)
-            df.to_parquet(save_folder_test_results / "test_results.parquet")
-            by_type_df.to_excel(save_folder_test_results / "by_type.xlsx")
-
-            with open(save_folder_test_results / "metrics.json", "w") as f:
-                json.dump(
-                    {
-                        "total_time": total_time,
-                        "num_pages": len(files),
-                        "num_tests": len(df),
-                        "avg_result": avg,
-                        "avg_doc_latency": df["doc_latency"].mean(),
-                        "avg_page_latency": df["page_latency"].mean(),
-                        "avg_time_per_page": total_time / len(files)
-                        if total_time is not None
-                        else None,
-                    },
-                    f,
-                )
-
-    except Exception:
-        raise
-
-
-def run_olmocr_benchmark(
-    tests_files: list[Path],
-    out_folder: Path,
-    num_workers: int = 64,
-):
-    from vlmparse.benchpdf2md.olmocrbench.tests import load_tests
-
-    files = list(out_folder.rglob("*.zip"))
-    map_files = {path.stem: path for path in files}
-    tests = [test for tf in tests_files for test in load_tests(tf)]
-
-    def worker(test):
-        key = Path(test.pdf).stem
-
-        _dict = {
-            "test_id": test.id,
-        } | asdict(test)
-        if key not in map_files:
-            logger.warning(f"No zip document found for {test.pdf}")
-            _dict["result"] = False
-            _dict["explanation"] = f"No zip document found for {test.pdf}"
-
-        else:
-            file_path = map_files[key]
-
-            doc = Document.from_zip(file_path)
-            md_text = doc.text
-            tests_name = Path(doc.file_path).parent.name
-
-            passed, explanation = test.run(md_text)
-            _dict["result"] = passed
-            _dict["explanation"] = explanation
-            _dict["tests_name"] = tests_name
-            _dict["pdf_path"] = str(doc.file_path)
-            _dict["doc_path"] = str(file_path)
-            _dict["doc_latency"] = doc.latency
-            _dict["page_latency"] = doc.pages[0].latency
-
-        return _dict
-
-    results = Parallel(n_jobs=num_workers)(
-        delayed(worker)(test) for test in tqdm(tests)
-    )
-
-    df = pd.DataFrame(results)
-
-    return df
-
-
-def main():
-    fire.Fire(process_and_run_benchmark)
-
-
-if __name__ == "__main__":
-    main()
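The removed runner wired process_and_run_benchmark to fire.Fire, so every keyword argument above doubled as a CLI flag. A minimal sketch of how a 0.1.3 run could have been launched from Python; the save folder and concurrency here are purely illustrative assumptions, while the model and dataset values are the defaults from the removed signature:

# Illustrative invocation of the removed 0.1.3 entry point; the output
# folder and concurrency are assumptions, the rest are the source defaults.
from vlmparse.benchpdf2md.olmocrbench.run_olmocr_bench import (
    process_and_run_benchmark,
)

process_and_run_benchmark(
    model="gemini-2.5-flash-lite",      # default model in the removed source
    in_folder="allenai/olmOCR-bench",   # triggers snapshot_download of the bench
    save_folder="/tmp/fr-bench-preds",  # assumed scratch location
    concurrency=4,                      # assumed; default is 1
    dry_run=False,                      # default True converts 3 files first
)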