anymd 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/scripts/batch-ocr.py +168 -65
- package/scripts/benchmark-ocr.py +266 -0
- package/src/md-enhancer.ts +14 -2
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "anymd",
|
|
3
|
-
"version": "0.0.10",
|
|
3
|
+
"version": "0.0.12",
|
|
4
4
|
"description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"markdown",
|
|
@@ -46,9 +46,13 @@
|
|
|
46
46
|
"dependencies": {
|
|
47
47
|
"markdownlint": "^0.40.0",
|
|
48
48
|
"p-map": "^7.0.4",
|
|
49
|
+
"turndown": "^7.2.2",
|
|
49
50
|
"yoctocolors": "^2.1.2",
|
|
50
51
|
"zod": "^4.3.6"
|
|
51
52
|
},
|
|
53
|
+
"devDependencies": {
|
|
54
|
+
"@types/turndown": "^5.0.6"
|
|
55
|
+
},
|
|
52
56
|
"engines": {
|
|
53
57
|
"bun": ">=1.0.0"
|
|
54
58
|
},
|
package/scripts/batch-ocr.py
CHANGED
|
@@ -2,12 +2,13 @@ import gc
|
|
|
2
2
|
import json
|
|
3
3
|
import sys
|
|
4
4
|
import time
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
8
9
|
import pypdfium2 as pdfium
|
|
9
10
|
import pypdfium2.raw as pdfium_c
|
|
10
|
-
from PIL import Image
|
|
11
|
+
from PIL import Image, ImageChops, ImageOps
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def _get_arg(flag: str, default: str) -> str:
|
|
@@ -24,9 +25,19 @@ OUTPUT_BASE = Path(_get_arg('--output-base', 'output/ocr-raw'))
|
|
|
24
25
|
STATUS_FILE = Path(_get_arg('--status-file', 'output/ocr-progress.json'))
|
|
25
26
|
LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
|
|
26
27
|
|
|
27
|
-
MODEL_ID = 'mlx-community/chandra-
|
|
28
|
+
MODEL_ID = 'mlx-community/chandra-4bit'
|
|
28
29
|
IMAGE_DPI = 150
|
|
29
30
|
MAX_TOKENS = 8192
|
|
31
|
+
CROP_PADDING = 20
|
|
32
|
+
CROP_THRESHOLD = 10
|
|
33
|
+
VISION_MAX_PIXELS = 300_000
|
|
34
|
+
|
|
35
|
+
DIACRITICS = frozenset(
|
|
36
|
+
'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
|
|
37
|
+
'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
|
|
38
|
+
)
|
|
39
|
+
MIN_DIACRITICS_RATIO = 0.15
|
|
40
|
+
MIN_CHARS_FOR_PAGE = 50
|
|
30
41
|
|
|
31
42
|
|
|
32
43
|
def _to_output_name(pdf_path: str) -> str:
|
|
@@ -37,12 +48,7 @@ def _to_output_name(pdf_path: str) -> str:
|
|
|
37
48
|
return str(rel.with_suffix('')).replace('/', '--')
|
|
38
49
|
|
|
39
50
|
|
|
40
|
-
OCR_PROMPT = """
|
|
41
|
-
OCR this image to markdown.
|
|
42
|
-
Output clean, readable markdown preserving the document structure.
|
|
43
|
-
Use proper headings, lists, tables, and formatting.
|
|
44
|
-
For math expressions, use LaTeX notation with $ delimiters.
|
|
45
|
-
""".strip()
|
|
51
|
+
OCR_PROMPT = 'OCR this image to markdown. Preserve structure: headings, lists, tables. LaTeX $ for math.'
|
|
46
52
|
|
|
47
53
|
|
|
48
54
|
def _log(msg: str) -> None:
|
|
@@ -109,6 +115,42 @@ def _flatten_page(page: object) -> None:
|
|
|
109
115
|
NATIVE_TEXT_THRESHOLD = 50
|
|
110
116
|
|
|
111
117
|
|
|
118
|
+
def _crop_margins(img: Image.Image) -> Image.Image:
|
|
119
|
+
bg = Image.new('RGB', img.size, (255, 255, 255))
|
|
120
|
+
diff = ImageChops.difference(img, bg)
|
|
121
|
+
gray = ImageOps.grayscale(diff)
|
|
122
|
+
bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
|
|
123
|
+
bg.close()
|
|
124
|
+
diff.close()
|
|
125
|
+
gray.close()
|
|
126
|
+
if not bbox:
|
|
127
|
+
return img
|
|
128
|
+
x0 = max(0, bbox[0] - CROP_PADDING)
|
|
129
|
+
y0 = max(0, bbox[1] - CROP_PADDING)
|
|
130
|
+
x1 = min(img.size[0], bbox[2] + CROP_PADDING)
|
|
131
|
+
y1 = min(img.size[1], bbox[3] + CROP_PADDING)
|
|
132
|
+
cropped = img.crop((x0, y0, x1, y1))
|
|
133
|
+
img.close()
|
|
134
|
+
return cropped
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _tesseract_ocr(image: Image.Image) -> str:
|
|
138
|
+
import pytesseract # noqa: PLC0415
|
|
139
|
+
|
|
140
|
+
return pytesseract.image_to_string(image, lang='vie')
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _tesseract_quality_ok(text: str) -> bool:
|
|
144
|
+
alpha_count = sum(1 for c in text if c.isalpha())
|
|
145
|
+
if alpha_count < MIN_CHARS_FOR_PAGE:
|
|
146
|
+
return False
|
|
147
|
+
diacritics_count = 0
|
|
148
|
+
for ch in text:
|
|
149
|
+
if ch in DIACRITICS:
|
|
150
|
+
diacritics_count += 1
|
|
151
|
+
return diacritics_count / alpha_count >= MIN_DIACRITICS_RATIO
|
|
152
|
+
|
|
153
|
+
|
|
112
154
|
def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image | None, str]:
|
|
113
155
|
page_obj = doc[page_idx]
|
|
114
156
|
text = page_obj.get_textpage().get_text_range()
|
|
@@ -118,27 +160,25 @@ def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image |
|
|
|
118
160
|
return None, text.strip()
|
|
119
161
|
|
|
120
162
|
min_dim = min(page_obj.get_width(), page_obj.get_height())
|
|
121
|
-
scale_dpi = max((
|
|
163
|
+
scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
|
|
122
164
|
_flatten_page(page_obj)
|
|
123
165
|
page_obj = doc[page_idx]
|
|
124
166
|
pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
|
|
125
|
-
return pil_image, ''
|
|
167
|
+
return _crop_margins(pil_image), ''
|
|
126
168
|
|
|
127
169
|
|
|
128
|
-
def _ocr_page(
|
|
170
|
+
def _chandra_ocr(
|
|
129
171
|
model: object,
|
|
130
172
|
processor: object,
|
|
131
|
-
|
|
173
|
+
formatted_prompt: str,
|
|
132
174
|
image: Image.Image,
|
|
133
175
|
) -> str:
|
|
134
176
|
from mlx_vlm import generate # noqa: PLC0415
|
|
135
|
-
from mlx_vlm.prompt_utils import apply_chat_template # noqa: PLC0415
|
|
136
177
|
|
|
137
|
-
formatted = apply_chat_template(processor, config, OCR_PROMPT, num_images=1)
|
|
138
178
|
result = generate(
|
|
139
179
|
model, # type: ignore[arg-type]
|
|
140
180
|
processor, # type: ignore[arg-type]
|
|
141
|
-
|
|
181
|
+
formatted_prompt, # type: ignore[arg-type]
|
|
142
182
|
image, # type: ignore[arg-type]
|
|
143
183
|
max_tokens=MAX_TOKENS,
|
|
144
184
|
temperature=0.0,
|
|
@@ -147,11 +187,44 @@ def _ocr_page(
|
|
|
147
187
|
return result.text
|
|
148
188
|
|
|
149
189
|
|
|
150
|
-
|
|
190
|
+
@dataclass
|
|
191
|
+
class ChandraState:
|
|
192
|
+
model: object = None
|
|
193
|
+
processor: object = None
|
|
194
|
+
formatted_prompt: str = ''
|
|
195
|
+
loaded: bool = False
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _load_chandra(state: ChandraState) -> None:
|
|
199
|
+
if state.loaded:
|
|
200
|
+
return
|
|
201
|
+
_log(f'Loading MLX model {MODEL_ID}...')
|
|
202
|
+
t0 = time.time()
|
|
203
|
+
from mlx_vlm import load # noqa: PLC0415
|
|
204
|
+
from mlx_vlm.prompt_utils import apply_chat_template # noqa: PLC0415
|
|
205
|
+
from mlx_vlm.utils import load_config # noqa: PLC0415
|
|
206
|
+
|
|
207
|
+
state.model, state.processor = load(MODEL_ID)
|
|
208
|
+
config = load_config(MODEL_ID)
|
|
209
|
+
state.formatted_prompt = apply_chat_template(state.processor, config, OCR_PROMPT, num_images=1)
|
|
210
|
+
ip = state.processor.image_processor
|
|
211
|
+
ip.max_pixels = VISION_MAX_PIXELS
|
|
212
|
+
ip.min_pixels = VISION_MAX_PIXELS // 4
|
|
213
|
+
state.loaded = True
|
|
214
|
+
_log(f'Model loaded in {time.time() - t0:.0f}s')
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
|
|
218
|
+
class PageCounts:
|
|
219
|
+
tess: int = 0
|
|
220
|
+
vlm: int = 0
|
|
221
|
+
txt: int = 0
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _ocr_one_file( # noqa: PLR0913, PLR0915, PLR0917
|
|
151
225
|
pdf: str,
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
config: dict[str, object],
|
|
226
|
+
chandra: ChandraState,
|
|
227
|
+
counts: PageCounts,
|
|
155
228
|
done_count: int,
|
|
156
229
|
idx: int,
|
|
157
230
|
pending_len: int,
|
|
@@ -164,52 +237,76 @@ def _ocr_one_file( # noqa: PLR0913, PLR0917
|
|
|
164
237
|
unique_name = _to_output_name(pdf)
|
|
165
238
|
display_name = Path(pdf).stem
|
|
166
239
|
file_start = time.time()
|
|
240
|
+
md_path = OUTPUT_BASE / f'{unique_name}.md'
|
|
241
|
+
tmp_path = OUTPUT_BASE / f'{unique_name}.md.tmp'
|
|
167
242
|
doc = pdfium.PdfDocument(pdf)
|
|
168
243
|
try:
|
|
169
244
|
doc.init_forms()
|
|
170
245
|
num_pages = len(doc)
|
|
171
246
|
_log(f'[{idx}/{pending_len}] ({done_count + idx}/{total}) OCR {display_name} ({num_pages}p)')
|
|
172
|
-
all_markdown: list[str] = []
|
|
173
|
-
|
|
174
|
-
for p_idx in range(num_pages):
|
|
175
|
-
_save_progress(
|
|
176
|
-
Progress(
|
|
177
|
-
done=done_count + idx - 1,
|
|
178
|
-
total=total,
|
|
179
|
-
errors=errors,
|
|
180
|
-
current_file=display_name,
|
|
181
|
-
current_page=f'{p_idx + 1}/{num_pages}',
|
|
182
|
-
current_pages_total=num_pages,
|
|
183
|
-
current_file_started=file_start,
|
|
184
|
-
elapsed=time.time() - pipeline_start,
|
|
185
|
-
avg_per_file=avg,
|
|
186
|
-
recent_files=recent_files,
|
|
187
|
-
)
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
page_t = time.time()
|
|
191
|
-
image, native_text = _render_page(doc, p_idx)
|
|
192
|
-
|
|
193
|
-
if image is not None:
|
|
194
|
-
md = _ocr_page(model, processor, config, image)
|
|
195
|
-
image.close()
|
|
196
|
-
tag = 'ocr'
|
|
197
|
-
else:
|
|
198
|
-
md = native_text
|
|
199
|
-
tag = 'txt'
|
|
200
247
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
if p_idx % 5 == 0:
|
|
205
|
-
_free_memory()
|
|
248
|
+
executor = ThreadPoolExecutor(max_workers=1)
|
|
249
|
+
with tmp_path.open('w', encoding='utf-8') as out:
|
|
250
|
+
next_future = executor.submit(_render_page, doc, 0)
|
|
206
251
|
|
|
207
|
-
|
|
208
|
-
|
|
252
|
+
for p_idx in range(num_pages):
|
|
253
|
+
_save_progress(
|
|
254
|
+
Progress(
|
|
255
|
+
done=done_count + idx - 1,
|
|
256
|
+
total=total,
|
|
257
|
+
errors=errors,
|
|
258
|
+
current_file=display_name,
|
|
259
|
+
current_page=f'{p_idx + 1}/{num_pages}',
|
|
260
|
+
current_pages_total=num_pages,
|
|
261
|
+
current_file_started=file_start,
|
|
262
|
+
elapsed=time.time() - pipeline_start,
|
|
263
|
+
avg_per_file=avg,
|
|
264
|
+
recent_files=recent_files,
|
|
265
|
+
)
|
|
266
|
+
)
|
|
209
267
|
|
|
210
|
-
|
|
268
|
+
page_t = time.time()
|
|
269
|
+
image, native_text = next_future.result()
|
|
270
|
+
|
|
271
|
+
if p_idx + 1 < num_pages:
|
|
272
|
+
next_future = executor.submit(_render_page, doc, p_idx + 1)
|
|
273
|
+
|
|
274
|
+
if image is not None:
|
|
275
|
+
tess_text = _tesseract_ocr(image)
|
|
276
|
+
if _tesseract_quality_ok(tess_text):
|
|
277
|
+
md = tess_text
|
|
278
|
+
tag = 'tess'
|
|
279
|
+
counts.tess += 1
|
|
280
|
+
else:
|
|
281
|
+
_load_chandra(chandra)
|
|
282
|
+
md = _chandra_ocr(chandra.model, chandra.processor, chandra.formatted_prompt, image)
|
|
283
|
+
tag = 'vlm'
|
|
284
|
+
counts.vlm += 1
|
|
285
|
+
image.close()
|
|
286
|
+
else:
|
|
287
|
+
md = native_text
|
|
288
|
+
tag = 'txt'
|
|
289
|
+
counts.txt += 1
|
|
290
|
+
|
|
291
|
+
if p_idx > 0:
|
|
292
|
+
out.write('\n\n')
|
|
293
|
+
out.write(md)
|
|
294
|
+
out.flush()
|
|
295
|
+
_log(f' p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')
|
|
296
|
+
|
|
297
|
+
if p_idx % 10 == 9: # noqa: PLR2004
|
|
298
|
+
_free_memory()
|
|
299
|
+
|
|
300
|
+
executor.shutdown(wait=False)
|
|
301
|
+
|
|
302
|
+
tmp_path.rename(md_path)
|
|
211
303
|
_free_memory()
|
|
212
|
-
return num_pages
|
|
304
|
+
return num_pages # noqa: TRY300
|
|
305
|
+
except Exception:
|
|
306
|
+
for p in [tmp_path, md_path]:
|
|
307
|
+
if p.exists():
|
|
308
|
+
p.unlink()
|
|
309
|
+
raise
|
|
213
310
|
finally:
|
|
214
311
|
doc.close()
|
|
215
312
|
|
|
@@ -238,14 +335,9 @@ def _run_ocr() -> None: # noqa: PLR0914
|
|
|
238
335
|
_save_progress(Progress(total, total, 0, '-', '-', 0, 0, 0, 0, []))
|
|
239
336
|
return
|
|
240
337
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
from mlx_vlm.utils import load_config # noqa: PLC0415
|
|
245
|
-
|
|
246
|
-
model, processor = load(MODEL_ID)
|
|
247
|
-
config = load_config(MODEL_ID)
|
|
248
|
-
_log(f'Model loaded in {time.time() - t0:.0f}s')
|
|
338
|
+
chandra = ChandraState()
|
|
339
|
+
counts = PageCounts()
|
|
340
|
+
_log('Hybrid OCR: Tesseract fast-pass, chandra VLM fallback')
|
|
249
341
|
|
|
250
342
|
errors = 0
|
|
251
343
|
file_times: list[float] = []
|
|
@@ -263,7 +355,17 @@ def _run_ocr() -> None: # noqa: PLR0914
|
|
|
263
355
|
|
|
264
356
|
try:
|
|
265
357
|
num_pages = _ocr_one_file(
|
|
266
|
-
pdf,
|
|
358
|
+
pdf,
|
|
359
|
+
chandra,
|
|
360
|
+
counts,
|
|
361
|
+
done_count,
|
|
362
|
+
i,
|
|
363
|
+
len(pending),
|
|
364
|
+
total,
|
|
365
|
+
errors,
|
|
366
|
+
avg,
|
|
367
|
+
pipeline_start,
|
|
368
|
+
recent_files,
|
|
267
369
|
)
|
|
268
370
|
elapsed = time.time() - t1
|
|
269
371
|
file_times.append(elapsed)
|
|
@@ -297,6 +399,7 @@ def _run_ocr() -> None: # noqa: PLR0914
|
|
|
297
399
|
|
|
298
400
|
total_time = _format_duration(time.time() - pipeline_start)
|
|
299
401
|
_log(f'OCR complete. Done: {done_count + len(pending)}, Errors: {errors}, Time: {total_time}')
|
|
402
|
+
_log(f'Engine stats: tess={counts.tess} vlm={counts.vlm} txt={counts.txt}')
|
|
300
403
|
|
|
301
404
|
|
|
302
405
|
def main() -> None:
|
|
package/scripts/benchmark-ocr.py
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
|
8
|
+
|
|
9
|
+
import pypdfium2 as pdfium
|
|
10
|
+
import pypdfium2.raw as pdfium_c
|
|
11
|
+
from PIL import Image, ImageChops, ImageOps
|
|
12
|
+
|
|
13
|
+
CROP_PADDING = 20
|
|
14
|
+
CROP_THRESHOLD = 10
|
|
15
|
+
IMAGE_DPI = 150
|
|
16
|
+
|
|
17
|
+
DIACRITICS = frozenset(
|
|
18
|
+
'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
|
|
19
|
+
'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _get_arg(flag: str, default: str) -> str:
|
|
24
|
+
try:
|
|
25
|
+
idx = sys.argv.index(flag)
|
|
26
|
+
return sys.argv[idx + 1]
|
|
27
|
+
except (ValueError, IndexError):
|
|
28
|
+
return default
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
PDF_PATH = _get_arg('--pdf', '')
|
|
32
|
+
PAGES_STR = _get_arg('--pages', '0,1,2')
|
|
33
|
+
ENGINES_STR = _get_arg('--engines', 'tesseract,paddleocr,easyocr,surya,vietocr')
|
|
34
|
+
|
|
35
|
+
if not PDF_PATH:
|
|
36
|
+
print(
|
|
37
|
+
'Usage: python benchmark-ocr.py --pdf <path> [--pages 0,1,2] [--engines tesseract,paddleocr,easyocr,surya,vietocr]'
|
|
38
|
+
)
|
|
39
|
+
sys.exit(1)
|
|
40
|
+
|
|
41
|
+
PAGE_INDICES = [int(p.strip()) for p in PAGES_STR.split(',')]
|
|
42
|
+
ENGINES = [e.strip() for e in ENGINES_STR.split(',')]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _crop_margins(img: Image.Image) -> Image.Image:
|
|
46
|
+
bg = Image.new('RGB', img.size, (255, 255, 255))
|
|
47
|
+
diff = ImageChops.difference(img, bg)
|
|
48
|
+
gray = ImageOps.grayscale(diff)
|
|
49
|
+
bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
|
|
50
|
+
bg.close()
|
|
51
|
+
diff.close()
|
|
52
|
+
gray.close()
|
|
53
|
+
if not bbox:
|
|
54
|
+
return img
|
|
55
|
+
x0 = max(0, bbox[0] - CROP_PADDING)
|
|
56
|
+
y0 = max(0, bbox[1] - CROP_PADDING)
|
|
57
|
+
x1 = min(img.size[0], bbox[2] + CROP_PADDING)
|
|
58
|
+
y1 = min(img.size[1], bbox[3] + CROP_PADDING)
|
|
59
|
+
return img.crop((x0, y0, x1, y1))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def render_page(doc: pdfium.PdfDocument, page_idx: int) -> Image.Image:
|
|
63
|
+
page_obj = doc[page_idx]
|
|
64
|
+
pdfium_c.FPDFPage_Flatten(page_obj, pdfium_c.FLAT_NORMALDISPLAY)
|
|
65
|
+
page_obj = doc[page_idx]
|
|
66
|
+
min_dim = min(page_obj.get_width(), page_obj.get_height())
|
|
67
|
+
scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
|
|
68
|
+
pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
|
|
69
|
+
return _crop_margins(pil_image)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _count_vietnamese_diacritics(text: str) -> int:
|
|
73
|
+
count = 0
|
|
74
|
+
for ch in text:
|
|
75
|
+
if ch in DIACRITICS:
|
|
76
|
+
count += 1
|
|
77
|
+
return count
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def run_tesseract(image: Image.Image, _state: dict) -> str:
|
|
81
|
+
import pytesseract # noqa: PLC0415
|
|
82
|
+
|
|
83
|
+
return pytesseract.image_to_string(image, lang='vie')
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def run_paddleocr(image: Image.Image, _state: dict) -> str:
|
|
87
|
+
import numpy as np # noqa: PLC0415
|
|
88
|
+
|
|
89
|
+
if 'engine' not in _state:
|
|
90
|
+
from paddleocr import PaddleOCR # noqa: PLC0415
|
|
91
|
+
|
|
92
|
+
_state['engine'] = PaddleOCR(lang='vi')
|
|
93
|
+
|
|
94
|
+
img_array = np.array(image)
|
|
95
|
+
lines: list[str] = []
|
|
96
|
+
for page_result in _state['engine'].predict(img_array):
|
|
97
|
+
j = page_result.json
|
|
98
|
+
res = j.get('res', {})
|
|
99
|
+
if 'rec_texts' in res:
|
|
100
|
+
lines.extend(res['rec_texts'])
|
|
101
|
+
return '\n'.join(lines)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def run_easyocr(image: Image.Image, _state: dict) -> str:
|
|
105
|
+
import numpy as np # noqa: PLC0415
|
|
106
|
+
|
|
107
|
+
if 'reader' not in _state:
|
|
108
|
+
import easyocr # noqa: PLC0415
|
|
109
|
+
|
|
110
|
+
_state['reader'] = easyocr.Reader(['vi'], gpu=False)
|
|
111
|
+
|
|
112
|
+
img_array = np.array(image)
|
|
113
|
+
results = _state['reader'].readtext(img_array)
|
|
114
|
+
lines = []
|
|
115
|
+
for _bbox, text, _conf in results:
|
|
116
|
+
lines.append(text)
|
|
117
|
+
return '\n'.join(lines)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def run_surya(image: Image.Image, _state: dict) -> str:
|
|
121
|
+
if 'rec' not in _state:
|
|
122
|
+
from surya.detection import DetectionPredictor # noqa: PLC0415
|
|
123
|
+
from surya.foundation import FoundationPredictor # noqa: PLC0415
|
|
124
|
+
from surya.recognition import RecognitionPredictor # noqa: PLC0415
|
|
125
|
+
|
|
126
|
+
foundation = FoundationPredictor()
|
|
127
|
+
_state['det'] = DetectionPredictor()
|
|
128
|
+
_state['rec'] = RecognitionPredictor(foundation)
|
|
129
|
+
|
|
130
|
+
predictions = _state['rec']([image], det_predictor=_state['det'])
|
|
131
|
+
return '\n'.join(tl.text for tl in predictions[0].text_lines)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def run_vietocr(image: Image.Image, _state: dict) -> str:
|
|
135
|
+
if 'detector' not in _state:
|
|
136
|
+
import torch # noqa: PLC0415
|
|
137
|
+
from vietocr.tool.config import Cfg # noqa: PLC0415
|
|
138
|
+
from vietocr.tool.predictor import Predictor # noqa: PLC0415
|
|
139
|
+
|
|
140
|
+
config = Cfg.load_config_from_name('vgg_transformer')
|
|
141
|
+
config['cnn']['pretrained'] = False
|
|
142
|
+
if torch.backends.mps.is_available():
|
|
143
|
+
config['device'] = 'mps'
|
|
144
|
+
else:
|
|
145
|
+
config['device'] = 'cpu'
|
|
146
|
+
_state['detector'] = Predictor(config)
|
|
147
|
+
|
|
148
|
+
return _state['detector'].predict(image)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
ENGINE_RUNNERS = {
|
|
152
|
+
'tesseract': run_tesseract,
|
|
153
|
+
'paddleocr': run_paddleocr,
|
|
154
|
+
'easyocr': run_easyocr,
|
|
155
|
+
'surya': run_surya,
|
|
156
|
+
'vietocr': run_vietocr,
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def main() -> None: # noqa: C901, PLR0912, PLR0914, PLR0915
|
|
161
|
+
doc = pdfium.PdfDocument(PDF_PATH)
|
|
162
|
+
num_pages = len(doc)
|
|
163
|
+
print(f'PDF: {PDF_PATH} ({num_pages} pages)')
|
|
164
|
+
print(f'Testing pages: {PAGE_INDICES}')
|
|
165
|
+
print(f'Engines: {ENGINES}')
|
|
166
|
+
print('=' * 80)
|
|
167
|
+
|
|
168
|
+
images: dict[int, Image.Image] = {}
|
|
169
|
+
for p_idx in PAGE_INDICES:
|
|
170
|
+
if p_idx >= num_pages:
|
|
171
|
+
print(f' SKIP page {p_idx} (only {num_pages} pages)')
|
|
172
|
+
else:
|
|
173
|
+
t0 = time.time()
|
|
174
|
+
images[p_idx] = render_page(doc, p_idx)
|
|
175
|
+
render_time = time.time() - t0
|
|
176
|
+
img = images[p_idx]
|
|
177
|
+
print(f' Rendered page {p_idx}: {img.size[0]}x{img.size[1]} in {render_time:.1f}s')
|
|
178
|
+
|
|
179
|
+
doc.close()
|
|
180
|
+
print()
|
|
181
|
+
|
|
182
|
+
states: dict[str, dict] = {}
|
|
183
|
+
results: list[dict] = []
|
|
184
|
+
|
|
185
|
+
for engine_name in ENGINES:
|
|
186
|
+
if engine_name not in ENGINE_RUNNERS:
|
|
187
|
+
print(f'Unknown engine: {engine_name}')
|
|
188
|
+
continue
|
|
189
|
+
|
|
190
|
+
states.setdefault(engine_name, {})
|
|
191
|
+
runner = ENGINE_RUNNERS[engine_name]
|
|
192
|
+
print(f'=== {engine_name.upper()} ===')
|
|
193
|
+
|
|
194
|
+
for p_idx in sorted(images.keys()):
|
|
195
|
+
img = images[p_idx]
|
|
196
|
+
print(f' page {p_idx} ... ', end='', flush=True)
|
|
197
|
+
|
|
198
|
+
t0 = time.time()
|
|
199
|
+
try:
|
|
200
|
+
text = runner(img, states[engine_name])
|
|
201
|
+
elapsed = time.time() - t0
|
|
202
|
+
char_count = len(text)
|
|
203
|
+
alpha_count = sum(1 for c in text if c.isalpha())
|
|
204
|
+
diacritics_count = _count_vietnamese_diacritics(text)
|
|
205
|
+
print(f'{elapsed:.1f}s | {char_count} chars | {alpha_count} alpha | {diacritics_count} diacritics')
|
|
206
|
+
|
|
207
|
+
results.append({
|
|
208
|
+
'engine': engine_name,
|
|
209
|
+
'page': p_idx,
|
|
210
|
+
'time_s': round(elapsed, 2),
|
|
211
|
+
'chars': char_count,
|
|
212
|
+
'alpha': alpha_count,
|
|
213
|
+
'diacritics': diacritics_count,
|
|
214
|
+
'text_preview': text[:300].replace('\n', ' '),
|
|
215
|
+
'text_full': text,
|
|
216
|
+
})
|
|
217
|
+
except Exception as run_err: # noqa: BLE001
|
|
218
|
+
elapsed = time.time() - t0
|
|
219
|
+
print(f'ERROR {elapsed:.1f}s: {run_err}')
|
|
220
|
+
results.append({
|
|
221
|
+
'engine': engine_name,
|
|
222
|
+
'page': p_idx,
|
|
223
|
+
'time_s': round(elapsed, 2),
|
|
224
|
+
'error': str(run_err),
|
|
225
|
+
})
|
|
226
|
+
|
|
227
|
+
print()
|
|
228
|
+
print('=' * 80)
|
|
229
|
+
print('SUMMARY')
|
|
230
|
+
print('=' * 80)
|
|
231
|
+
header = f'{"Engine":<12} {"Page":<6} {"Time":>8} {"Chars":>8} {"Alpha":>8} {"Diacritics":>10} Preview'
|
|
232
|
+
print(header)
|
|
233
|
+
print('-' * 100)
|
|
234
|
+
for r in results:
|
|
235
|
+
if 'error' in r:
|
|
236
|
+
print(f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s {"ERROR":>8}')
|
|
237
|
+
else:
|
|
238
|
+
preview = r['text_preview'][:60]
|
|
239
|
+
print(
|
|
240
|
+
f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s'
|
|
241
|
+
f' {r["chars"]:>8} {r["alpha"]:>8} {r["diacritics"]:>10}'
|
|
242
|
+
f' {preview}'
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
out_path = Path(PDF_PATH).parent / 'benchmark-results.json'
|
|
246
|
+
summary = []
|
|
247
|
+
for r in results:
|
|
248
|
+
s = dict(r)
|
|
249
|
+
s.pop('text_full', None)
|
|
250
|
+
summary.append(s)
|
|
251
|
+
out_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + '\n', encoding='utf-8')
|
|
252
|
+
print(f'\nResults saved to {out_path}')
|
|
253
|
+
|
|
254
|
+
print('\n\nFULL TEXT OUTPUT PER ENGINE/PAGE:')
|
|
255
|
+
print('=' * 80)
|
|
256
|
+
for r in results:
|
|
257
|
+
if 'text_full' in r:
|
|
258
|
+
print(f'\n--- [{r["engine"]}] page {r["page"]} ({r["time_s"]}s, {r["chars"]} chars) ---')
|
|
259
|
+
full = r['text_full']
|
|
260
|
+
print(full[:2000])
|
|
261
|
+
if len(full) > 2000: # noqa: PLR2004
|
|
262
|
+
print(f'... (truncated, {len(full)} total chars)')
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
if __name__ == '__main__':
|
|
266
|
+
main()
|
package/src/md-enhancer.ts
CHANGED
|
@@ -2,12 +2,21 @@
|
|
|
2
2
|
import { readFile, writeFile } from 'node:fs/promises'
|
|
3
3
|
import { basename, join } from 'node:path'
|
|
4
4
|
import pMap from 'p-map'
|
|
5
|
+
import TurndownService from 'turndown'
|
|
5
6
|
|
|
6
7
|
import type { CleanResult } from '~/types'
|
|
7
8
|
|
|
8
9
|
import { ensureDir, loadExistingMdFiles, logger } from '~/utils'
|
|
9
10
|
|
|
10
|
-
const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
|
|
11
|
+
const td = new TurndownService({ bulletListMarker: '-', emDelimiter: '*', headingStyle: 'atx' })
|
|
12
|
+
td.remove('style')
|
|
13
|
+
const turndown = td,
|
|
14
|
+
HTML_DETECT_REGEX = /<\/?[a-z][a-z0-9]*[^>]*>/iu,
|
|
15
|
+
stripHtml = (text: string): string => {
|
|
16
|
+
if (!HTML_DETECT_REGEX.test(text)) return text
|
|
17
|
+
return turndown.turndown(text)
|
|
18
|
+
},
|
|
19
|
+
BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
|
|
11
20
|
PHAN_REGEX = /^(?:Phần|PHẦN)\s+/u,
|
|
12
21
|
CHUONG_REGEX = /^(?:Chương|CHƯƠNG)\s+/u,
|
|
13
22
|
MUC_REGEX = /^(?:Mục|MỤC)\s+\d/u,
|
|
@@ -18,6 +27,7 @@ const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
|
|
|
18
27
|
HEADER_TABLE_LINE_REGEX = /^\|.*\|$/u,
|
|
19
28
|
TABLE_SEP_REGEX = /^\|\s*-+/u,
|
|
20
29
|
MULTIPLE_BLANKS_REGEX = /\n{3,}/gu,
|
|
30
|
+
MULTIPLE_SPACES_REGEX = / {2,}/gu,
|
|
21
31
|
// oxlint-disable-next-line no-control-regex
|
|
22
32
|
// eslint-disable-next-line no-control-regex
|
|
23
33
|
CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F]/gu,
|
|
@@ -85,9 +95,11 @@ const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
|
|
|
85
95
|
return enhanced
|
|
86
96
|
},
|
|
87
97
|
enhanceMarkdown = (text: string): string => {
|
|
88
|
-
const enhanced = processLines(text.split('\n'))
|
|
98
|
+
const cleaned = stripHtml(text)
|
|
99
|
+
const enhanced = processLines(cleaned.split('\n'))
|
|
89
100
|
let output = enhanced.join('\n')
|
|
90
101
|
output = output.replace(MULTIPLE_BLANKS_REGEX, '\n\n')
|
|
102
|
+
output = output.replace(MULTIPLE_SPACES_REGEX, ' ')
|
|
91
103
|
output = output.trim()
|
|
92
104
|
return output
|
|
93
105
|
},
|