mathcraft-ocr 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/MANIFEST.in +13 -13
- {mathcraft_ocr-0.2.0/mathcraft_ocr.egg-info → mathcraft_ocr-0.2.2}/PKG-INFO +1 -1
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/__init__.py +1 -1
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/formula_lines.py +73 -1
- mathcraft_ocr-0.2.2/mathcraft_ocr/latex_alignment.py +100 -0
- mathcraft_ocr-0.2.2/mathcraft_ocr/latex_quality.py +96 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/manifests/models.v1.json +57 -57
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/runtime.py +90 -1
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2/mathcraft_ocr.egg-info}/PKG-INFO +1 -1
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr.egg-info/SOURCES.txt +3 -1
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/pyproject.toml +7 -3
- mathcraft_ocr-0.2.2/readme.md +389 -0
- mathcraft_ocr-0.2.0/README.md +0 -230
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/LICENSE +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/README_MATHCRAFT_OCR.md +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/__main__.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/__init__.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/common.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/formula_detector.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/formula_recognizer.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/text_detector.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/adapters/text_recognizer.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/api.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/cache.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/cli.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/debug_blocks.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/doctor.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/downloader.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/error_patterns.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/errors.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/hardware.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/image.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/layout.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/manifest.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/profiles.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/providers.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/results.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/serialization.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr/worker.py +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr.egg-info/dependency_links.txt +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr.egg-info/entry_points.txt +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr.egg-info/requires.txt +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/mathcraft_ocr.egg-info/top_level.txt +0 -0
- {mathcraft_ocr-0.2.0 → mathcraft_ocr-0.2.2}/setup.cfg +0 -0
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
include LICENSE
|
|
1
|
+
include LICENSE
|
|
2
2
|
include README_MATHCRAFT_OCR.md
|
|
3
|
-
include pyproject.toml
|
|
4
|
-
graft mathcraft_ocr
|
|
5
|
-
prune build
|
|
6
|
-
prune dist
|
|
7
|
-
prune docs
|
|
8
|
-
prune release_assets
|
|
9
|
-
prune scripts
|
|
10
|
-
prune src
|
|
11
|
-
prune test
|
|
12
|
-
prune test_pdf
|
|
13
|
-
global-exclude __pycache__
|
|
14
|
-
global-exclude *.py[cod]
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
graft mathcraft_ocr
|
|
5
|
+
prune build
|
|
6
|
+
prune dist
|
|
7
|
+
prune docs
|
|
8
|
+
prune release_assets
|
|
9
|
+
prune scripts
|
|
10
|
+
prune src
|
|
11
|
+
prune test
|
|
12
|
+
prune test_pdf
|
|
13
|
+
global-exclude __pycache__
|
|
14
|
+
global-exclude *.py[cod]
|
|
@@ -7,6 +7,8 @@ import re
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
|
|
10
|
+
from .latex_alignment import align_latex_relation_lines
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
@dataclass(frozen=True)
|
|
12
14
|
class FormulaLineCrop:
|
|
@@ -50,7 +52,7 @@ def compose_aligned_formula(lines: list[str] | tuple[str, ...]) -> str:
|
|
|
50
52
|
cleaned = [line for line in cleaned if line]
|
|
51
53
|
if len(cleaned) <= 1:
|
|
52
54
|
return cleaned[0] if cleaned else ""
|
|
53
|
-
body = " \\\\\n".join(cleaned)
|
|
55
|
+
body = " \\\\\n".join(align_latex_relation_lines(cleaned))
|
|
54
56
|
return "\\begin{aligned}\n" + body + "\n\\end{aligned}"
|
|
55
57
|
|
|
56
58
|
|
|
@@ -81,6 +83,8 @@ def _split_formula_rows(rgb: np.ndarray) -> tuple[FormulaLineCrop, ...]:
|
|
|
81
83
|
if _band_looks_like_formula_row(mask, top, bottom, image_width=width)
|
|
82
84
|
]
|
|
83
85
|
bands = _filter_annotation_bands(mask, bands, image_width=width)
|
|
86
|
+
if _looks_like_compact_fraction_split(mask, bands, image_width=width, image_height=height):
|
|
87
|
+
return ()
|
|
84
88
|
if len(bands) < 2:
|
|
85
89
|
return ()
|
|
86
90
|
|
|
@@ -191,6 +195,51 @@ def _filter_annotation_bands(
|
|
|
191
195
|
return [(top, bottom) for active, top, bottom in stats if active >= min_active]
|
|
192
196
|
|
|
193
197
|
|
|
198
|
+
def _looks_like_compact_fraction_split(
|
|
199
|
+
mask: np.ndarray,
|
|
200
|
+
bands: list[tuple[int, int]],
|
|
201
|
+
*,
|
|
202
|
+
image_width: int,
|
|
203
|
+
image_height: int,
|
|
204
|
+
) -> bool:
|
|
205
|
+
if len(bands) != 2:
|
|
206
|
+
return False
|
|
207
|
+
if image_height < 70 or image_width / max(1, image_height) > 2.65:
|
|
208
|
+
return False
|
|
209
|
+
|
|
210
|
+
first = _band_bounds(mask, *bands[0])
|
|
211
|
+
second = _band_bounds(mask, *bands[1])
|
|
212
|
+
if first is None or second is None:
|
|
213
|
+
return False
|
|
214
|
+
|
|
215
|
+
first_left, _first_top, first_right, first_bottom = first
|
|
216
|
+
second_left, second_top, second_right, _second_bottom = second
|
|
217
|
+
vertical_gap = second_top - first_bottom - 1
|
|
218
|
+
if vertical_gap < 0 or vertical_gap > max(12, int(round(image_height * 0.09))):
|
|
219
|
+
return False
|
|
220
|
+
|
|
221
|
+
first_width = first_right - first_left + 1
|
|
222
|
+
second_width = second_right - second_left + 1
|
|
223
|
+
first_inset = min(first_left, image_width - first_right - 1)
|
|
224
|
+
second_inset = min(second_left, image_width - second_right - 1)
|
|
225
|
+
first_is_centered_fragment = (
|
|
226
|
+
first_width <= image_width * 0.86 and first_inset >= image_width * 0.06
|
|
227
|
+
)
|
|
228
|
+
second_is_wider = second_width >= first_width * 1.15 and second_inset <= image_width * 0.08
|
|
229
|
+
return first_is_centered_fragment and second_is_wider
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _band_bounds(mask: np.ndarray, top: int, bottom: int) -> tuple[int, int, int, int] | None:
|
|
233
|
+
points = np.argwhere(mask[top : bottom + 1, :])
|
|
234
|
+
if points.size == 0:
|
|
235
|
+
return None
|
|
236
|
+
y1 = int(points[:, 0].min()) + top
|
|
237
|
+
y2 = int(points[:, 0].max()) + top
|
|
238
|
+
x1 = int(points[:, 1].min())
|
|
239
|
+
x2 = int(points[:, 1].max())
|
|
240
|
+
return x1, y1, x2, y2
|
|
241
|
+
|
|
242
|
+
|
|
194
243
|
def _band_active_columns(mask: np.ndarray, top: int, bottom: int) -> int:
|
|
195
244
|
return int(np.count_nonzero(mask[top : bottom + 1, :].any(axis=0)))
|
|
196
245
|
|
|
@@ -229,6 +278,9 @@ def _split_wide_line_segments(line: FormulaLineCrop) -> tuple[FormulaLineCrop, .
|
|
|
229
278
|
return (line,)
|
|
230
279
|
|
|
231
280
|
mask = _ink_mask(line.image)
|
|
281
|
+
if _has_large_vertical_delimiter(mask):
|
|
282
|
+
return (line,)
|
|
283
|
+
|
|
232
284
|
column_counts = mask.sum(axis=0)
|
|
233
285
|
column_threshold = max(2, int(round(height * 0.035)))
|
|
234
286
|
column_has_ink = column_counts >= column_threshold
|
|
@@ -264,6 +316,26 @@ def _split_wide_line_segments(line: FormulaLineCrop) -> tuple[FormulaLineCrop, .
|
|
|
264
316
|
return tuple(segments) if len(segments) > 1 else (line,)
|
|
265
317
|
|
|
266
318
|
|
|
319
|
+
def _has_large_vertical_delimiter(mask: np.ndarray) -> bool:
|
|
320
|
+
height, width = mask.shape
|
|
321
|
+
if height < 48 or width < 80:
|
|
322
|
+
return False
|
|
323
|
+
content = np.argwhere(mask)
|
|
324
|
+
if content.size == 0:
|
|
325
|
+
return False
|
|
326
|
+
content_height = int(content[:, 0].max() - content[:, 0].min() + 1)
|
|
327
|
+
if content_height < height * 0.42:
|
|
328
|
+
return False
|
|
329
|
+
window = max(6, int(round(width * 0.035)))
|
|
330
|
+
column_counts = mask.sum(axis=0)
|
|
331
|
+
tall_threshold = max(12, int(round(content_height * 0.42)))
|
|
332
|
+
left_has_tall = bool(np.any(column_counts[:window] >= tall_threshold))
|
|
333
|
+
right_has_tall = bool(np.any(column_counts[-window:] >= tall_threshold))
|
|
334
|
+
interior = column_counts[window:-window] if width > window * 2 else column_counts
|
|
335
|
+
interior_tall_count = int(np.count_nonzero(interior >= tall_threshold))
|
|
336
|
+
return left_has_tall or right_has_tall or interior_tall_count >= 2
|
|
337
|
+
|
|
338
|
+
|
|
267
339
|
def _crop_segment(
|
|
268
340
|
segment_rgb: np.ndarray,
|
|
269
341
|
offset_x: int,
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
RELATION_COMMANDS = (
|
|
7
|
+
r"\leqslant",
|
|
8
|
+
r"\geqslant",
|
|
9
|
+
r"\leq",
|
|
10
|
+
r"\geq",
|
|
11
|
+
r"\approx",
|
|
12
|
+
r"\equiv",
|
|
13
|
+
r"\sim",
|
|
14
|
+
)
|
|
15
|
+
RELATION_SYMBOLS = ("=", "<", ">")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def align_latex_relation_lines(lines: list[str] | tuple[str, ...]) -> tuple[str, ...]:
|
|
19
|
+
cleaned = [line.strip() for line in lines if line and line.strip()]
|
|
20
|
+
if len(cleaned) <= 1:
|
|
21
|
+
return tuple(cleaned)
|
|
22
|
+
|
|
23
|
+
aligned: list[str] = []
|
|
24
|
+
saw_relation = False
|
|
25
|
+
for index, line in enumerate(cleaned):
|
|
26
|
+
relation = find_alignment_relation(line)
|
|
27
|
+
if relation is None:
|
|
28
|
+
if index > 0 and saw_relation:
|
|
29
|
+
aligned.append(r"&\quad " + line)
|
|
30
|
+
else:
|
|
31
|
+
aligned.append(line)
|
|
32
|
+
continue
|
|
33
|
+
|
|
34
|
+
left, operator, right = relation
|
|
35
|
+
saw_relation = True
|
|
36
|
+
if left:
|
|
37
|
+
aligned.append(f"{left} &{operator} {right}".rstrip())
|
|
38
|
+
else:
|
|
39
|
+
aligned.append(f"&{operator} {right}".rstrip())
|
|
40
|
+
return tuple(aligned)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def find_alignment_relation(text: str) -> tuple[str, str, str] | None:
|
|
44
|
+
depth = 0
|
|
45
|
+
index = 0
|
|
46
|
+
while index < len(text):
|
|
47
|
+
char = text[index]
|
|
48
|
+
if char == "\\":
|
|
49
|
+
command = _read_command(text, index)
|
|
50
|
+
if depth == 0 and command in RELATION_COMMANDS:
|
|
51
|
+
return _split_relation(text, index, command)
|
|
52
|
+
index += len(command) if command else 1
|
|
53
|
+
continue
|
|
54
|
+
if char == "{" and not _is_escaped(text, index):
|
|
55
|
+
depth += 1
|
|
56
|
+
index += 1
|
|
57
|
+
continue
|
|
58
|
+
if char == "}" and not _is_escaped(text, index):
|
|
59
|
+
depth = max(0, depth - 1)
|
|
60
|
+
index += 1
|
|
61
|
+
continue
|
|
62
|
+
if depth == 0 and char in RELATION_SYMBOLS and not _is_escaped(text, index):
|
|
63
|
+
if _is_duplicate_relation_neighbor(text, index, char):
|
|
64
|
+
index += 1
|
|
65
|
+
continue
|
|
66
|
+
return _split_relation(text, index, char)
|
|
67
|
+
index += 1
|
|
68
|
+
return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _split_relation(text: str, index: int, operator: str) -> tuple[str, str, str]:
|
|
72
|
+
left = text[:index].strip()
|
|
73
|
+
right = text[index + len(operator) :].strip()
|
|
74
|
+
return left, operator, right
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _read_command(text: str, index: int) -> str:
|
|
78
|
+
if index >= len(text) or text[index] != "\\":
|
|
79
|
+
return ""
|
|
80
|
+
end = index + 1
|
|
81
|
+
while end < len(text) and text[end].isalpha():
|
|
82
|
+
end += 1
|
|
83
|
+
return text[index:end]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _is_duplicate_relation_neighbor(text: str, index: int, char: str) -> bool:
|
|
87
|
+
if char != "=":
|
|
88
|
+
return False
|
|
89
|
+
left = text[:index].rstrip()
|
|
90
|
+
right = text[index + 1 :].lstrip()
|
|
91
|
+
return left.endswith("=") or right.startswith("=")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _is_escaped(text: str, index: int) -> bool:
|
|
95
|
+
backslashes = 0
|
|
96
|
+
pos = index - 1
|
|
97
|
+
while pos >= 0 and text[pos] == "\\":
|
|
98
|
+
backslashes += 1
|
|
99
|
+
pos -= 1
|
|
100
|
+
return backslashes % 2 == 1
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import Counter
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
_TOKEN_RE = re.compile(r"\\[A-Za-z]+|[A-Za-z]+|\d+|[^\s]")
|
|
10
|
+
_BEGIN_RE = re.compile(r"\\begin\s*\{\s*([^{}]+)\s*\}")
|
|
11
|
+
_END_RE = re.compile(r"\\end\s*\{\s*([^{}]+)\s*\}")
|
|
12
|
+
_LEFT_RE = re.compile(r"\\left(?![A-Za-z])")
|
|
13
|
+
_RIGHT_RE = re.compile(r"\\right(?![A-Za-z])")
|
|
14
|
+
_DUPLICATE_EQUALS_RE = re.compile(r"(?<!\\)=\s*=")
|
|
15
|
+
_NOISY_REPEAT_TOKENS = {r"\quad", r"\qquad", r"\cdots", r"\ldots"}
|
|
16
|
+
|
|
17
|
+
SEVERE_LATEX_QUALITY_FLAGS = {
|
|
18
|
+
"duplicate_relation",
|
|
19
|
+
"repeated_token_run",
|
|
20
|
+
"excessive_repeated_token",
|
|
21
|
+
"unbalanced_group",
|
|
22
|
+
"mismatched_environment",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def latex_quality_flags(text: str) -> tuple[str, ...]:
|
|
27
|
+
value = str(text or "")
|
|
28
|
+
flags: set[str] = set()
|
|
29
|
+
if _DUPLICATE_EQUALS_RE.search(value):
|
|
30
|
+
flags.add("duplicate_relation")
|
|
31
|
+
if _has_repeated_token_run(value):
|
|
32
|
+
flags.add("repeated_token_run")
|
|
33
|
+
if _has_excessive_repeated_token(value):
|
|
34
|
+
flags.add("excessive_repeated_token")
|
|
35
|
+
if _group_balance(value) != 0:
|
|
36
|
+
flags.add("unbalanced_group")
|
|
37
|
+
if not _environments_balanced(value):
|
|
38
|
+
flags.add("mismatched_environment")
|
|
39
|
+
if len(_LEFT_RE.findall(value)) != len(_RIGHT_RE.findall(value)):
|
|
40
|
+
flags.add("unbalanced_left_right")
|
|
41
|
+
return tuple(sorted(flags))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def has_severe_latex_quality_issue(text: str) -> bool:
|
|
45
|
+
return bool(set(latex_quality_flags(text)) & SEVERE_LATEX_QUALITY_FLAGS)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _has_repeated_token_run(text: str) -> bool:
|
|
49
|
+
previous = ""
|
|
50
|
+
run = 0
|
|
51
|
+
for token in _TOKEN_RE.findall(text):
|
|
52
|
+
if token == previous:
|
|
53
|
+
run += 1
|
|
54
|
+
else:
|
|
55
|
+
previous = token
|
|
56
|
+
run = 1
|
|
57
|
+
if run >= 8:
|
|
58
|
+
return True
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _has_excessive_repeated_token(text: str) -> bool:
|
|
63
|
+
counts = Counter(token for token in _TOKEN_RE.findall(text) if token in _NOISY_REPEAT_TOKENS)
|
|
64
|
+
return any(count >= 18 for count in counts.values())
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _group_balance(text: str) -> int:
|
|
68
|
+
depth = 0
|
|
69
|
+
for index, char in enumerate(text):
|
|
70
|
+
if char == "{" and not _is_escaped(text, index):
|
|
71
|
+
depth += 1
|
|
72
|
+
elif char == "}" and not _is_escaped(text, index):
|
|
73
|
+
depth -= 1
|
|
74
|
+
return depth
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _environments_balanced(text: str) -> bool:
|
|
78
|
+
stack: list[str] = []
|
|
79
|
+
for match in re.finditer(r"\\(?:begin|end)\s*\{\s*([^{}]+)\s*\}", text):
|
|
80
|
+
command = match.group(0)
|
|
81
|
+
env = match.group(1).strip()
|
|
82
|
+
if command.startswith(r"\begin"):
|
|
83
|
+
stack.append(env)
|
|
84
|
+
continue
|
|
85
|
+
if not stack or stack.pop() != env:
|
|
86
|
+
return False
|
|
87
|
+
return not stack
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _is_escaped(text: str, index: int) -> bool:
|
|
91
|
+
backslashes = 0
|
|
92
|
+
pos = index - 1
|
|
93
|
+
while pos >= 0 and text[pos] == "\\":
|
|
94
|
+
backslashes += 1
|
|
95
|
+
pos -= 1
|
|
96
|
+
return backslashes % 2 == 1
|
|
@@ -1,60 +1,60 @@
|
|
|
1
|
-
{
|
|
2
|
-
"version": 1,
|
|
3
|
-
"models": {
|
|
4
|
-
"mathcraft-formula-det": {
|
|
5
|
-
"version": "1",
|
|
6
|
-
"runtime": "onnx",
|
|
7
|
-
"files": [
|
|
8
|
-
{
|
|
9
|
-
"path": "mathcraft-mfd.onnx",
|
|
10
|
-
"sha256": "40d4fc852d99bcbf25a9478897d2f49fbbb8f7fdd6569c088cd1c31386293bd7"
|
|
11
|
-
}
|
|
12
|
-
],
|
|
13
|
-
"sources": [
|
|
14
|
-
"https://github.com/SakuraMathcraft/MathCraft-Models/releases/download/v1.0.0/mathcraft-formula-det.zip"
|
|
15
|
-
]
|
|
16
|
-
},
|
|
17
|
-
"mathcraft-formula-rec": {
|
|
18
|
-
"version": "1",
|
|
19
|
-
"runtime": "onnx",
|
|
20
|
-
"files": [
|
|
21
|
-
{
|
|
22
|
-
"path": "config.json",
|
|
23
|
-
"sha256": "9f3812441d397c871b9b2a74e8d956b939aec5f4f45745bba9214e968d56449d"
|
|
24
|
-
},
|
|
25
|
-
{
|
|
26
|
-
"path": "encoder_model.onnx",
|
|
27
|
-
"sha256": "bd8d5c322792e9ec45793af5569e9748f82a3d728a9e00213dbfc56c1486f37d"
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"path": "decoder_model.onnx",
|
|
31
|
-
"sha256": "fd0f92d7a012f3dae41e1ac79421aea0ea888b5a66cb3f9a004e424f82f3daed"
|
|
32
|
-
},
|
|
33
|
-
{
|
|
34
|
-
"path": "generation_config.json",
|
|
35
|
-
"sha256": "cbea88288d5576a9655ad04e2456768544be22273a1c5ca160e0d16384639b4f"
|
|
36
|
-
},
|
|
37
|
-
{
|
|
38
|
-
"path": "preprocessor_config.json",
|
|
39
|
-
"sha256": "36a945a7cc645688b9ef64dabae16979cf5f7c1c448569cc306694edc0598b9b"
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
"path": "special_tokens_map.json",
|
|
43
|
-
"sha256": "8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835"
|
|
44
|
-
},
|
|
45
|
-
{
|
|
46
|
-
"path": "tokenizer.json",
|
|
47
|
-
"sha256": "3e2ab757277d22639bec28c9d7972e352d3d1dba223051fa674002dc5ab64df3"
|
|
48
|
-
},
|
|
49
|
-
{
|
|
50
|
-
"path": "tokenizer_config.json",
|
|
51
|
-
"sha256": "7ffff31747c73b1a462b766abfc128e03f669e5b8452fe6e175b1430a078ac8d"
|
|
52
|
-
}
|
|
53
|
-
],
|
|
54
|
-
"sources": [
|
|
55
|
-
"https://github.com/SakuraMathcraft/MathCraft-Models/releases/download/v1.0.0/mathcraft-formula-rec.zip"
|
|
56
|
-
]
|
|
57
|
-
},
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"models": {
|
|
4
|
+
"mathcraft-formula-det": {
|
|
5
|
+
"version": "1",
|
|
6
|
+
"runtime": "onnx",
|
|
7
|
+
"files": [
|
|
8
|
+
{
|
|
9
|
+
"path": "mathcraft-mfd.onnx",
|
|
10
|
+
"sha256": "40d4fc852d99bcbf25a9478897d2f49fbbb8f7fdd6569c088cd1c31386293bd7"
|
|
11
|
+
}
|
|
12
|
+
],
|
|
13
|
+
"sources": [
|
|
14
|
+
"https://github.com/SakuraMathcraft/MathCraft-Models/releases/download/v1.0.0/mathcraft-formula-det.zip"
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
"mathcraft-formula-rec": {
|
|
18
|
+
"version": "1",
|
|
19
|
+
"runtime": "onnx",
|
|
20
|
+
"files": [
|
|
21
|
+
{
|
|
22
|
+
"path": "config.json",
|
|
23
|
+
"sha256": "9f3812441d397c871b9b2a74e8d956b939aec5f4f45745bba9214e968d56449d"
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"path": "encoder_model.onnx",
|
|
27
|
+
"sha256": "bd8d5c322792e9ec45793af5569e9748f82a3d728a9e00213dbfc56c1486f37d"
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"path": "decoder_model.onnx",
|
|
31
|
+
"sha256": "fd0f92d7a012f3dae41e1ac79421aea0ea888b5a66cb3f9a004e424f82f3daed"
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"path": "generation_config.json",
|
|
35
|
+
"sha256": "cbea88288d5576a9655ad04e2456768544be22273a1c5ca160e0d16384639b4f"
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"path": "preprocessor_config.json",
|
|
39
|
+
"sha256": "36a945a7cc645688b9ef64dabae16979cf5f7c1c448569cc306694edc0598b9b"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"path": "special_tokens_map.json",
|
|
43
|
+
"sha256": "8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835"
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"path": "tokenizer.json",
|
|
47
|
+
"sha256": "3e2ab757277d22639bec28c9d7972e352d3d1dba223051fa674002dc5ab64df3"
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"path": "tokenizer_config.json",
|
|
51
|
+
"sha256": "7ffff31747c73b1a462b766abfc128e03f669e5b8452fe6e175b1430a078ac8d"
|
|
52
|
+
}
|
|
53
|
+
],
|
|
54
|
+
"sources": [
|
|
55
|
+
"https://github.com/SakuraMathcraft/MathCraft-Models/releases/download/v1.0.0/mathcraft-formula-rec.zip"
|
|
56
|
+
]
|
|
57
|
+
},
|
|
58
58
|
"mathcraft-text-det": {
|
|
59
59
|
"version": "1",
|
|
60
60
|
"runtime": "onnx",
|
|
@@ -27,6 +27,11 @@ from .errors import ModelCacheError
|
|
|
27
27
|
from .formula_lines import compose_aligned_formula, compose_formula_line, split_formula_line_groups
|
|
28
28
|
from .hardware import choose_rec_batch_num, detect_hardware_info
|
|
29
29
|
from .image import load_image_rgb, rgb_to_bgr
|
|
30
|
+
from .latex_quality import (
|
|
31
|
+
SEVERE_LATEX_QUALITY_FLAGS,
|
|
32
|
+
has_severe_latex_quality_issue,
|
|
33
|
+
latex_quality_flags,
|
|
34
|
+
)
|
|
30
35
|
from .layout import (
|
|
31
36
|
annotate_blocks,
|
|
32
37
|
box_to_points,
|
|
@@ -422,6 +427,7 @@ class MathCraftRuntime:
|
|
|
422
427
|
text=formula_text,
|
|
423
428
|
score=min(formula_box.score, formula_score),
|
|
424
429
|
source="formula_rec",
|
|
430
|
+
confidence_flags=latex_quality_flags(formula_text),
|
|
425
431
|
)
|
|
426
432
|
)
|
|
427
433
|
if not blocks:
|
|
@@ -433,6 +439,7 @@ class MathCraftRuntime:
|
|
|
433
439
|
text=formula.text,
|
|
434
440
|
score=formula.score,
|
|
435
441
|
source="formula_fallback",
|
|
442
|
+
confidence_flags=latex_quality_flags(formula.text),
|
|
436
443
|
)
|
|
437
444
|
)
|
|
438
445
|
blocks = list(resolve_formula_text_conflicts(blocks, image_size=(int(width), int(height))))
|
|
@@ -475,7 +482,30 @@ class MathCraftRuntime:
|
|
|
475
482
|
count = len(line_group.crops)
|
|
476
483
|
grouped_results.append(flat_results[offset : offset + count])
|
|
477
484
|
offset += count
|
|
478
|
-
|
|
485
|
+
grouped_results = _repair_severe_segmented_lines(
|
|
486
|
+
rgb,
|
|
487
|
+
line_groups,
|
|
488
|
+
grouped_results,
|
|
489
|
+
model_dir,
|
|
490
|
+
provider_info,
|
|
491
|
+
max_new_tokens=max_new_tokens,
|
|
492
|
+
)
|
|
493
|
+
merged_text, merged_score = _merge_formula_group_results(grouped_results)
|
|
494
|
+
if has_severe_latex_quality_issue(merged_text):
|
|
495
|
+
fallback_text, fallback_score = recognize_formula_image(
|
|
496
|
+
rgb,
|
|
497
|
+
model_dir,
|
|
498
|
+
provider_info,
|
|
499
|
+
max_new_tokens=max_new_tokens,
|
|
500
|
+
)
|
|
501
|
+
if _prefer_formula_fallback(
|
|
502
|
+
merged_text,
|
|
503
|
+
merged_score,
|
|
504
|
+
fallback_text,
|
|
505
|
+
fallback_score,
|
|
506
|
+
):
|
|
507
|
+
return fallback_text, fallback_score
|
|
508
|
+
return merged_text, merged_score
|
|
479
509
|
return recognize_formula_image(
|
|
480
510
|
rgb,
|
|
481
511
|
model_dir,
|
|
@@ -591,5 +621,64 @@ def _merge_formula_group_results(results: list[list[tuple[str, float]]]) -> tupl
|
|
|
591
621
|
return text, score
|
|
592
622
|
|
|
593
623
|
|
|
624
|
+
def _repair_severe_segmented_lines(
|
|
625
|
+
rgb,
|
|
626
|
+
line_groups,
|
|
627
|
+
grouped_results: list[list[tuple[str, float]]],
|
|
628
|
+
model_dir: Path,
|
|
629
|
+
provider_info: ProviderInfo,
|
|
630
|
+
*,
|
|
631
|
+
max_new_tokens: int,
|
|
632
|
+
) -> list[list[tuple[str, float]]]:
|
|
633
|
+
repaired = [list(line_results) for line_results in grouped_results]
|
|
634
|
+
for index, (line_group, line_results) in enumerate(zip(line_groups, grouped_results)):
|
|
635
|
+
if len(line_group.crops) <= 1:
|
|
636
|
+
continue
|
|
637
|
+
line_text = compose_formula_line([text for text, _score in line_results])
|
|
638
|
+
if not has_severe_latex_quality_issue(line_text):
|
|
639
|
+
continue
|
|
640
|
+
fallback_text, fallback_score = recognize_formula_image(
|
|
641
|
+
_crop_formula_line_group(rgb, line_group),
|
|
642
|
+
model_dir,
|
|
643
|
+
provider_info,
|
|
644
|
+
max_new_tokens=max_new_tokens,
|
|
645
|
+
)
|
|
646
|
+
line_score = _mean_score(line_results)
|
|
647
|
+
if _prefer_formula_fallback(line_text, line_score, fallback_text, fallback_score):
|
|
648
|
+
repaired[index] = [(fallback_text, fallback_score)]
|
|
649
|
+
return repaired
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
def _crop_formula_line_group(rgb, line_group) -> object:
|
|
653
|
+
left = min(crop.box[0] for crop in line_group.crops)
|
|
654
|
+
top = min(crop.box[1] for crop in line_group.crops)
|
|
655
|
+
right = max(crop.box[2] for crop in line_group.crops)
|
|
656
|
+
bottom = max(crop.box[3] for crop in line_group.crops)
|
|
657
|
+
return rgb[top:bottom, left:right].copy()
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def _mean_score(results: list[tuple[str, float]]) -> float:
|
|
661
|
+
if not results:
|
|
662
|
+
return 0.0
|
|
663
|
+
return float(sum(score for _text, score in results) / len(results))
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def _prefer_formula_fallback(
|
|
667
|
+
candidate_text: str,
|
|
668
|
+
candidate_score: float,
|
|
669
|
+
fallback_text: str,
|
|
670
|
+
fallback_score: float,
|
|
671
|
+
) -> bool:
|
|
672
|
+
candidate_flags = set(latex_quality_flags(candidate_text))
|
|
673
|
+
fallback_flags = set(latex_quality_flags(fallback_text))
|
|
674
|
+
candidate_severe = candidate_flags & SEVERE_LATEX_QUALITY_FLAGS
|
|
675
|
+
fallback_severe = fallback_flags & SEVERE_LATEX_QUALITY_FLAGS
|
|
676
|
+
if len(fallback_severe) < len(candidate_severe):
|
|
677
|
+
return fallback_score >= candidate_score * 0.78
|
|
678
|
+
if fallback_severe == candidate_severe:
|
|
679
|
+
return fallback_score > candidate_score + 0.04
|
|
680
|
+
return False
|
|
681
|
+
|
|
682
|
+
|
|
594
683
|
def _formula_mask_margin(width: int, height: int) -> int:
|
|
595
684
|
return max(2, int(round(max(width, height) / 640.0)))
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
LICENSE
|
|
2
2
|
MANIFEST.in
|
|
3
|
-
README.md
|
|
4
3
|
README_MATHCRAFT_OCR.md
|
|
5
4
|
pyproject.toml
|
|
5
|
+
readme.md
|
|
6
6
|
mathcraft_ocr/__init__.py
|
|
7
7
|
mathcraft_ocr/__main__.py
|
|
8
8
|
mathcraft_ocr/api.py
|
|
@@ -16,6 +16,8 @@ mathcraft_ocr/errors.py
|
|
|
16
16
|
mathcraft_ocr/formula_lines.py
|
|
17
17
|
mathcraft_ocr/hardware.py
|
|
18
18
|
mathcraft_ocr/image.py
|
|
19
|
+
mathcraft_ocr/latex_alignment.py
|
|
20
|
+
mathcraft_ocr/latex_quality.py
|
|
19
21
|
mathcraft_ocr/layout.py
|
|
20
22
|
mathcraft_ocr/manifest.py
|
|
21
23
|
mathcraft_ocr/profiles.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mathcraft-ocr"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.2"
|
|
8
8
|
description = "ONNX-only OCR runtime for mathematical documents"
|
|
9
9
|
readme = "README_MATHCRAFT_OCR.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -79,13 +79,17 @@ exclude = ["src*", "test*", "scripts*", "docs*", "release_assets*", "build*", "d
|
|
|
79
79
|
[tool.setuptools.package-data]
|
|
80
80
|
mathcraft_ocr = ["manifests/*.json"]
|
|
81
81
|
|
|
82
|
+
[tool.pytest.ini_options]
|
|
83
|
+
pythonpath = ["src"]
|
|
84
|
+
testpaths = ["test"]
|
|
85
|
+
addopts = ["--ignore=python311", "--ignore=tools/deps", "--ignore=.venv", "--ignore=build", "--ignore=dist"]
|
|
86
|
+
|
|
82
87
|
[tool.vulture]
|
|
83
88
|
paths = ["src"]
|
|
84
89
|
exclude = [
|
|
85
90
|
"build/",
|
|
86
91
|
"python311/",
|
|
87
|
-
"
|
|
88
|
-
"src/get-pip.py"
|
|
92
|
+
"tools/deps/python311/"
|
|
89
93
|
]
|
|
90
94
|
min_confidence = 70
|
|
91
95
|
sort_by_size = true
|