anymd 0.0.6 → 0.0.8
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/pdf-to-md.py +38 -9
- package/src/bootstrap.ts +1 -1
- package/tui.tsx +3 -1
package/package.json
CHANGED
package/scripts/pdf-to-md.py
CHANGED
|
@@ -6,14 +6,30 @@ from pathlib import Path
|
|
|
6
6
|
from marker.converters.pdf import PdfConverter
|
|
7
7
|
from marker.models import create_model_dict
|
|
8
8
|
from marker.output import text_from_rendered
|
|
9
|
+
from markitdown import MarkItDown
|
|
9
10
|
|
|
10
11
|
MIN_ARGS = 2
|
|
12
|
+
MIN_FALLBACK_CHARS = 10
|
|
13
|
+
|
|
14
|
+
_mid = MarkItDown()
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
def _emit(data: dict[str, object]) -> None:
|
|
14
18
|
print(json.dumps(data), flush=True)
|
|
15
19
|
|
|
16
20
|
|
|
21
|
+
def _markitdown_fallback(pdf_path: str) -> str | None:
|
|
22
|
+
try:
|
|
23
|
+
result = _mid.convert(pdf_path)
|
|
24
|
+
text = result.text_content.strip()
|
|
25
|
+
except Exception: # noqa: BLE001
|
|
26
|
+
return None
|
|
27
|
+
else:
|
|
28
|
+
if len(text) < MIN_FALLBACK_CHARS:
|
|
29
|
+
return None
|
|
30
|
+
return text
|
|
31
|
+
|
|
32
|
+
|
|
17
33
|
def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: int, total: int) -> None:
|
|
18
34
|
t1 = time.time()
|
|
19
35
|
try:
|
|
@@ -29,15 +45,28 @@ def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: i
|
|
|
29
45
|
'seconds': round(time.time() - t1, 1),
|
|
30
46
|
'chars': len(md),
|
|
31
47
|
})
|
|
32
|
-
except Exception as
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
'
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
48
|
+
except Exception as marker_exc: # noqa: BLE001
|
|
49
|
+
fallback_md = _markitdown_fallback(pdf_path)
|
|
50
|
+
if fallback_md:
|
|
51
|
+
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
|
|
52
|
+
Path(out_path).write_text(fallback_md, encoding='utf-8')
|
|
53
|
+
_emit({
|
|
54
|
+
'type': 'converted',
|
|
55
|
+
'index': index,
|
|
56
|
+
'total': total,
|
|
57
|
+
'file': Path(pdf_path).name,
|
|
58
|
+
'seconds': round(time.time() - t1, 1),
|
|
59
|
+
'chars': len(fallback_md),
|
|
60
|
+
})
|
|
61
|
+
else:
|
|
62
|
+
_emit({
|
|
63
|
+
'type': 'error',
|
|
64
|
+
'index': index,
|
|
65
|
+
'total': total,
|
|
66
|
+
'file': Path(pdf_path).name,
|
|
67
|
+
'seconds': round(time.time() - t1, 1),
|
|
68
|
+
'error': str(marker_exc),
|
|
69
|
+
})
|
|
41
70
|
|
|
42
71
|
|
|
43
72
|
def main() -> None:
|
package/src/bootstrap.ts
CHANGED
|
@@ -10,7 +10,7 @@ interface BootstrapCallbacks {
|
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mammoth', 'mlx_vlm', 'pypdfium2', 'torchvision']
|
|
13
|
-
const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx]', 'mlx-vlm', 'pypdfium2', 'torchvision']
|
|
13
|
+
const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx,pdf]', 'mlx-vlm', 'pypdfium2', 'torchvision']
|
|
14
14
|
const CHANDRA_MODEL_ID = 'mlx-community/chandra-8bit'
|
|
15
15
|
|
|
16
16
|
const checkImportable = async (py: string, pkg: string): Promise<boolean> => {
|
package/tui.tsx
CHANGED
|
@@ -407,6 +407,7 @@ const SidebarStep = ({
|
|
|
407
407
|
}
|
|
408
408
|
|
|
409
409
|
const ERROR_PATTERN = /\b(?:ERROR|Error:|Failed:|failed|FAILED|\u2716|exception|traceback)/iu
|
|
410
|
+
const NOISY_PATTERN = /index \d+ is out of bounds/iu
|
|
410
411
|
|
|
411
412
|
const PreflightBanner = ({ errors, warnings }: { errors: string[]; warnings: string[] }) => {
|
|
412
413
|
if (errors.length === 0 && warnings.length === 0) return null
|
|
@@ -772,7 +773,8 @@ const App = () => {
|
|
|
772
773
|
procRef.current = spawned.proc
|
|
773
774
|
|
|
774
775
|
const onLine = (line: string): void => {
|
|
775
|
-
|
|
776
|
+
const noisy = NOISY_PATTERN.test(line)
|
|
777
|
+
if (!noisy) dispatch({ line, type: 'APPEND_OUTPUT' })
|
|
776
778
|
if (key === 'pipeline') appendPipelineLog(line)
|
|
777
779
|
if (ERROR_PATTERN.test(line)) appendErrorLog(key, line)
|
|
778
780
|
}
|