anymd 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "anymd",
3
- "version": "0.0.7",
3
+ "version": "0.0.8",
4
4
  "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
5
5
  "keywords": [
6
6
  "markdown",
@@ -6,14 +6,30 @@ from pathlib import Path
6
6
  from marker.converters.pdf import PdfConverter
7
7
  from marker.models import create_model_dict
8
8
  from marker.output import text_from_rendered
9
+ from markitdown import MarkItDown
9
10
 
10
11
  MIN_ARGS = 2
12
+ MIN_FALLBACK_CHARS = 10
13
+
14
+ _mid = MarkItDown()
11
15
 
12
16
 
13
17
  def _emit(data: dict[str, object]) -> None:
14
18
  print(json.dumps(data), flush=True)
15
19
 
16
20
 
21
+ def _markitdown_fallback(pdf_path: str) -> str | None:
22
+ try:
23
+ result = _mid.convert(pdf_path)
24
+ text = result.text_content.strip()
25
+ except Exception: # noqa: BLE001
26
+ return None
27
+ else:
28
+ if len(text) < MIN_FALLBACK_CHARS:
29
+ return None
30
+ return text
31
+
32
+
17
33
  def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: int, total: int) -> None:
18
34
  t1 = time.time()
19
35
  try:
@@ -29,15 +45,28 @@ def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: i
29
45
  'seconds': round(time.time() - t1, 1),
30
46
  'chars': len(md),
31
47
  })
32
- except Exception as exc: # noqa: BLE001
33
- _emit({
34
- 'type': 'error',
35
- 'index': index,
36
- 'total': total,
37
- 'file': Path(pdf_path).name,
38
- 'seconds': round(time.time() - t1, 1),
39
- 'error': str(exc),
40
- })
48
+ except Exception as marker_exc: # noqa: BLE001
49
+ fallback_md = _markitdown_fallback(pdf_path)
50
+ if fallback_md:
51
+ Path(out_path).parent.mkdir(parents=True, exist_ok=True)
52
+ Path(out_path).write_text(fallback_md, encoding='utf-8')
53
+ _emit({
54
+ 'type': 'converted',
55
+ 'index': index,
56
+ 'total': total,
57
+ 'file': Path(pdf_path).name,
58
+ 'seconds': round(time.time() - t1, 1),
59
+ 'chars': len(fallback_md),
60
+ })
61
+ else:
62
+ _emit({
63
+ 'type': 'error',
64
+ 'index': index,
65
+ 'total': total,
66
+ 'file': Path(pdf_path).name,
67
+ 'seconds': round(time.time() - t1, 1),
68
+ 'error': str(marker_exc),
69
+ })
41
70
 
42
71
 
43
72
  def main() -> None:
package/src/bootstrap.ts CHANGED
@@ -10,7 +10,7 @@ interface BootstrapCallbacks {
10
10
  }
11
11
 
12
12
  const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mammoth', 'mlx_vlm', 'pypdfium2', 'torchvision']
13
- const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx]', 'mlx-vlm', 'pypdfium2', 'torchvision']
13
+ const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx,pdf]', 'mlx-vlm', 'pypdfium2', 'torchvision']
14
14
  const CHANDRA_MODEL_ID = 'mlx-community/chandra-8bit'
15
15
 
16
16
  const checkImportable = async (py: string, pkg: string): Promise<boolean> => {
package/tui.tsx CHANGED
@@ -407,6 +407,7 @@ const SidebarStep = ({
407
407
  }
408
408
 
409
409
  const ERROR_PATTERN = /\b(?:ERROR|Error:|Failed:|failed|FAILED|\u2716|exception|traceback)/iu
410
+ const NOISY_PATTERN = /index \d+ is out of bounds/iu
410
411
 
411
412
  const PreflightBanner = ({ errors, warnings }: { errors: string[]; warnings: string[] }) => {
412
413
  if (errors.length === 0 && warnings.length === 0) return null
@@ -772,7 +773,8 @@ const App = () => {
772
773
  procRef.current = spawned.proc
773
774
 
774
775
  const onLine = (line: string): void => {
775
- dispatch({ line, type: 'APPEND_OUTPUT' })
776
+ const noisy = NOISY_PATTERN.test(line)
777
+ if (!noisy) dispatch({ line, type: 'APPEND_OUTPUT' })
776
778
  if (key === 'pipeline') appendPipelineLog(line)
777
779
  if (ERROR_PATTERN.test(line)) appendErrorLog(key, line)
778
780
  }