anymd 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "anymd",
3
- "version": "0.0.10",
3
+ "version": "0.0.11",
4
4
  "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
5
5
  "keywords": [
6
6
  "markdown",
@@ -46,9 +46,13 @@
46
46
  "dependencies": {
47
47
  "markdownlint": "^0.40.0",
48
48
  "p-map": "^7.0.4",
49
+ "turndown": "^7.2.2",
49
50
  "yoctocolors": "^2.1.2",
50
51
  "zod": "^4.3.6"
51
52
  },
53
+ "devDependencies": {
54
+ "@types/turndown": "^5.0.6"
55
+ },
52
56
  "engines": {
53
57
  "bun": ">=1.0.0"
54
58
  },
@@ -24,7 +24,7 @@ OUTPUT_BASE = Path(_get_arg('--output-base', 'output/ocr-raw'))
24
24
  STATUS_FILE = Path(_get_arg('--status-file', 'output/ocr-progress.json'))
25
25
  LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
26
26
 
27
- MODEL_ID = 'mlx-community/chandra-8bit'
27
+ MODEL_ID = 'mlx-community/chandra-4bit'
28
28
  IMAGE_DPI = 150
29
29
  MAX_TOKENS = 8192
30
30
 
@@ -2,12 +2,21 @@
2
2
  import { readFile, writeFile } from 'node:fs/promises'
3
3
  import { basename, join } from 'node:path'
4
4
  import pMap from 'p-map'
5
+ import TurndownService from 'turndown'
5
6
 
6
7
  import type { CleanResult } from '~/types'
7
8
 
8
9
  import { ensureDir, loadExistingMdFiles, logger } from '~/utils'
9
10
 
10
- const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
11
+ const td = new TurndownService({ bulletListMarker: '-', emDelimiter: '*', headingStyle: 'atx' })
12
+ td.remove('style')
13
+ const turndown = td,
14
+ HTML_DETECT_REGEX = /<\/?[a-z][a-z0-9]*[^>]*>/iu,
15
+ stripHtml = (text: string): string => {
16
+ if (!HTML_DETECT_REGEX.test(text)) return text
17
+ return turndown.turndown(text)
18
+ },
19
+ BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
11
20
  PHAN_REGEX = /^(?:Phần|PHẦN)\s+/u,
12
21
  CHUONG_REGEX = /^(?:Chương|CHƯƠNG)\s+/u,
13
22
  MUC_REGEX = /^(?:Mục|MỤC)\s+\d/u,
@@ -18,6 +27,7 @@ const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
18
27
  HEADER_TABLE_LINE_REGEX = /^\|.*\|$/u,
19
28
  TABLE_SEP_REGEX = /^\|\s*-+/u,
20
29
  MULTIPLE_BLANKS_REGEX = /\n{3,}/gu,
30
+ MULTIPLE_SPACES_REGEX = / {2,}/gu,
21
31
  // oxlint-disable-next-line no-control-regex
22
32
  // eslint-disable-next-line no-control-regex
23
33
  CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F]/gu,
@@ -85,9 +95,11 @@ const BOLD_LINE_REGEX = /^\*\*(?<content>.+)\*\*$/u,
85
95
  return enhanced
86
96
  },
87
97
  enhanceMarkdown = (text: string): string => {
88
- const enhanced = processLines(text.split('\n'))
98
+ const cleaned = stripHtml(text)
99
+ const enhanced = processLines(cleaned.split('\n'))
89
100
  let output = enhanced.join('\n')
90
101
  output = output.replace(MULTIPLE_BLANKS_REGEX, '\n\n')
102
+ output = output.replace(MULTIPLE_SPACES_REGEX, ' ')
91
103
  output = output.trim()
92
104
  return output
93
105
  },