yaohao 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
// PDF parsing module: converts the PDFs found in attachments to plain text and reuses extractMetricsFromText from parse.js.
|
|
2
2
|
// Uses pdfjs-dist (pure JS, no native dependencies).
|
|
3
3
|
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { createRequire } from 'node:module';
|
|
4
6
|
import { fetchBinary } from './crawl.js';
|
|
5
7
|
import { extractMetricsFromText } from './parse.js';
|
|
6
8
|
|
|
9
|
+
const require = createRequire(import.meta.url);
|
|
10
|
+
|
|
11
|
+
// pdfjs-dist standard-font directory: the Node build needs a plain filesystem path (not a file:// URL), and it must end with a trailing /
|
|
12
|
+
const PDFJS_DIR = path.dirname(require.resolve('pdfjs-dist/package.json'));
|
|
13
|
+
const STANDARD_FONT_DATA_URL = `${PDFJS_DIR}/standard_fonts/`;
|
|
14
|
+
|
|
7
15
|
let _pdfjs = null;
|
|
8
16
|
|
|
9
17
|
async function getPdfjs() {
|
|
@@ -20,7 +28,13 @@ async function getPdfjs() {
|
|
|
20
28
|
export async function extractPdfText(pdfUrl) {
|
|
21
29
|
const pdfjs = await getPdfjs();
|
|
22
30
|
const { buffer } = await fetchBinary(pdfUrl);
|
|
23
|
-
const doc = await pdfjs.getDocument({
|
|
31
|
+
const doc = await pdfjs.getDocument({
|
|
32
|
+
data: new Uint8Array(buffer),
|
|
33
|
+
standardFontDataUrl: STANDARD_FONT_DATA_URL,
|
|
34
|
+
disableFontFace: true, // 文本抽取不需要字体形状,直接禁用避免加载
|
|
35
|
+
useSystemFonts: false,
|
|
36
|
+
verbosity: 0, // 0=ERRORS only,去掉 info/warning 噪音
|
|
37
|
+
}).promise;
|
|
24
38
|
const pages = [];
|
|
25
39
|
for (let i = 1; i <= doc.numPages; i++) {
|
|
26
40
|
const page = await doc.getPage(i);
|