@make-u-free/migi 0.5.10 → 0.5.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/tools.js +29 -15
package/package.json
CHANGED
package/src/tools.js
CHANGED
|
@@ -8,7 +8,8 @@ import chalk from 'chalk'
|
|
|
8
8
|
import xlsxPkg from 'xlsx'
|
|
9
9
|
import { createRequire } from 'module'
|
|
10
10
|
const require = createRequire(import.meta.url)
|
|
11
|
-
const
|
|
11
|
+
const _pdfParseModule = require('pdf-parse')
|
|
12
|
+
const pdfParse = typeof _pdfParseModule === 'function' ? _pdfParseModule : _pdfParseModule.default
|
|
12
13
|
import AdmZip from 'adm-zip'
|
|
13
14
|
import OpenAI from 'openai'
|
|
14
15
|
import { httpsAgent } from './tls.js'
|
|
@@ -134,14 +135,22 @@ function extractImagesFromPdf(buf) {
|
|
|
134
135
|
const images = []
|
|
135
136
|
let i = 0
|
|
136
137
|
|
|
137
|
-
while (i < buf.length -
|
|
138
|
-
// JPEG: FF D8
|
|
139
|
-
if (buf[i] === 0xFF && buf[i + 1] === 0xD8) {
|
|
140
|
-
const
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
138
|
+
while (i < buf.length - 3) {
|
|
139
|
+
// JPEG: FF D8 で始まり、直後に FF E0〜EF(APPマーカー)か FF DB(DQT)が続く本物だけ拾う
|
|
140
|
+
if (buf[i] === 0xFF && buf[i + 1] === 0xD8 && buf[i + 2] === 0xFF) {
|
|
141
|
+
const nextMarker = buf[i + 3]
|
|
142
|
+
const isApp = nextMarker >= 0xE0 && nextMarker <= 0xEF // JFIF / EXIF 等
|
|
143
|
+
const isDqt = nextMarker === 0xDB // 量子化テーブル
|
|
144
|
+
if (isApp || isDqt) {
|
|
145
|
+
const eoiIdx = buf.indexOf(Buffer.from([0xFF, 0xD9]), i + 4)
|
|
146
|
+
if (eoiIdx === -1) break
|
|
147
|
+
const data = buf.slice(i, eoiIdx + 2)
|
|
148
|
+
if (data.length > 1024) { // 1KB未満はアイコン等のゴミなので除外
|
|
149
|
+
images.push({ data, mime: 'image/jpeg' })
|
|
150
|
+
}
|
|
151
|
+
i = eoiIdx + 2
|
|
152
|
+
continue
|
|
153
|
+
}
|
|
145
154
|
}
|
|
146
155
|
|
|
147
156
|
// PNG: 89 50 4E 47 0D 0A 1A 0A で始まる
|
|
@@ -152,7 +161,10 @@ function extractImagesFromPdf(buf) {
|
|
|
152
161
|
) {
|
|
153
162
|
const iend = buf.indexOf(Buffer.from([0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82]), i + 8)
|
|
154
163
|
if (iend === -1) break
|
|
155
|
-
|
|
164
|
+
const data = buf.slice(i, iend + 8)
|
|
165
|
+
if (data.length > 1024) {
|
|
166
|
+
images.push({ data, mime: 'image/png' })
|
|
167
|
+
}
|
|
156
168
|
i = iend + 8
|
|
157
169
|
continue
|
|
158
170
|
}
|
|
@@ -222,11 +234,13 @@ export async function executeTool(name, args, opts = {}) {
|
|
|
222
234
|
const buf = readFileSync(args.path)
|
|
223
235
|
|
|
224
236
|
// Step 1: テキストPDFとして抽出を試みる
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
237
|
+
if (typeof pdfParse === 'function') {
|
|
238
|
+
try {
|
|
239
|
+
const data = await pdfParse(buf)
|
|
240
|
+
const text = data.text?.trim()
|
|
241
|
+
if (text) return text
|
|
242
|
+
} catch (_) {}
|
|
243
|
+
}
|
|
230
244
|
|
|
231
245
|
// Step 2: 画像PDFとしてVision APIでOCR(ネイティブ依存なし)
|
|
232
246
|
if (!opts.apiKey) return '(テキストが抽出できませんでした)'
|