@make-u-free/migi 0.5.11 → 0.5.13

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (2):
  1. package/package.json +2 -1
  2. package/src/tools.js +22 -5
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@make-u-free/migi",
3
- "version": "0.5.11",
3
+ "version": "0.5.13",
4
4
  "description": "Your AI right-hand agent. Works anywhere, with any LLM API.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -17,6 +17,7 @@
17
17
  "glob": "^11.0.0",
18
18
  "openai": "^4.0.0",
19
19
  "pdf-parse": "^2.4.5",
20
+ "pdfjs-dist": "^5.5.207",
20
21
  "xlsx": "^0.18.5"
21
22
  },
22
23
  "engines": {
package/src/tools.js CHANGED
@@ -8,8 +8,10 @@ import chalk from 'chalk'
8
8
  import xlsxPkg from 'xlsx'
9
9
  import { createRequire } from 'module'
10
10
  const require = createRequire(import.meta.url)
11
- const pdfParse = require('pdf-parse')
11
+ const _pdfParseModule = require('pdf-parse')
12
+ const pdfParse = typeof _pdfParseModule === 'function' ? _pdfParseModule : _pdfParseModule.default
12
13
  import AdmZip from 'adm-zip'
14
+ import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs'
13
15
  import OpenAI from 'openai'
14
16
  import { httpsAgent } from './tls.js'
15
17
  const { readFile: xlsxReadFile, utils: xlsxUtils } = xlsxPkg
@@ -232,14 +234,29 @@ export async function executeTool(name, args, opts = {}) {
232
234
  if (ext === '.pdf') {
233
235
  const buf = readFileSync(args.path)
234
236
 
235
- // Step 1: テキストPDFとして抽出を試みる
237
+ // Step 1: pdfjs-dist でテキスト抽出(最も信頼性が高い)
236
238
  try {
237
- const data = await pdfParse(buf)
238
- const text = data.text?.trim()
239
+ const doc = await pdfjsLib.getDocument({ data: new Uint8Array(buf) }).promise
240
+ const pages = []
241
+ for (let p = 1; p <= doc.numPages; p++) {
242
+ const page = await doc.getPage(p)
243
+ const content = await page.getTextContent()
244
+ pages.push(content.items.map(item => item.str).join(''))
245
+ }
246
+ const text = pages.join('\n').trim()
239
247
  if (text) return text
240
248
  } catch (_) {}
241
249
 
242
- // Step 2: 画像PDFとしてVision APIでOCR(ネイティブ依存なし)
250
+ // Step 2: pdf-parse でも試みる(フォールバック)
251
+ if (typeof pdfParse === 'function') {
252
+ try {
253
+ const data = await pdfParse(buf)
254
+ const text = data.text?.trim()
255
+ if (text) return text
256
+ } catch (_) {}
257
+ }
258
+
259
+ // Step 3: 画像PDFとしてVision APIでOCR(ネイティブ依存なし)
243
260
  if (!opts.apiKey) return '(テキストが抽出できませんでした)'
244
261
  const images = extractImagesFromPdf(buf)
245
262
  if (images.length === 0) return '(テキストも画像も抽出できませんでした)'