@chenchaolong/plugin-mineru 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.client.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEhD,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAc,EAAE,aAAa,EAAE,MAAM,OAAO,CAAC;AAK7C,OAAO,EAIL,wBAAwB,EAExB,0BAA0B,EAC1B,gBAAgB,EACjB,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"mineru.client.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEhD,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAc,EAAE,aAAa,EAAE,MAAM,OAAO,CAAC;AAK7C,OAAO,EAIL,wBAAwB,EAExB,0BAA0B,EAC1B,gBAAgB,EACjB,MAAM,YAAY,CAAC;AAIpB,UAAU,iBAAiB;IACzB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,mEAAmE;IACnE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,yEAAyE;IACzE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2EAA2E;IAC3E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,4EAA4E;IAC5E,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,UAAU,mBAAmB;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,sBAAsB;IAC9B,KAAK,EAAE,mBAAmB,EAAE,CAAC;IAC7B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,UAAU,iBAAiB;IACzB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AASD,qBAAa,YAAY;IAWrB,OAAO,CAAC,QAAQ,CAAC,aAAa;IAC9B,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC;IAX/B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAiC;IACxD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAS;IAChC,SAAgB,UAAU,EAAE,gBAAgB,CAAC;IAC7C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAiD;IAE5E,IAAI,UAAU,IAAI,YAAY,GAAG,SAAS,CAEzC;gBAEkB,aAAa,EAAE,aAAa,EAC5B,WAAW,CAAC,EAAE;QACvB,UAAU,CAAC,EAAE,YAAY,CAAC;QAC1B,WAAW,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,wBAAwB,CAAC,CAAC,CAAC;KACjE;IAkBP;;;OAGG;IACG,UAAU,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAYzE;;OAEG;IACG,eAAe,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAA;KAAE,CAAC;IAmCzG,iBAAiB,CAAC,MAAM,EAAE,MAAM,GAAG,0BAA0B,GAAG,SAAS;IAOzE;;OAEG;IACG,aAAa,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC;QACxE,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB,CAAC;IAoBF;;OAEG;IACG,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC;IAiBnD;;OAEG;IACG,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,SAAS,SAAgB,EAAE,UAAU,SAAO,GAAG,OAAO,CAAC,GAAG,CAAC;IAsB7F,OAAO,CAAC,cAAc;IAMtB,OAAO,CAAC,iBAAiB;IAczB,OAAO,CAAC,kBAAkB;IAiC1B,OAAO,CAAC,sBAAsB;IAI9B,OAAO,CAAC,gBAAgB;IAIxB,OAAO,CAAC,WAAW;IAQnB,OAAO,CAAC,kBAAkB;IAO1B,OAAO,CAAC,oBAAoB;YAYd,kBAAkB;YA4BlB,oBAAoB;YAwIpB,qBAAqB;YAyFrB,uBAAuB;IAsDrC,OAAO,CAAC,iBAAiB;IAgBzB,OAAO,CAAC,2BAA2B;IAenC,OAAO,CAAC,6BAA6B;IAcrC,OAAO,CAAC,iBAAiB;IAQzB,OAAO,CAAC,aAAa;IAcrB,OAAO,CAAC,iBAAiB;IAQzB,OAAO,CAAC,eAAe;YAIT,YAAY;IAkB1B,OAAO,CAAC,eAAe;IA0BvB,wBAAwB,IAAI,OAAO,CAAC,aAAa,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAKtD,wBAAwB;CAU/B"}
|
|
@@ -7,9 +7,6 @@ import { basename, isAbsolute, join as pathJoin } from 'path';
|
|
|
7
7
|
import fs from 'fs';
|
|
8
8
|
import { ENV_MINERU_API_BASE_URL, ENV_MINERU_API_TOKEN, ENV_MINERU_SERVER_TYPE, } from './types.js';
|
|
9
9
|
const DEFAULT_OFFICIAL_BASE_URL = 'https://mineru.net/api/v4';
|
|
10
|
-
// Default base URL for self-hosted MinerU deployments
|
|
11
|
-
// 自托管 MinerU 部署的默认基础 URL
|
|
12
|
-
const DEFAULT_SELF_HOSTED_BASE_URL = 'http://localhost:8000';
|
|
13
10
|
export class MinerUClient {
|
|
14
11
|
get fileSystem() {
|
|
15
12
|
return this.permissions?.fileSystem;
|
|
@@ -185,8 +182,13 @@ export class MinerUClient {
|
|
|
185
182
|
const tokenFromEnv = this.configService.get(tokenEnvKey);
|
|
186
183
|
const baseUrl = baseUrlFromIntegration ||
|
|
187
184
|
baseUrlFromEnv ||
|
|
188
|
-
(this.serverType === 'official' ? DEFAULT_OFFICIAL_BASE_URL :
|
|
185
|
+
(this.serverType === 'official' ? DEFAULT_OFFICIAL_BASE_URL : undefined);
|
|
189
186
|
const token = tokenFromIntegration || tokenFromEnv;
|
|
187
|
+
// Validate baseUrl is provided for self-hosted mode
|
|
188
|
+
if (this.serverType === 'self-hosted' && !baseUrl) {
|
|
189
|
+
throw new Error('MinerU self-hosted mode requires apiUrl to be configured in integration options or ' +
|
|
190
|
+
`${ENV_MINERU_API_BASE_URL} environment variable`);
|
|
191
|
+
}
|
|
190
192
|
return { baseUrl, token };
|
|
191
193
|
}
|
|
192
194
|
readIntegrationOptions(integration) {
|
|
@@ -255,53 +257,39 @@ export class MinerUClient {
|
|
|
255
257
|
}
|
|
256
258
|
async createSelfHostedTask(options) {
|
|
257
259
|
// Validate fileSystem is available for self-hosted mode
|
|
258
|
-
// 验证文件系统是否可用于自托管模式
|
|
259
260
|
if (!this.fileSystem) {
|
|
260
261
|
throw new Error('MinerU self-hosted mode requires fileSystem permission');
|
|
261
262
|
}
|
|
262
263
|
// Validate filePath is provided
|
|
263
|
-
// 验证是否提供了文件路径
|
|
264
264
|
if (!options.filePath) {
|
|
265
265
|
throw new Error('MinerU self-hosted mode requires filePath to be provided');
|
|
266
266
|
}
|
|
267
267
|
// Resolve absolute file path
|
|
268
|
-
// 解析绝对文件路径
|
|
269
268
|
// Log original filePath for debugging
|
|
270
|
-
// 记录原始 filePath 以便调试
|
|
271
269
|
const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
|
|
272
270
|
this.logger.debug(`Resolving file path. Original filePath: ${options.filePath}, basePath: ${basePath}`);
|
|
273
271
|
// Check if filePath is already an absolute path
|
|
274
|
-
// 检查 filePath 是否已经是绝对路径
|
|
275
272
|
const isAbsolutePath = isAbsolute(options.filePath);
|
|
276
273
|
// Also check if it looks like a full path even without leading slash
|
|
277
|
-
|
|
278
|
-
const looksLikeFullPath = !isAbsolutePath && (options.filePath.includes('/apps/api/public/') ||
|
|
279
|
-
options.filePath.includes('/public/files/') ||
|
|
280
|
-
options.filePath.includes('/项目/') ||
|
|
281
|
-
options.filePath.includes('/xpert-dev/') ||
|
|
282
|
-
options.filePath.startsWith('Users/') ||
|
|
274
|
+
const looksLikeFullPath = !isAbsolutePath && (options.filePath.startsWith('Users/') ||
|
|
283
275
|
options.filePath.startsWith('home/'));
|
|
284
276
|
let filePath;
|
|
285
277
|
if (isAbsolutePath) {
|
|
286
278
|
// Use absolute path directly
|
|
287
|
-
// 直接使用绝对路径
|
|
288
279
|
filePath = options.filePath;
|
|
289
280
|
this.logger.debug(`Using absolute path directly: ${filePath}`);
|
|
290
281
|
}
|
|
291
282
|
else if (looksLikeFullPath) {
|
|
292
283
|
// If it looks like a full path but doesn't start with /, add it
|
|
293
|
-
// 如果看起来像完整路径但没有以 / 开头,添加它
|
|
294
284
|
filePath = options.filePath.startsWith('/') ? options.filePath : '/' + options.filePath;
|
|
295
285
|
this.logger.debug(`Detected full path pattern, normalized to: ${filePath}`);
|
|
296
286
|
}
|
|
297
287
|
else {
|
|
298
288
|
// Use xpFileSystem.fullPath() to resolve relative path to absolute path
|
|
299
|
-
// 使用 xpFileSystem.fullPath() 将相对路径解析为绝对路径
|
|
300
289
|
filePath = this.fileSystem.fullPath(options.filePath);
|
|
301
290
|
this.logger.debug(`Resolved relative path using basePath: ${filePath}`);
|
|
302
291
|
}
|
|
303
292
|
// Validate file exists and is readable before attempting to parse
|
|
304
|
-
// 在尝试解析之前验证文件是否存在且可读
|
|
305
293
|
try {
|
|
306
294
|
await fs.promises.access(filePath, fs.constants.F_OK | fs.constants.R_OK);
|
|
307
295
|
const stats = await fs.promises.stat(filePath);
|
|
@@ -312,45 +300,32 @@ export class MinerUClient {
|
|
|
312
300
|
}
|
|
313
301
|
catch (error) {
|
|
314
302
|
// If file not found in the resolved path, try to find it in common alternative locations
|
|
315
|
-
// 如果文件在解析路径中未找到,尝试在常见的替代位置查找
|
|
316
303
|
// This handles two scenarios:
|
|
317
304
|
// 1. StorageFile: files/{tenantId}/filename -> apps/api/public/files/{tenantId}/filename (already tried above)
|
|
318
305
|
// 2. VolumeClient: folder/filename or filename -> ~/data/folder/filename or ~/data/filename
|
|
319
|
-
// 这处理两种情况:
|
|
320
|
-
// 1. StorageFile: files/{tenantId}/filename -> apps/api/public/files/{tenantId}/filename(上面已尝试)
|
|
321
|
-
// 2. VolumeClient: folder/filename 或 filename -> ~/data/folder/filename 或 ~/data/filename
|
|
322
306
|
if (error instanceof Error && error.code === 'ENOENT') {
|
|
323
307
|
const homeDir = process.env.HOME || process.env.USERPROFILE;
|
|
324
308
|
const originalFilePath = options.filePath;
|
|
325
309
|
const fileName = basename(originalFilePath);
|
|
326
310
|
// Build alternative paths for VolumeClient storage
|
|
327
|
-
// 构建 VolumeClient 存储的替代路径
|
|
328
311
|
const alternativePaths = [];
|
|
329
312
|
// If original path contains directory separators, try both full path and just filename
|
|
330
|
-
// 如果原始路径包含目录分隔符,尝试完整路径和仅文件名
|
|
331
313
|
if (originalFilePath.includes('/') || originalFilePath.includes('\\')) {
|
|
332
314
|
// Try full path in ~/data/
|
|
333
|
-
// 尝试 ~/data/ 下的完整路径
|
|
334
315
|
alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
|
|
335
316
|
// Try just filename in ~/data/ (for VolumeClient files stored directly in root)
|
|
336
|
-
// 尝试 ~/data/ 下的仅文件名(用于直接存储在根目录的 VolumeClient 文件)
|
|
337
317
|
alternativePaths.push(pathJoin(homeDir || '', 'data', fileName));
|
|
338
318
|
}
|
|
339
319
|
else {
|
|
340
320
|
// If original path is just a filename, try in ~/data/ root
|
|
341
|
-
// 如果原始路径只是文件名,尝试 ~/data/ 根目录
|
|
342
321
|
alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
|
|
343
322
|
}
|
|
344
323
|
// Also try in knowledge base specific paths if we can determine knowledgebaseId
|
|
345
|
-
// 如果能够确定 knowledgebaseId,也尝试知识库特定路径
|
|
346
324
|
// Note: We don't have direct access to knowledgebaseId here, but files might be in knowledges subdirectory
|
|
347
|
-
// 注意:这里无法直接访问 knowledgebaseId,但文件可能在 knowledges 子目录中
|
|
348
325
|
const resolvedPath = this.fileSystem.fullPath(originalFilePath);
|
|
349
326
|
if (resolvedPath.includes('apps/api/public')) {
|
|
350
327
|
// This looks like a StorageFile path, but file not found
|
|
351
328
|
// Try VolumeClient paths as fallback
|
|
352
|
-
// 这看起来像 StorageFile 路径,但文件未找到
|
|
353
|
-
// 尝试 VolumeClient 路径作为后备
|
|
354
329
|
this.logger.debug(`File not found in StorageFile path, trying VolumeClient paths...`);
|
|
355
330
|
}
|
|
356
331
|
let foundPath = null;
|
|
@@ -371,13 +346,11 @@ export class MinerUClient {
|
|
|
371
346
|
}
|
|
372
347
|
}
|
|
373
348
|
// If file found in alternative location, use it
|
|
374
|
-
// 如果在替代位置找到文件,使用它
|
|
375
349
|
if (foundPath) {
|
|
376
350
|
filePath = foundPath;
|
|
377
351
|
}
|
|
378
352
|
else {
|
|
379
353
|
// If still not found after trying alternatives, throw original error
|
|
380
|
-
// 如果尝试替代路径后仍未找到,抛出原始错误
|
|
381
354
|
const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
|
|
382
355
|
this.logger.error(`File not found or not readable. ` +
|
|
383
356
|
`Original path: ${originalFilePath}, ` +
|
|
@@ -396,7 +369,6 @@ export class MinerUClient {
|
|
|
396
369
|
}
|
|
397
370
|
else {
|
|
398
371
|
// Re-throw other errors
|
|
399
|
-
// 重新抛出其他错误
|
|
400
372
|
throw error;
|
|
401
373
|
}
|
|
402
374
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,
|
|
1
|
+
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAGb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EAEtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAAgB,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAInE,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAwFzD"}
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import { __decorate, __metadata } from "tslib";
|
|
2
2
|
import { forwardRef, Inject, Injectable } from '@nestjs/common';
|
|
3
3
|
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
|
|
4
|
+
import { DocumentTransformerStrategy, downloadRemoteFile, isRemoteFile, } from '@xpert-ai/plugin-sdk';
|
|
5
5
|
import { isNil, omitBy, pick } from 'lodash-es';
|
|
6
6
|
import { MinerUClient } from './mineru.client.js';
|
|
7
7
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
8
|
import { icon, MinerU } from './types.js';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import fsPromises from 'fs/promises';
|
|
9
11
|
let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
10
12
|
constructor() {
|
|
11
13
|
this.permissions = [
|
|
@@ -113,9 +115,30 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
113
115
|
const parsedResults = [];
|
|
114
116
|
for await (const document of documents) {
|
|
115
117
|
if (mineru.serverType === 'self-hosted') {
|
|
118
|
+
// Handle missing filePath: if filePath is not provided but fileUrl is available,
|
|
119
|
+
// download the file to a temporary directory
|
|
120
|
+
let filePath = document.filePath;
|
|
121
|
+
if (!filePath && document.fileUrl && isRemoteFile(document.fileUrl)) {
|
|
122
|
+
const tempDir = config.tempDir || '/tmp/';
|
|
123
|
+
const fileName = document.name || path.basename(document.fileUrl) || 'document.pdf';
|
|
124
|
+
const tempFilePath = path.join(tempDir, `mineru_${Date.now()}_${fileName}`);
|
|
125
|
+
// Ensure the temp directory exists
|
|
126
|
+
await fsPromises.mkdir(path.dirname(tempFilePath), { recursive: true });
|
|
127
|
+
// Download the remote file to a local temporary directory
|
|
128
|
+
filePath = await downloadRemoteFile(document.fileUrl, tempFilePath);
|
|
129
|
+
}
|
|
130
|
+
else if (!filePath && document.fileUrl && config.permissions?.fileSystem) {
|
|
131
|
+
// If fileUrl is not remote but filePath is missing, try to resolve using fileSystem
|
|
132
|
+
// This handles cases where fileUrl might be a relative path
|
|
133
|
+
filePath = config.permissions.fileSystem.fullPath(document.fileUrl);
|
|
134
|
+
}
|
|
135
|
+
if (!filePath) {
|
|
136
|
+
throw new Error('MinerU self-hosted mode requires filePath to be provided. ' +
|
|
137
|
+
'Either provide filePath in the document, or ensure fileUrl is a valid remote URL that can be downloaded.');
|
|
138
|
+
}
|
|
116
139
|
const { taskId } = await mineru.createTask({
|
|
117
140
|
url: document.fileUrl,
|
|
118
|
-
filePath:
|
|
141
|
+
filePath: filePath,
|
|
119
142
|
fileName: document.name,
|
|
120
143
|
isOcr: true,
|
|
121
144
|
enableFormula: true,
|