@helloxiaohu/plugin-mineru 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -258
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -8
- package/dist/lib/integration.strategy.d.ts.map +1 -1
- package/dist/lib/integration.strategy.js +20 -5
- package/dist/lib/mineru.client.d.ts +1 -5
- package/dist/lib/mineru.client.d.ts.map +1 -1
- package/dist/lib/mineru.client.js +165 -56
- package/dist/lib/mineru.plugin.d.ts.map +1 -1
- package/dist/lib/mineru.plugin.js +0 -2
- package/dist/lib/result-parser.service.d.ts +2 -2
- package/dist/lib/result-parser.service.d.ts.map +1 -1
- package/dist/lib/result-parser.service.js +44 -72
- package/dist/lib/transformer-mineru.strategy.d.ts +11 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -1
- package/dist/lib/transformer-mineru.strategy.js +31 -9
- package/dist/lib/types.d.ts +23 -40
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js +22 -35
- package/package.json +10 -18
- package/dist/lib/mineru-toolset.strategy.d.ts +0 -234
- package/dist/lib/mineru-toolset.strategy.d.ts.map +0 -1
- package/dist/lib/mineru-toolset.strategy.js +0 -306
- package/dist/lib/mineru.tool.d.ts +0 -35
- package/dist/lib/mineru.tool.d.ts.map +0 -1
- package/dist/lib/mineru.tool.js +0 -157
- package/dist/lib/mineru.toolset.d.ts +0 -50
- package/dist/lib/mineru.toolset.d.ts.map +0 -1
- package/dist/lib/mineru.toolset.js +0 -95
|
@@ -3,7 +3,7 @@ import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
|
3
3
|
import axios from 'axios';
|
|
4
4
|
import FormData from 'form-data';
|
|
5
5
|
import { randomUUID } from 'crypto';
|
|
6
|
-
import { basename } from 'path';
|
|
6
|
+
import { basename, isAbsolute, join as pathJoin } from 'path';
|
|
7
7
|
import fs from 'fs';
|
|
8
8
|
import { ENV_MINERU_API_BASE_URL, ENV_MINERU_API_TOKEN, ENV_MINERU_SERVER_TYPE, } from './types.js';
|
|
9
9
|
const DEFAULT_OFFICIAL_BASE_URL = 'https://mineru.net/api/v4';
|
|
@@ -17,26 +17,13 @@ export class MinerUClient {
|
|
|
17
17
|
this.logger = new Logger(MinerUClient.name);
|
|
18
18
|
this.localTasks = new Map();
|
|
19
19
|
const integration = this.permissions?.integration;
|
|
20
|
-
|
|
20
|
+
this.serverType = this.resolveServerType(integration);
|
|
21
21
|
const { baseUrl, token } = this.resolveCredentials(integration);
|
|
22
|
-
const maskedToken = token && token.length > 8
|
|
23
|
-
? `${token.slice(0, 4)}***${token.slice(-4)}`
|
|
24
|
-
: token
|
|
25
|
-
? 'provided'
|
|
26
|
-
: 'missing';
|
|
27
|
-
this.logger.debug('[MinerU] MinerUClient credentials resolved', {
|
|
28
|
-
hasIntegration: Boolean(integration),
|
|
29
|
-
apiUrl: baseUrl,
|
|
30
|
-
token: maskedToken,
|
|
31
|
-
serverTypeFromUrl: this.resolveServerTypeFromUrl(baseUrl || '', integration),
|
|
32
|
-
});
|
|
33
22
|
if (!baseUrl) {
|
|
34
23
|
throw new Error('MinerU base URL is required');
|
|
35
24
|
}
|
|
36
25
|
this.baseUrl = this.normalizeBaseUrl(baseUrl);
|
|
37
26
|
this.token = token;
|
|
38
|
-
// Automatically determine serverType from URL: official if it's the official URL, otherwise self-hosted
|
|
39
|
-
this.serverType = this.resolveServerTypeFromUrl(this.baseUrl, integration);
|
|
40
27
|
if (this.serverType === 'official' && !this.token) {
|
|
41
28
|
throw new Error('MinerU official API requires an access token');
|
|
42
29
|
}
|
|
@@ -159,13 +146,7 @@ export class MinerUClient {
|
|
|
159
146
|
const start = Date.now();
|
|
160
147
|
while (true) {
|
|
161
148
|
const result = await this.getTaskResult(taskId);
|
|
162
|
-
this.logger.debug(
|
|
163
|
-
taskId,
|
|
164
|
-
hasZip: Boolean(result?.full_zip_url),
|
|
165
|
-
hasUrl: Boolean(result?.full_url),
|
|
166
|
-
hasContent: Boolean(result?.content),
|
|
167
|
-
status: result?.status,
|
|
168
|
-
});
|
|
149
|
+
this.logger.debug(`MinerU waiting task result: ${JSON.stringify(result)}`);
|
|
169
150
|
if (result?.full_zip_url || result?.full_url || result?.content || result?.status === 'done') {
|
|
170
151
|
return result;
|
|
171
152
|
}
|
|
@@ -180,42 +161,34 @@ export class MinerUClient {
|
|
|
180
161
|
throw new Error(`${feature} is only supported for official MinerU deployments`);
|
|
181
162
|
}
|
|
182
163
|
}
|
|
183
|
-
|
|
184
|
-
* Automatically determine serverType from URL
|
|
185
|
-
* Returns 'official' if URL is the official address (https://mineru.net/api/v4), otherwise 'self-hosted'
|
|
186
|
-
*/
|
|
187
|
-
resolveServerTypeFromUrl(baseUrl, integration) {
|
|
188
|
-
// Prefer explicitly specified serverType (backward compatibility)
|
|
164
|
+
resolveServerType(integration) {
|
|
189
165
|
const integrationType = this.readIntegrationOptions(integration)?.serverType;
|
|
190
166
|
if (integrationType === 'self-hosted' || integrationType === 'official') {
|
|
191
167
|
return integrationType;
|
|
192
168
|
}
|
|
193
|
-
// Check environment variable (backward compatibility)
|
|
194
169
|
const envValue = this.configService.get(ENV_MINERU_SERVER_TYPE)?.toLowerCase();
|
|
195
170
|
if (envValue === 'self-hosted') {
|
|
196
171
|
return 'self-hosted';
|
|
197
172
|
}
|
|
198
|
-
|
|
199
|
-
const normalizedOfficialUrl = this.normalizeBaseUrl(DEFAULT_OFFICIAL_BASE_URL);
|
|
200
|
-
const normalizedBaseUrl = this.normalizeBaseUrl(baseUrl);
|
|
201
|
-
if (normalizedBaseUrl === normalizedOfficialUrl) {
|
|
202
|
-
return 'official';
|
|
203
|
-
}
|
|
204
|
-
return 'self-hosted';
|
|
173
|
+
return 'official';
|
|
205
174
|
}
|
|
206
175
|
resolveCredentials(integration) {
|
|
207
176
|
const options = this.readIntegrationOptions(integration);
|
|
208
177
|
const baseUrlFromIntegration = options?.apiUrl;
|
|
209
178
|
const tokenFromIntegration = options?.apiKey;
|
|
210
|
-
|
|
211
|
-
const
|
|
212
|
-
const
|
|
213
|
-
|
|
179
|
+
const baseUrlEnvKey = this.serverType === 'self-hosted' ? ENV_MINERU_API_BASE_URL : ENV_MINERU_API_BASE_URL;
|
|
180
|
+
const tokenEnvKey = this.serverType === 'self-hosted' ? ENV_MINERU_API_TOKEN : ENV_MINERU_API_TOKEN;
|
|
181
|
+
const baseUrlFromEnv = this.configService.get(baseUrlEnvKey);
|
|
182
|
+
const tokenFromEnv = this.configService.get(tokenEnvKey);
|
|
214
183
|
const baseUrl = baseUrlFromIntegration ||
|
|
215
184
|
baseUrlFromEnv ||
|
|
216
|
-
DEFAULT_OFFICIAL_BASE_URL;
|
|
217
|
-
// Determine token: prefer integration config, then env
|
|
185
|
+
(this.serverType === 'official' ? DEFAULT_OFFICIAL_BASE_URL : undefined);
|
|
218
186
|
const token = tokenFromIntegration || tokenFromEnv;
|
|
187
|
+
// Validate baseUrl is provided for self-hosted mode
|
|
188
|
+
if (this.serverType === 'self-hosted' && !baseUrl) {
|
|
189
|
+
throw new Error('MinerU self-hosted mode requires apiUrl to be configured in integration options or ' +
|
|
190
|
+
`${ENV_MINERU_API_BASE_URL} environment variable`);
|
|
191
|
+
}
|
|
219
192
|
return { baseUrl, token };
|
|
220
193
|
}
|
|
221
194
|
readIntegrationOptions(integration) {
|
|
@@ -270,11 +243,6 @@ export class MinerUClient {
|
|
|
270
243
|
if (options.seed)
|
|
271
244
|
body.seed = options.seed;
|
|
272
245
|
try {
|
|
273
|
-
this.logger.debug('[MinerU] createOfficialTask request', {
|
|
274
|
-
url,
|
|
275
|
-
body,
|
|
276
|
-
hasAuthHeader: Boolean(this.getOfficialHeaders().Authorization),
|
|
277
|
-
});
|
|
278
246
|
const resp = await axios.post(url, body, { headers: this.getOfficialHeaders() });
|
|
279
247
|
const data = resp.data;
|
|
280
248
|
if (data.code !== 0) {
|
|
@@ -288,18 +256,141 @@ export class MinerUClient {
|
|
|
288
256
|
}
|
|
289
257
|
}
|
|
290
258
|
async createSelfHostedTask(options) {
|
|
291
|
-
|
|
259
|
+
// Validate fileSystem is available for self-hosted mode
|
|
260
|
+
if (!this.fileSystem) {
|
|
261
|
+
throw new Error('MinerU self-hosted mode requires fileSystem permission');
|
|
262
|
+
}
|
|
263
|
+
// Validate filePath is provided
|
|
264
|
+
if (!options.filePath) {
|
|
265
|
+
throw new Error('MinerU self-hosted mode requires filePath to be provided');
|
|
266
|
+
}
|
|
267
|
+
// Resolve absolute file path
|
|
268
|
+
// Log original filePath for debugging
|
|
269
|
+
const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
|
|
270
|
+
this.logger.debug(`Resolving file path. Original filePath: ${options.filePath}, basePath: ${basePath}`);
|
|
271
|
+
// Check if filePath is already an absolute path
|
|
272
|
+
const isAbsolutePath = isAbsolute(options.filePath);
|
|
273
|
+
// Also check if it looks like a full path even without leading slash
|
|
274
|
+
const looksLikeFullPath = !isAbsolutePath && (options.filePath.startsWith('Users/') ||
|
|
275
|
+
options.filePath.startsWith('home/'));
|
|
276
|
+
let filePath;
|
|
277
|
+
if (isAbsolutePath) {
|
|
278
|
+
// Use absolute path directly
|
|
279
|
+
filePath = options.filePath;
|
|
280
|
+
this.logger.debug(`Using absolute path directly: ${filePath}`);
|
|
281
|
+
}
|
|
282
|
+
else if (looksLikeFullPath) {
|
|
283
|
+
// If it looks like a full path but doesn't start with /, add it
|
|
284
|
+
filePath = options.filePath.startsWith('/') ? options.filePath : '/' + options.filePath;
|
|
285
|
+
this.logger.debug(`Detected full path pattern, normalized to: ${filePath}`);
|
|
286
|
+
}
|
|
287
|
+
else {
|
|
288
|
+
// Use xpFileSystem.fullPath() to resolve relative path to absolute path
|
|
289
|
+
filePath = this.fileSystem.fullPath(options.filePath);
|
|
290
|
+
this.logger.debug(`Resolved relative path using basePath: ${filePath}`);
|
|
291
|
+
}
|
|
292
|
+
// Validate file exists and is readable before attempting to parse
|
|
293
|
+
try {
|
|
294
|
+
await fs.promises.access(filePath, fs.constants.F_OK | fs.constants.R_OK);
|
|
295
|
+
const stats = await fs.promises.stat(filePath);
|
|
296
|
+
this.logger.debug(`Processing file: ${filePath}, size: ${stats.size} bytes`);
|
|
297
|
+
if (stats.size === 0) {
|
|
298
|
+
throw new Error(`File is empty: ${filePath}`);
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
catch (error) {
|
|
302
|
+
// If file not found in the resolved path, try to find it in common alternative locations
|
|
303
|
+
// This handles two scenarios:
|
|
304
|
+
// 1. StorageFile: files/{tenantId}/filename -> apps/api/public/files/{tenantId}/filename (already tried above)
|
|
305
|
+
// 2. VolumeClient: folder/filename or filename -> ~/data/folder/filename or ~/data/filename
|
|
306
|
+
if (error instanceof Error && error.code === 'ENOENT') {
|
|
307
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE;
|
|
308
|
+
const originalFilePath = options.filePath;
|
|
309
|
+
const fileName = basename(originalFilePath);
|
|
310
|
+
// Build alternative paths for VolumeClient storage
|
|
311
|
+
const alternativePaths = [];
|
|
312
|
+
// If original path contains directory separators, try both full path and just filename
|
|
313
|
+
if (originalFilePath.includes('/') || originalFilePath.includes('\\')) {
|
|
314
|
+
// Try full path in ~/data/
|
|
315
|
+
alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
|
|
316
|
+
// Try just filename in ~/data/ (for VolumeClient files stored directly in root)
|
|
317
|
+
alternativePaths.push(pathJoin(homeDir || '', 'data', fileName));
|
|
318
|
+
}
|
|
319
|
+
else {
|
|
320
|
+
// If original path is just a filename, try in ~/data/ root
|
|
321
|
+
alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
|
|
322
|
+
}
|
|
323
|
+
// Also try in knowledge base specific paths if we can determine knowledgebaseId
|
|
324
|
+
// Note: We don't have direct access to knowledgebaseId here, but files might be in knowledges subdirectory
|
|
325
|
+
const resolvedPath = this.fileSystem.fullPath(originalFilePath);
|
|
326
|
+
if (resolvedPath.includes('apps/api/public')) {
|
|
327
|
+
// This looks like a StorageFile path, but file not found
|
|
328
|
+
// Try VolumeClient paths as fallback
|
|
329
|
+
this.logger.debug(`File not found in StorageFile path, trying VolumeClient paths...`);
|
|
330
|
+
}
|
|
331
|
+
let foundPath = null;
|
|
332
|
+
for (const altPath of alternativePaths) {
|
|
333
|
+
try {
|
|
334
|
+
await fs.promises.access(altPath, fs.constants.F_OK | fs.constants.R_OK);
|
|
335
|
+
const stats = await fs.promises.stat(altPath);
|
|
336
|
+
this.logger.debug(`Found file in alternative location: ${altPath}, size: ${stats.size} bytes`);
|
|
337
|
+
foundPath = altPath;
|
|
338
|
+
if (stats.size === 0) {
|
|
339
|
+
throw new Error(`File is empty: ${foundPath}`);
|
|
340
|
+
}
|
|
341
|
+
break; // File found, exit loop
|
|
342
|
+
}
|
|
343
|
+
catch (altError) {
|
|
344
|
+
// Continue to next alternative path
|
|
345
|
+
continue;
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
// If file found in alternative location, use it
|
|
349
|
+
if (foundPath) {
|
|
350
|
+
filePath = foundPath;
|
|
351
|
+
}
|
|
352
|
+
else {
|
|
353
|
+
// If still not found after trying alternatives, throw original error
|
|
354
|
+
const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
|
|
355
|
+
this.logger.error(`File not found or not readable. ` +
|
|
356
|
+
`Original path: ${originalFilePath}, ` +
|
|
357
|
+
`Resolved path: ${filePath}, ` +
|
|
358
|
+
`Base path: ${basePath}, ` +
|
|
359
|
+
`Tried alternative paths: ${alternativePaths.join(', ')}`, error instanceof Error ? error.stack : error);
|
|
360
|
+
throw new Error(`File not found or not readable: ${filePath}. ` +
|
|
361
|
+
`Original path: ${originalFilePath}, ` +
|
|
362
|
+
`Base path: ${basePath}. ` +
|
|
363
|
+
`Tried alternative locations: ${alternativePaths.join(', ')}`);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
else if (error instanceof Error && error.message.includes('empty')) {
|
|
367
|
+
this.logger.error(`File is empty: ${filePath}`);
|
|
368
|
+
throw error;
|
|
369
|
+
}
|
|
370
|
+
else {
|
|
371
|
+
// Re-throw other errors
|
|
372
|
+
throw error;
|
|
373
|
+
}
|
|
374
|
+
}
|
|
292
375
|
const taskId = randomUUID();
|
|
293
|
-
const result = await this.invokeSelfHostedParse(filePath, options.fileName, options);
|
|
376
|
+
const result = await this.invokeSelfHostedParse(filePath, options.fileName || basename(filePath), options);
|
|
294
377
|
this.localTasks.set(taskId, { ...result, sourceUrl: options.url });
|
|
295
378
|
return { taskId };
|
|
296
379
|
}
|
|
297
380
|
async invokeSelfHostedParse(filePath, fileName, options) {
|
|
298
381
|
const parseUrl = this.buildApiUrl('file_parse');
|
|
382
|
+
this.logger.debug(`Sending parse request to: ${parseUrl}, file: ${fileName}`);
|
|
299
383
|
const form = new FormData();
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
384
|
+
// Create file read stream (file existence is already validated in createSelfHostedTask)
|
|
385
|
+
try {
|
|
386
|
+
form.append('files', fs.createReadStream(filePath), {
|
|
387
|
+
filename: fileName,
|
|
388
|
+
});
|
|
389
|
+
}
|
|
390
|
+
catch (error) {
|
|
391
|
+
this.logger.error(`Failed to create read stream for file: ${filePath}`, error instanceof Error ? error.stack : error);
|
|
392
|
+
throw new Error(`Failed to read file: ${filePath}. ${error instanceof Error ? error.message : String(error)}`);
|
|
393
|
+
}
|
|
303
394
|
// form.append('files', fileBuffer, { filename: fileName, contentType: contentType || 'application/pdf' });
|
|
304
395
|
form.append('parse_method', options.parseMethod ?? 'auto');
|
|
305
396
|
form.append('return_md', 'true');
|
|
@@ -327,11 +418,27 @@ export class MinerUClient {
|
|
|
327
418
|
return this.invokeSelfHostedParseV1(filePath, fileName, options);
|
|
328
419
|
}
|
|
329
420
|
if (response.status === 400) {
|
|
330
|
-
|
|
421
|
+
const errorMessage = getErrorMessage(response.data);
|
|
422
|
+
this.logger.error(`MinerU self-hosted parse failed with 400: ${errorMessage}`, JSON.stringify(response.data));
|
|
423
|
+
throw new BadRequestException(`MinerU self-hosted parse failed: ${response.status} ${errorMessage}`);
|
|
331
424
|
}
|
|
332
425
|
if (response.status !== 200) {
|
|
333
|
-
|
|
334
|
-
|
|
426
|
+
const errorMessage = getErrorMessage(response.data) || response.statusText;
|
|
427
|
+
const errorDetails = typeof response.data === 'object' ? JSON.stringify(response.data) : String(response.data);
|
|
428
|
+
this.logger.error(`MinerU self-hosted parse failed with ${response.status}: ${errorMessage}`, `Request URL: ${parseUrl}, File: ${fileName}, Details: ${errorDetails}`);
|
|
429
|
+
// Provide more helpful error message for common issues
|
|
430
|
+
let userFriendlyMessage = `MinerU self-hosted parse failed: ${response.status} ${response.statusText}`;
|
|
431
|
+
if (errorMessage) {
|
|
432
|
+
userFriendlyMessage += `. ${errorMessage}`;
|
|
433
|
+
}
|
|
434
|
+
// Check for specific error patterns
|
|
435
|
+
if (errorMessage && errorMessage.includes('0 active models')) {
|
|
436
|
+
userFriendlyMessage += ' Please ensure MinerU service has active models configured.';
|
|
437
|
+
}
|
|
438
|
+
else if (errorMessage && errorMessage.includes('NoneType')) {
|
|
439
|
+
userFriendlyMessage += ' This may indicate a configuration issue with the MinerU service.';
|
|
440
|
+
}
|
|
441
|
+
throw new Error(userFriendlyMessage);
|
|
335
442
|
}
|
|
336
443
|
return this.normalizeSelfHostedResponse(response.data);
|
|
337
444
|
}
|
|
@@ -360,7 +467,9 @@ export class MinerUClient {
|
|
|
360
467
|
validateStatus: () => true,
|
|
361
468
|
});
|
|
362
469
|
if (response.status !== 200) {
|
|
363
|
-
|
|
470
|
+
const errorMessage = getErrorMessage(response.data) || response.statusText;
|
|
471
|
+
this.logger.error(`MinerU self-hosted legacy parse failed with ${response.status}: ${errorMessage}`, JSON.stringify(response.data));
|
|
472
|
+
throw new Error(`MinerU self-hosted legacy parse failed: ${response.status} ${response.statusText}. ${errorMessage}`);
|
|
364
473
|
}
|
|
365
474
|
return this.normalizeSelfHostedResponse(response.data);
|
|
366
475
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;
|
|
1
|
+
{"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAO/F,qBAiBa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAExE,OAAO,CAAC,UAAU,CAAQ;IAE1B;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKvC"}
|
|
@@ -7,7 +7,6 @@ import { MinerUTransformerStrategy } from './transformer-mineru.strategy.js';
|
|
|
7
7
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
8
|
import { MinerUIntegrationStrategy } from './integration.strategy.js';
|
|
9
9
|
import { MinerUController } from './mineru.controller.js';
|
|
10
|
-
import { MinerUToolsetStrategy } from './mineru-toolset.strategy.js';
|
|
11
10
|
let MinerUPlugin = MinerUPlugin_1 = class MinerUPlugin {
|
|
12
11
|
constructor() {
|
|
13
12
|
// We disable by default additional logging for each event to avoid cluttering the logs
|
|
@@ -42,7 +41,6 @@ MinerUPlugin = MinerUPlugin_1 = __decorate([
|
|
|
42
41
|
MinerUIntegrationStrategy,
|
|
43
42
|
MinerUTransformerStrategy,
|
|
44
43
|
MinerUResultParserService,
|
|
45
|
-
MinerUToolsetStrategy,
|
|
46
44
|
],
|
|
47
45
|
controllers: [
|
|
48
46
|
MinerUController
|
|
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
|
4
4
|
import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
|
|
5
5
|
export declare class MinerUResultParserService {
|
|
6
6
|
private readonly logger;
|
|
7
|
-
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
7
|
+
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
|
|
8
8
|
id?: string;
|
|
9
9
|
chunks: Document<ChunkMetadata>[];
|
|
10
10
|
metadata: MinerUDocumentMetadata;
|
|
11
11
|
}>;
|
|
12
|
-
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
12
|
+
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
|
|
13
13
|
id?: string;
|
|
14
14
|
chunks: Document<ChunkMetadata>[];
|
|
15
15
|
metadata: MinerUDocumentMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,
|
|
1
|
+
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAsFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
|
|
@@ -21,6 +21,7 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
21
21
|
const metadata = {
|
|
22
22
|
parser: MinerU,
|
|
23
23
|
taskId,
|
|
24
|
+
fullZipUrl,
|
|
24
25
|
};
|
|
25
26
|
// 2. Unzip the file
|
|
26
27
|
const zipEntries = [];
|
|
@@ -36,61 +37,43 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
36
37
|
zipEntries.push({ entryName: entry.path, data });
|
|
37
38
|
const fileName = entry.path;
|
|
38
39
|
const filePath = join(document.folder || '', entry.path);
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
url,
|
|
74
|
-
filePath: filePath,
|
|
75
|
-
});
|
|
76
|
-
}
|
|
77
|
-
else if (fileName.endsWith('origin.pdf')) {
|
|
78
|
-
metadata.originPdfUrl = fileName;
|
|
79
|
-
}
|
|
40
|
+
const url = await fileSystem.writeFile(filePath, data);
|
|
41
|
+
pathMap.set(fileName, url);
|
|
42
|
+
// Write images to local file system
|
|
43
|
+
if (fileName.startsWith('images/')) {
|
|
44
|
+
assets.push({
|
|
45
|
+
type: 'image',
|
|
46
|
+
url: url,
|
|
47
|
+
filePath: filePath,
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
else if (fileName.endsWith('layout.json')) {
|
|
51
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
52
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
53
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
54
|
+
assets.push({
|
|
55
|
+
type: 'file',
|
|
56
|
+
url,
|
|
57
|
+
filePath: filePath,
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
else if (fileName.endsWith('content_list.json')) {
|
|
61
|
+
assets.push({
|
|
62
|
+
type: 'file',
|
|
63
|
+
url,
|
|
64
|
+
filePath: filePath,
|
|
65
|
+
});
|
|
66
|
+
}
|
|
67
|
+
else if (fileName.endsWith('full.md')) {
|
|
68
|
+
fullMd = data.toString('utf-8');
|
|
69
|
+
assets.push({
|
|
70
|
+
type: 'file',
|
|
71
|
+
url,
|
|
72
|
+
filePath: filePath,
|
|
73
|
+
});
|
|
80
74
|
}
|
|
81
|
-
else {
|
|
82
|
-
|
|
83
|
-
if (fileName.endsWith('layout.json')) {
|
|
84
|
-
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
85
|
-
metadata.mineruBackend = layoutJson?._backend;
|
|
86
|
-
metadata.mineruVersion = layoutJson?._version_name;
|
|
87
|
-
}
|
|
88
|
-
else if (fileName.endsWith('full.md')) {
|
|
89
|
-
fullMd = data.toString('utf-8');
|
|
90
|
-
}
|
|
91
|
-
else if (fileName.endsWith('origin.pdf')) {
|
|
92
|
-
metadata.originPdfUrl = fileName;
|
|
93
|
-
}
|
|
75
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
76
|
+
metadata.originPdfUrl = fileName;
|
|
94
77
|
}
|
|
95
78
|
}
|
|
96
79
|
metadata.assets = assets;
|
|
@@ -119,24 +102,13 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
119
102
|
const pathMap = new Map();
|
|
120
103
|
for (const image of result.images) {
|
|
121
104
|
const filePath = join(document.folder || '', 'images', image.name);
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
else {
|
|
132
|
-
// Fallback: keep images as data URLs so markdown can still render without filesystem permission
|
|
133
|
-
pathMap.set(`images/${image.name}`, image.dataUrl);
|
|
134
|
-
assets.push({
|
|
135
|
-
type: 'image',
|
|
136
|
-
url: image.dataUrl,
|
|
137
|
-
filePath: filePath,
|
|
138
|
-
});
|
|
139
|
-
}
|
|
105
|
+
const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
|
|
106
|
+
pathMap.set(`images/${image.name}`, url);
|
|
107
|
+
assets.push({
|
|
108
|
+
type: 'image',
|
|
109
|
+
url: url,
|
|
110
|
+
filePath: filePath,
|
|
111
|
+
});
|
|
140
112
|
}
|
|
141
113
|
if (result.sourceUrl) {
|
|
142
114
|
assets.push({
|
|
@@ -85,6 +85,17 @@ export declare class MinerUTransformerStrategy implements IDocumentTransformerSt
|
|
|
85
85
|
enum: string[];
|
|
86
86
|
default: string;
|
|
87
87
|
};
|
|
88
|
+
pageRanges: {
|
|
89
|
+
type: string;
|
|
90
|
+
title: {
|
|
91
|
+
en_US: string;
|
|
92
|
+
zh_Hans: string;
|
|
93
|
+
};
|
|
94
|
+
description: {
|
|
95
|
+
en_US: string;
|
|
96
|
+
zh_Hans: string;
|
|
97
|
+
};
|
|
98
|
+
};
|
|
88
99
|
};
|
|
89
100
|
required: any[];
|
|
90
101
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,
|
|
1
|
+
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA0C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAE7F,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAmF1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAiEzD"}
|
|
@@ -5,13 +5,13 @@ import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
|
|
|
5
5
|
import { isNil, omitBy, pick } from 'lodash-es';
|
|
6
6
|
import { MinerUClient } from './mineru.client.js';
|
|
7
7
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
|
-
import { icon,
|
|
8
|
+
import { icon, MinerU } from './types.js';
|
|
9
9
|
let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
10
10
|
constructor() {
|
|
11
11
|
this.permissions = [
|
|
12
12
|
{
|
|
13
13
|
type: 'integration',
|
|
14
|
-
service:
|
|
14
|
+
service: MinerU,
|
|
15
15
|
description: 'Access to MinerU system integrations'
|
|
16
16
|
},
|
|
17
17
|
{
|
|
@@ -21,7 +21,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
21
21
|
}
|
|
22
22
|
];
|
|
23
23
|
this.meta = {
|
|
24
|
-
name:
|
|
24
|
+
name: MinerU,
|
|
25
25
|
label: {
|
|
26
26
|
en_US: 'MinerU',
|
|
27
27
|
zh_Hans: 'MinerU'
|
|
@@ -99,6 +99,17 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
99
99
|
},
|
|
100
100
|
enum: ['pipeline', 'vlm'],
|
|
101
101
|
default: 'pipeline'
|
|
102
|
+
},
|
|
103
|
+
pageRanges: {
|
|
104
|
+
type: 'string',
|
|
105
|
+
title: {
|
|
106
|
+
en_US: 'Page Ranges',
|
|
107
|
+
zh_Hans: '页码范围'
|
|
108
|
+
},
|
|
109
|
+
description: {
|
|
110
|
+
en_US: 'Page ranges like "2,4-6" or "2--2" (official API only).',
|
|
111
|
+
zh_Hans: '页码范围,例如 "2,4-6" 或 "2--2"(仅官方 API)。'
|
|
112
|
+
}
|
|
102
113
|
}
|
|
103
114
|
},
|
|
104
115
|
required: []
|
|
@@ -111,6 +122,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
111
122
|
async transformDocuments(documents, config) {
|
|
112
123
|
const mineru = new MinerUClient(this.configService, config.permissions);
|
|
113
124
|
const parsedResults = [];
|
|
125
|
+
const integrationOptions = config.permissions?.integration?.options;
|
|
114
126
|
for await (const document of documents) {
|
|
115
127
|
if (mineru.serverType === 'self-hosted') {
|
|
116
128
|
const { taskId } = await mineru.createTask({
|
|
@@ -125,8 +137,12 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
125
137
|
});
|
|
126
138
|
const result = mineru.getSelfHostedTask(taskId);
|
|
127
139
|
const parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, config.permissions.fileSystem);
|
|
128
|
-
parsedResult
|
|
129
|
-
parsedResults.push(
|
|
140
|
+
// Convert parsedResult to IKnowledgeDocument format
|
|
141
|
+
parsedResults.push({
|
|
142
|
+
id: document.id,
|
|
143
|
+
chunks: parsedResult.chunks,
|
|
144
|
+
metadata: parsedResult.metadata
|
|
145
|
+
});
|
|
130
146
|
}
|
|
131
147
|
else {
|
|
132
148
|
const { taskId } = await mineru.createTask({
|
|
@@ -136,13 +152,19 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
136
152
|
enableTable: true,
|
|
137
153
|
language: 'ch',
|
|
138
154
|
modelVersion: 'vlm',
|
|
139
|
-
|
|
155
|
+
pageRanges: config.pageRanges,
|
|
156
|
+
extraFormats: integrationOptions?.extraFormats,
|
|
157
|
+
...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion', 'pageRanges']), isNil)
|
|
140
158
|
});
|
|
141
159
|
// Waiting for completion
|
|
142
160
|
const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
|
|
143
161
|
const parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, config.permissions.fileSystem);
|
|
144
|
-
parsedResult
|
|
145
|
-
parsedResults.push(
|
|
162
|
+
// Convert parsedResult to IKnowledgeDocument format
|
|
163
|
+
parsedResults.push({
|
|
164
|
+
id: document.id,
|
|
165
|
+
chunks: parsedResult.chunks,
|
|
166
|
+
metadata: parsedResult.metadata
|
|
167
|
+
});
|
|
146
168
|
}
|
|
147
169
|
}
|
|
148
170
|
return parsedResults;
|
|
@@ -158,6 +180,6 @@ __decorate([
|
|
|
158
180
|
], MinerUTransformerStrategy.prototype, "configService", void 0);
|
|
159
181
|
MinerUTransformerStrategy = __decorate([
|
|
160
182
|
Injectable(),
|
|
161
|
-
DocumentTransformerStrategy(
|
|
183
|
+
DocumentTransformerStrategy(MinerU)
|
|
162
184
|
], MinerUTransformerStrategy);
|
|
163
185
|
export { MinerUTransformerStrategy };
|