@helloxiaohu/plugin-mineru 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import { getErrorMessage } from '@xpert-ai/plugin-sdk';
3
3
  import axios from 'axios';
4
4
  import FormData from 'form-data';
5
5
  import { randomUUID } from 'crypto';
6
- import { basename } from 'path';
6
+ import { basename, isAbsolute, join as pathJoin } from 'path';
7
7
  import fs from 'fs';
8
8
  import { ENV_MINERU_API_BASE_URL, ENV_MINERU_API_TOKEN, ENV_MINERU_SERVER_TYPE, } from './types.js';
9
9
  const DEFAULT_OFFICIAL_BASE_URL = 'https://mineru.net/api/v4';
@@ -17,26 +17,13 @@ export class MinerUClient {
17
17
  this.logger = new Logger(MinerUClient.name);
18
18
  this.localTasks = new Map();
19
19
  const integration = this.permissions?.integration;
20
- // First, resolve credentials without depending on serverType
20
+ this.serverType = this.resolveServerType(integration);
21
21
  const { baseUrl, token } = this.resolveCredentials(integration);
22
- const maskedToken = token && token.length > 8
23
- ? `${token.slice(0, 4)}***${token.slice(-4)}`
24
- : token
25
- ? 'provided'
26
- : 'missing';
27
- this.logger.debug('[MinerU] MinerUClient credentials resolved', {
28
- hasIntegration: Boolean(integration),
29
- apiUrl: baseUrl,
30
- token: maskedToken,
31
- serverTypeFromUrl: this.resolveServerTypeFromUrl(baseUrl || '', integration),
32
- });
33
22
  if (!baseUrl) {
34
23
  throw new Error('MinerU base URL is required');
35
24
  }
36
25
  this.baseUrl = this.normalizeBaseUrl(baseUrl);
37
26
  this.token = token;
38
- // Automatically determine serverType from URL: official if it's the official URL, otherwise self-hosted
39
- this.serverType = this.resolveServerTypeFromUrl(this.baseUrl, integration);
40
27
  if (this.serverType === 'official' && !this.token) {
41
28
  throw new Error('MinerU official API requires an access token');
42
29
  }
@@ -159,13 +146,7 @@ export class MinerUClient {
159
146
  const start = Date.now();
160
147
  while (true) {
161
148
  const result = await this.getTaskResult(taskId);
162
- this.logger.debug('[MinerU] waiting task result', {
163
- taskId,
164
- hasZip: Boolean(result?.full_zip_url),
165
- hasUrl: Boolean(result?.full_url),
166
- hasContent: Boolean(result?.content),
167
- status: result?.status,
168
- });
149
+ this.logger.debug(`MinerU waiting task result: ${JSON.stringify(result)}`);
169
150
  if (result?.full_zip_url || result?.full_url || result?.content || result?.status === 'done') {
170
151
  return result;
171
152
  }
@@ -180,42 +161,34 @@ export class MinerUClient {
180
161
  throw new Error(`${feature} is only supported for official MinerU deployments`);
181
162
  }
182
163
  }
183
- /**
184
- * Automatically determine serverType from URL
185
- * Returns 'official' if URL is the official address (https://mineru.net/api/v4), otherwise 'self-hosted'
186
- */
187
- resolveServerTypeFromUrl(baseUrl, integration) {
188
- // Prefer explicitly specified serverType (backward compatibility)
164
+ resolveServerType(integration) {
189
165
  const integrationType = this.readIntegrationOptions(integration)?.serverType;
190
166
  if (integrationType === 'self-hosted' || integrationType === 'official') {
191
167
  return integrationType;
192
168
  }
193
- // Check environment variable (backward compatibility)
194
169
  const envValue = this.configService.get(ENV_MINERU_SERVER_TYPE)?.toLowerCase();
195
170
  if (envValue === 'self-hosted') {
196
171
  return 'self-hosted';
197
172
  }
198
- // Automatically determine from URL: if normalized URL matches official address, return 'official'
199
- const normalizedOfficialUrl = this.normalizeBaseUrl(DEFAULT_OFFICIAL_BASE_URL);
200
- const normalizedBaseUrl = this.normalizeBaseUrl(baseUrl);
201
- if (normalizedBaseUrl === normalizedOfficialUrl) {
202
- return 'official';
203
- }
204
- return 'self-hosted';
173
+ return 'official';
205
174
  }
206
175
  resolveCredentials(integration) {
207
176
  const options = this.readIntegrationOptions(integration);
208
177
  const baseUrlFromIntegration = options?.apiUrl;
209
178
  const tokenFromIntegration = options?.apiKey;
210
- // Read from environment variables (same keys for both official and self-hosted)
211
- const baseUrlFromEnv = this.configService.get(ENV_MINERU_API_BASE_URL);
212
- const tokenFromEnv = this.configService.get(ENV_MINERU_API_TOKEN);
213
- // Determine baseUrl: prefer integration config, then env, then default to official URL
179
+ const baseUrlEnvKey = this.serverType === 'self-hosted' ? ENV_MINERU_API_BASE_URL : ENV_MINERU_API_BASE_URL;
180
+ const tokenEnvKey = this.serverType === 'self-hosted' ? ENV_MINERU_API_TOKEN : ENV_MINERU_API_TOKEN;
181
+ const baseUrlFromEnv = this.configService.get(baseUrlEnvKey);
182
+ const tokenFromEnv = this.configService.get(tokenEnvKey);
214
183
  const baseUrl = baseUrlFromIntegration ||
215
184
  baseUrlFromEnv ||
216
- DEFAULT_OFFICIAL_BASE_URL;
217
- // Determine token: prefer integration config, then env
185
+ (this.serverType === 'official' ? DEFAULT_OFFICIAL_BASE_URL : undefined);
218
186
  const token = tokenFromIntegration || tokenFromEnv;
187
+ // Validate baseUrl is provided for self-hosted mode
188
+ if (this.serverType === 'self-hosted' && !baseUrl) {
189
+ throw new Error('MinerU self-hosted mode requires apiUrl to be configured in integration options or ' +
190
+ `${ENV_MINERU_API_BASE_URL} environment variable`);
191
+ }
219
192
  return { baseUrl, token };
220
193
  }
221
194
  readIntegrationOptions(integration) {
@@ -270,11 +243,6 @@ export class MinerUClient {
270
243
  if (options.seed)
271
244
  body.seed = options.seed;
272
245
  try {
273
- this.logger.debug('[MinerU] createOfficialTask request', {
274
- url,
275
- body,
276
- hasAuthHeader: Boolean(this.getOfficialHeaders().Authorization),
277
- });
278
246
  const resp = await axios.post(url, body, { headers: this.getOfficialHeaders() });
279
247
  const data = resp.data;
280
248
  if (data.code !== 0) {
@@ -288,18 +256,141 @@ export class MinerUClient {
288
256
  }
289
257
  }
290
258
  async createSelfHostedTask(options) {
291
- const filePath = this.fileSystem.fullPath(options.filePath);
259
+ // Validate fileSystem is available for self-hosted mode
260
+ if (!this.fileSystem) {
261
+ throw new Error('MinerU self-hosted mode requires fileSystem permission');
262
+ }
263
+ // Validate filePath is provided
264
+ if (!options.filePath) {
265
+ throw new Error('MinerU self-hosted mode requires filePath to be provided');
266
+ }
267
+ // Resolve absolute file path
268
+ // Log original filePath for debugging
269
+ const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
270
+ this.logger.debug(`Resolving file path. Original filePath: ${options.filePath}, basePath: ${basePath}`);
271
+ // Check if filePath is already an absolute path
272
+ const isAbsolutePath = isAbsolute(options.filePath);
273
+ // Also check if it looks like a full path even without leading slash
274
+ const looksLikeFullPath = !isAbsolutePath && (options.filePath.startsWith('Users/') ||
275
+ options.filePath.startsWith('home/'));
276
+ let filePath;
277
+ if (isAbsolutePath) {
278
+ // Use absolute path directly
279
+ filePath = options.filePath;
280
+ this.logger.debug(`Using absolute path directly: ${filePath}`);
281
+ }
282
+ else if (looksLikeFullPath) {
283
+ // If it looks like a full path but doesn't start with /, add it
284
+ filePath = options.filePath.startsWith('/') ? options.filePath : '/' + options.filePath;
285
+ this.logger.debug(`Detected full path pattern, normalized to: ${filePath}`);
286
+ }
287
+ else {
288
+ // Use xpFileSystem.fullPath() to resolve relative path to absolute path
289
+ filePath = this.fileSystem.fullPath(options.filePath);
290
+ this.logger.debug(`Resolved relative path using basePath: ${filePath}`);
291
+ }
292
+ // Validate file exists and is readable before attempting to parse
293
+ try {
294
+ await fs.promises.access(filePath, fs.constants.F_OK | fs.constants.R_OK);
295
+ const stats = await fs.promises.stat(filePath);
296
+ this.logger.debug(`Processing file: ${filePath}, size: ${stats.size} bytes`);
297
+ if (stats.size === 0) {
298
+ throw new Error(`File is empty: ${filePath}`);
299
+ }
300
+ }
301
+ catch (error) {
302
+ // If file not found in the resolved path, try to find it in common alternative locations
303
+ // This handles two scenarios:
304
+ // 1. StorageFile: files/{tenantId}/filename -> apps/api/public/files/{tenantId}/filename (already tried above)
305
+ // 2. VolumeClient: folder/filename or filename -> ~/data/folder/filename or ~/data/filename
306
+ if (error instanceof Error && error.code === 'ENOENT') {
307
+ const homeDir = process.env.HOME || process.env.USERPROFILE;
308
+ const originalFilePath = options.filePath;
309
+ const fileName = basename(originalFilePath);
310
+ // Build alternative paths for VolumeClient storage
311
+ const alternativePaths = [];
312
+ // If original path contains directory separators, try both full path and just filename
313
+ if (originalFilePath.includes('/') || originalFilePath.includes('\\')) {
314
+ // Try full path in ~/data/
315
+ alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
316
+ // Try just filename in ~/data/ (for VolumeClient files stored directly in root)
317
+ alternativePaths.push(pathJoin(homeDir || '', 'data', fileName));
318
+ }
319
+ else {
320
+ // If original path is just a filename, try in ~/data/ root
321
+ alternativePaths.push(pathJoin(homeDir || '', 'data', originalFilePath));
322
+ }
323
+ // Also try in knowledge base specific paths if we can determine knowledgebaseId
324
+ // Note: We don't have direct access to knowledgebaseId here, but files might be in knowledges subdirectory
325
+ const resolvedPath = this.fileSystem.fullPath(originalFilePath);
326
+ if (resolvedPath.includes('apps/api/public')) {
327
+ // This looks like a StorageFile path, but file not found
328
+ // Try VolumeClient paths as fallback
329
+ this.logger.debug(`File not found in StorageFile path, trying VolumeClient paths...`);
330
+ }
331
+ let foundPath = null;
332
+ for (const altPath of alternativePaths) {
333
+ try {
334
+ await fs.promises.access(altPath, fs.constants.F_OK | fs.constants.R_OK);
335
+ const stats = await fs.promises.stat(altPath);
336
+ this.logger.debug(`Found file in alternative location: ${altPath}, size: ${stats.size} bytes`);
337
+ foundPath = altPath;
338
+ if (stats.size === 0) {
339
+ throw new Error(`File is empty: ${foundPath}`);
340
+ }
341
+ break; // File found, exit loop
342
+ }
343
+ catch (altError) {
344
+ // Continue to next alternative path
345
+ continue;
346
+ }
347
+ }
348
+ // If file found in alternative location, use it
349
+ if (foundPath) {
350
+ filePath = foundPath;
351
+ }
352
+ else {
353
+ // If still not found after trying alternatives, throw original error
354
+ const basePath = this.fileSystem ? this.fileSystem.basePath : 'N/A';
355
+ this.logger.error(`File not found or not readable. ` +
356
+ `Original path: ${originalFilePath}, ` +
357
+ `Resolved path: ${filePath}, ` +
358
+ `Base path: ${basePath}, ` +
359
+ `Tried alternative paths: ${alternativePaths.join(', ')}`, error instanceof Error ? error.stack : error);
360
+ throw new Error(`File not found or not readable: ${filePath}. ` +
361
+ `Original path: ${originalFilePath}, ` +
362
+ `Base path: ${basePath}. ` +
363
+ `Tried alternative locations: ${alternativePaths.join(', ')}`);
364
+ }
365
+ }
366
+ else if (error instanceof Error && error.message.includes('empty')) {
367
+ this.logger.error(`File is empty: ${filePath}`);
368
+ throw error;
369
+ }
370
+ else {
371
+ // Re-throw other errors
372
+ throw error;
373
+ }
374
+ }
292
375
  const taskId = randomUUID();
293
- const result = await this.invokeSelfHostedParse(filePath, options.fileName, options);
376
+ const result = await this.invokeSelfHostedParse(filePath, options.fileName || basename(filePath), options);
294
377
  this.localTasks.set(taskId, { ...result, sourceUrl: options.url });
295
378
  return { taskId };
296
379
  }
297
380
  async invokeSelfHostedParse(filePath, fileName, options) {
298
381
  const parseUrl = this.buildApiUrl('file_parse');
382
+ this.logger.debug(`Sending parse request to: ${parseUrl}, file: ${fileName}`);
299
383
  const form = new FormData();
300
- form.append('files', fs.createReadStream(filePath), {
301
- filename: fileName,
302
- });
384
+ // Create file read stream (file existence is already validated in createSelfHostedTask)
385
+ try {
386
+ form.append('files', fs.createReadStream(filePath), {
387
+ filename: fileName,
388
+ });
389
+ }
390
+ catch (error) {
391
+ this.logger.error(`Failed to create read stream for file: ${filePath}`, error instanceof Error ? error.stack : error);
392
+ throw new Error(`Failed to read file: ${filePath}. ${error instanceof Error ? error.message : String(error)}`);
393
+ }
303
394
  // form.append('files', fileBuffer, { filename: fileName, contentType: contentType || 'application/pdf' });
304
395
  form.append('parse_method', options.parseMethod ?? 'auto');
305
396
  form.append('return_md', 'true');
@@ -327,11 +418,27 @@ export class MinerUClient {
327
418
  return this.invokeSelfHostedParseV1(filePath, fileName, options);
328
419
  }
329
420
  if (response.status === 400) {
330
- throw new BadRequestException(`MinerU self-hosted parse failed: ${response.status} ${getErrorMessage(response.data)}`);
421
+ const errorMessage = getErrorMessage(response.data);
422
+ this.logger.error(`MinerU self-hosted parse failed with 400: ${errorMessage}`, JSON.stringify(response.data));
423
+ throw new BadRequestException(`MinerU self-hosted parse failed: ${response.status} ${errorMessage}`);
331
424
  }
332
425
  if (response.status !== 200) {
333
- console.error(response.data);
334
- throw new Error(`MinerU self-hosted parse failed: ${response.status} ${response.statusText}`);
426
+ const errorMessage = getErrorMessage(response.data) || response.statusText;
427
+ const errorDetails = typeof response.data === 'object' ? JSON.stringify(response.data) : String(response.data);
428
+ this.logger.error(`MinerU self-hosted parse failed with ${response.status}: ${errorMessage}`, `Request URL: ${parseUrl}, File: ${fileName}, Details: ${errorDetails}`);
429
+ // Provide more helpful error message for common issues
430
+ let userFriendlyMessage = `MinerU self-hosted parse failed: ${response.status} ${response.statusText}`;
431
+ if (errorMessage) {
432
+ userFriendlyMessage += `. ${errorMessage}`;
433
+ }
434
+ // Check for specific error patterns
435
+ if (errorMessage && errorMessage.includes('0 active models')) {
436
+ userFriendlyMessage += ' Please ensure MinerU service has active models configured.';
437
+ }
438
+ else if (errorMessage && errorMessage.includes('NoneType')) {
439
+ userFriendlyMessage += ' This may indicate a configuration issue with the MinerU service.';
440
+ }
441
+ throw new Error(userFriendlyMessage);
335
442
  }
336
443
  return this.normalizeSelfHostedResponse(response.data);
337
444
  }
@@ -360,7 +467,9 @@ export class MinerUClient {
360
467
  validateStatus: () => true,
361
468
  });
362
469
  if (response.status !== 200) {
363
- throw new Error(`MinerU self-hosted legacy parse failed: ${response.status} ${response.statusText}`);
470
+ const errorMessage = getErrorMessage(response.data) || response.statusText;
471
+ this.logger.error(`MinerU self-hosted legacy parse failed with ${response.status}: ${errorMessage}`, JSON.stringify(response.data));
472
+ throw new Error(`MinerU self-hosted legacy parse failed: ${response.status} ${response.statusText}. ${errorMessage}`);
364
473
  }
365
474
  return this.normalizeSelfHostedResponse(response.data);
366
475
  }
@@ -1 +1 @@
1
- {"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAQ/F,qBAkBa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAExE,OAAO,CAAC,UAAU,CAAQ;IAE1B;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKvC"}
1
+ {"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAO/F,qBAiBa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAExE,OAAO,CAAC,UAAU,CAAQ;IAE1B;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKvC"}
@@ -7,7 +7,6 @@ import { MinerUTransformerStrategy } from './transformer-mineru.strategy.js';
7
7
  import { MinerUResultParserService } from './result-parser.service.js';
8
8
  import { MinerUIntegrationStrategy } from './integration.strategy.js';
9
9
  import { MinerUController } from './mineru.controller.js';
10
- import { MinerUToolsetStrategy } from './mineru-toolset.strategy.js';
11
10
  let MinerUPlugin = MinerUPlugin_1 = class MinerUPlugin {
12
11
  constructor() {
13
12
  // We disable by default additional logging for each event to avoid cluttering the logs
@@ -42,7 +41,6 @@ MinerUPlugin = MinerUPlugin_1 = __decorate([
42
41
  MinerUIntegrationStrategy,
43
42
  MinerUTransformerStrategy,
44
43
  MinerUResultParserService,
45
- MinerUToolsetStrategy,
46
44
  ],
47
45
  controllers: [
48
46
  MinerUController
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
4
4
  import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
5
5
  export declare class MinerUResultParserService {
6
6
  private readonly logger;
7
- parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
7
+ parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
8
8
  id?: string;
9
9
  chunks: Document<ChunkMetadata>[];
10
10
  metadata: MinerUDocumentMetadata;
11
11
  }>;
12
- parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
12
+ parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
13
13
  id?: string;
14
14
  chunks: Document<ChunkMetadata>[];
15
15
  metadata: MinerUDocumentMetadata;
@@ -1 +1 @@
1
- {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAqGI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CA4DH"}
1
+ {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAsFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
@@ -21,6 +21,7 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
21
21
  const metadata = {
22
22
  parser: MinerU,
23
23
  taskId,
24
+ fullZipUrl,
24
25
  };
25
26
  // 2. Unzip the file
26
27
  const zipEntries = [];
@@ -36,61 +37,43 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
36
37
  zipEntries.push({ entryName: entry.path, data });
37
38
  const fileName = entry.path;
38
39
  const filePath = join(document.folder || '', entry.path);
39
- // If platform didn't provide filesystem permission, still parse markdown but skip persisting files.
40
- // This avoids runtime crashes like: "Cannot read properties of undefined (reading 'writeFile')".
41
- if (fileSystem) {
42
- const url = await fileSystem.writeFile(filePath, data);
43
- pathMap.set(fileName, url);
44
- // Write images to local file system
45
- if (fileName.startsWith('images/')) {
46
- assets.push({
47
- type: 'image',
48
- url: url,
49
- filePath: filePath,
50
- });
51
- }
52
- else if (fileName.endsWith('layout.json')) {
53
- layoutJson = JSON.parse(data.toString('utf-8'));
54
- metadata.mineruBackend = layoutJson?._backend;
55
- metadata.mineruVersion = layoutJson?._version_name;
56
- assets.push({
57
- type: 'file',
58
- url,
59
- filePath: filePath,
60
- });
61
- }
62
- else if (fileName.endsWith('content_list.json')) {
63
- assets.push({
64
- type: 'file',
65
- url,
66
- filePath: filePath,
67
- });
68
- }
69
- else if (fileName.endsWith('full.md')) {
70
- fullMd = data.toString('utf-8');
71
- assets.push({
72
- type: 'file',
73
- url,
74
- filePath: filePath,
75
- });
76
- }
77
- else if (fileName.endsWith('origin.pdf')) {
78
- metadata.originPdfUrl = fileName;
79
- }
40
+ const url = await fileSystem.writeFile(filePath, data);
41
+ pathMap.set(fileName, url);
42
+ // Write images to local file system
43
+ if (fileName.startsWith('images/')) {
44
+ assets.push({
45
+ type: 'image',
46
+ url: url,
47
+ filePath: filePath,
48
+ });
49
+ }
50
+ else if (fileName.endsWith('layout.json')) {
51
+ layoutJson = JSON.parse(data.toString('utf-8'));
52
+ metadata.mineruBackend = layoutJson?._backend;
53
+ metadata.mineruVersion = layoutJson?._version_name;
54
+ assets.push({
55
+ type: 'file',
56
+ url,
57
+ filePath: filePath,
58
+ });
59
+ }
60
+ else if (fileName.endsWith('content_list.json')) {
61
+ assets.push({
62
+ type: 'file',
63
+ url,
64
+ filePath: filePath,
65
+ });
66
+ }
67
+ else if (fileName.endsWith('full.md')) {
68
+ fullMd = data.toString('utf-8');
69
+ assets.push({
70
+ type: 'file',
71
+ url,
72
+ filePath: filePath,
73
+ });
80
74
  }
81
- else {
82
- // Still extract key metadata & markdown without writing to filesystem
83
- if (fileName.endsWith('layout.json')) {
84
- layoutJson = JSON.parse(data.toString('utf-8'));
85
- metadata.mineruBackend = layoutJson?._backend;
86
- metadata.mineruVersion = layoutJson?._version_name;
87
- }
88
- else if (fileName.endsWith('full.md')) {
89
- fullMd = data.toString('utf-8');
90
- }
91
- else if (fileName.endsWith('origin.pdf')) {
92
- metadata.originPdfUrl = fileName;
93
- }
75
+ else if (fileName.endsWith('origin.pdf')) {
76
+ metadata.originPdfUrl = fileName;
94
77
  }
95
78
  }
96
79
  metadata.assets = assets;
@@ -119,24 +102,13 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
119
102
  const pathMap = new Map();
120
103
  for (const image of result.images) {
121
104
  const filePath = join(document.folder || '', 'images', image.name);
122
- if (fileSystem) {
123
- const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
124
- pathMap.set(`images/${image.name}`, url);
125
- assets.push({
126
- type: 'image',
127
- url: url,
128
- filePath: filePath,
129
- });
130
- }
131
- else {
132
- // Fallback: keep images as data URLs so markdown can still render without filesystem permission
133
- pathMap.set(`images/${image.name}`, image.dataUrl);
134
- assets.push({
135
- type: 'image',
136
- url: image.dataUrl,
137
- filePath: filePath,
138
- });
139
- }
105
+ const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
106
+ pathMap.set(`images/${image.name}`, url);
107
+ assets.push({
108
+ type: 'image',
109
+ url: url,
110
+ filePath: filePath,
111
+ });
140
112
  }
141
113
  if (result.sourceUrl) {
142
114
  assets.push({
@@ -85,6 +85,17 @@ export declare class MinerUTransformerStrategy implements IDocumentTransformerSt
85
85
  enum: string[];
86
86
  default: string;
87
87
  };
88
+ pageRanges: {
89
+ type: string;
90
+ title: {
91
+ en_US: string;
92
+ zh_Hans: string;
93
+ };
94
+ description: {
95
+ en_US: string;
96
+ zh_Hans: string;
97
+ };
98
+ };
88
99
  };
89
100
  required: any[];
90
101
  };
@@ -1 +1 @@
1
- {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA8C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAEjG,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAsDzD"}
1
+ {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA0C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAE7F,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAmF1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAiEzD"}
@@ -5,13 +5,13 @@ import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
5
5
  import { isNil, omitBy, pick } from 'lodash-es';
6
6
  import { MinerUClient } from './mineru.client.js';
7
7
  import { MinerUResultParserService } from './result-parser.service.js';
8
- import { icon, MinerUIntegration, MinerUTransformer } from './types.js';
8
+ import { icon, MinerU } from './types.js';
9
9
  let MinerUTransformerStrategy = class MinerUTransformerStrategy {
10
10
  constructor() {
11
11
  this.permissions = [
12
12
  {
13
13
  type: 'integration',
14
- service: MinerUIntegration,
14
+ service: MinerU,
15
15
  description: 'Access to MinerU system integrations'
16
16
  },
17
17
  {
@@ -21,7 +21,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
21
21
  }
22
22
  ];
23
23
  this.meta = {
24
- name: MinerUTransformer,
24
+ name: MinerU,
25
25
  label: {
26
26
  en_US: 'MinerU',
27
27
  zh_Hans: 'MinerU'
@@ -99,6 +99,17 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
99
99
  },
100
100
  enum: ['pipeline', 'vlm'],
101
101
  default: 'pipeline'
102
+ },
103
+ pageRanges: {
104
+ type: 'string',
105
+ title: {
106
+ en_US: 'Page Ranges',
107
+ zh_Hans: '页码范围'
108
+ },
109
+ description: {
110
+ en_US: 'Page ranges like "2,4-6" or "2--2" (official API only).',
111
+ zh_Hans: '页码范围,例如 "2,4-6" 或 "2--2"(仅官方 API)。'
112
+ }
102
113
  }
103
114
  },
104
115
  required: []
@@ -111,6 +122,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
111
122
  async transformDocuments(documents, config) {
112
123
  const mineru = new MinerUClient(this.configService, config.permissions);
113
124
  const parsedResults = [];
125
+ const integrationOptions = config.permissions?.integration?.options;
114
126
  for await (const document of documents) {
115
127
  if (mineru.serverType === 'self-hosted') {
116
128
  const { taskId } = await mineru.createTask({
@@ -125,8 +137,12 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
125
137
  });
126
138
  const result = mineru.getSelfHostedTask(taskId);
127
139
  const parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, config.permissions.fileSystem);
128
- parsedResult.id = document.id;
129
- parsedResults.push(parsedResult);
140
+ // Convert parsedResult to IKnowledgeDocument format
141
+ parsedResults.push({
142
+ id: document.id,
143
+ chunks: parsedResult.chunks,
144
+ metadata: parsedResult.metadata
145
+ });
130
146
  }
131
147
  else {
132
148
  const { taskId } = await mineru.createTask({
@@ -136,13 +152,19 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
136
152
  enableTable: true,
137
153
  language: 'ch',
138
154
  modelVersion: 'vlm',
139
- ...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion']), isNil)
155
+ pageRanges: config.pageRanges,
156
+ extraFormats: integrationOptions?.extraFormats,
157
+ ...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion', 'pageRanges']), isNil)
140
158
  });
141
159
  // Waiting for completion
142
160
  const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
143
161
  const parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, config.permissions.fileSystem);
144
- parsedResult.id = document.id;
145
- parsedResults.push(parsedResult);
162
+ // Convert parsedResult to IKnowledgeDocument format
163
+ parsedResults.push({
164
+ id: document.id,
165
+ chunks: parsedResult.chunks,
166
+ metadata: parsedResult.metadata
167
+ });
146
168
  }
147
169
  }
148
170
  return parsedResults;
@@ -158,6 +180,6 @@ __decorate([
158
180
  ], MinerUTransformerStrategy.prototype, "configService", void 0);
159
181
  MinerUTransformerStrategy = __decorate([
160
182
  Injectable(),
161
- DocumentTransformerStrategy(MinerUTransformer)
183
+ DocumentTransformerStrategy(MinerU)
162
184
  ], MinerUTransformerStrategy);
163
185
  export { MinerUTransformerStrategy };