@helloxiaohu/plugin-mineru 0.1.0 → 0.1.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -3,28 +3,36 @@ import { getCurrentTaskInput } from '@langchain/langgraph';
3
3
  import { getErrorMessage } from '@xpert-ai/plugin-sdk';
4
4
  import { z } from 'zod';
5
5
  import { MinerUClient } from './mineru.client.js';
6
- import { MinerUIntegration } from './types.js';
7
- /**
8
- * Build MinerU PDF parser tool
9
- * This tool converts PDF files to markdown format using MinerU service
10
- */
6
+ import { MinerU } from './types.js';
7
+ function normalizeExtraFormats(value) {
8
+ if (!value) {
9
+ return undefined;
10
+ }
11
+ if (Array.isArray(value)) {
12
+ const formats = value.map((item) => String(item).trim()).filter(Boolean);
13
+ return formats.length ? formats : undefined;
14
+ }
15
+ const formats = value
16
+ .split(',')
17
+ .map((item) => item.trim())
18
+ .filter(Boolean);
19
+ return formats.length ? formats : undefined;
20
+ }
11
21
  export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
12
22
  return tool(async (input) => {
13
23
  try {
14
- const { doc_url } = input;
15
- // Log raw input (mask nothing sensitive here because doc_url is public; avoid logging other fields)
16
- console.debug('[MinerU] tool invoked with input', { doc_url, extraKeys: Object.keys(input || {}).filter((k) => k !== 'doc_url') });
24
+ const { doc_url, page_ranges } = input;
25
+ console.debug('[MinerU] tool invoked with input', {
26
+ doc_url,
27
+ extraKeys: Object.keys(input || {}).filter((key) => key !== 'doc_url'),
28
+ });
17
29
  if (!doc_url) {
18
30
  throw new Error('doc_url is required');
19
31
  }
20
- // Get workspace context from current task
21
32
  const currentState = getCurrentTaskInput();
22
33
  const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
23
- // Use configuration from authorization page (passed via options and defaults parameters)
24
- // These values come from the authorization page configuration and are set when the tool is created
25
34
  const finalApiUrl = options?.apiUrl || 'https://mineru.net/api/v4';
26
- const finalApiKey = options?.apiKey; // apiKey is required and validated in authorization page
27
- // Log effective config (mask key) to确认是否拿到了授权页的凭据
35
+ const finalApiKey = options?.apiKey;
28
36
  const maskedKey = finalApiKey && finalApiKey.length > 8
29
37
  ? `${finalApiKey.slice(0, 4)}***${finalApiKey.slice(-4)}`
30
38
  : finalApiKey
@@ -42,42 +50,45 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
42
50
  modelVersion: defaults?.modelVersion,
43
51
  },
44
52
  });
45
- // Use configuration values from authorization page (passed via defaults parameter)
46
- // Convert string enum values ('true'/'false') to boolean, or use boolean values directly
47
- // If undefined, default to true
48
53
  const finalIsOcr = defaults?.isOcr === undefined
49
54
  ? true
50
- : (typeof defaults.isOcr === 'string' ? defaults.isOcr === 'true' : defaults.isOcr === true);
55
+ : typeof defaults.isOcr === 'string'
56
+ ? defaults.isOcr === 'true'
57
+ : defaults.isOcr === true;
51
58
  const finalEnableFormula = defaults?.enableFormula === undefined
52
59
  ? true
53
- : (typeof defaults.enableFormula === 'string' ? defaults.enableFormula === 'true' : defaults.enableFormula === true);
60
+ : typeof defaults.enableFormula === 'string'
61
+ ? defaults.enableFormula === 'true'
62
+ : defaults.enableFormula === true;
54
63
  const finalEnableTable = defaults?.enableTable === undefined
55
64
  ? true
56
- : (typeof defaults.enableTable === 'string' ? defaults.enableTable === 'true' : defaults.enableTable === true);
65
+ : typeof defaults.enableTable === 'string'
66
+ ? defaults.enableTable === 'true'
67
+ : defaults.enableTable === true;
57
68
  const finalLanguage = defaults?.language || 'ch';
58
69
  const finalModelVersion = defaults?.modelVersion || 'pipeline';
70
+ const finalExtraFormats = normalizeExtraFormats(options?.extraFormats);
59
71
  const effectiveOptions = {
60
72
  apiUrl: finalApiUrl,
61
73
  apiKey: finalApiKey,
74
+ extraFormats: finalExtraFormats,
62
75
  };
63
76
  const integration = {
64
- provider: MinerUIntegration,
77
+ provider: MinerU,
65
78
  options: effectiveOptions,
66
79
  };
67
80
  const mineruClient = new MinerUClient(configService, {
68
81
  fileSystem,
69
82
  integration,
70
83
  });
71
- // Determine file name from URL
72
84
  let finalFileName = 'document.pdf';
73
85
  try {
74
86
  const parsed = new URL(doc_url);
75
87
  finalFileName = parsed.pathname.split('/').pop() || 'document.pdf';
76
88
  }
77
89
  catch {
78
- // ignore
90
+ // Ignore URL parsing errors.
79
91
  }
80
- // Create MinerU task
81
92
  const { taskId } = await mineruClient.createTask({
82
93
  url: doc_url,
83
94
  fileName: finalFileName,
@@ -86,10 +97,11 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
86
97
  enableTable: finalEnableTable,
87
98
  language: finalLanguage,
88
99
  modelVersion: finalModelVersion,
100
+ pageRanges: page_ranges ?? undefined,
101
+ extraFormats: finalExtraFormats,
89
102
  });
90
103
  let parsedResult;
91
104
  if (mineruClient.serverType === 'self-hosted') {
92
- // Self-hosted: get result immediately
93
105
  const taskResult = mineruClient.getSelfHostedTask(taskId);
94
106
  if (!taskResult) {
95
107
  throw new Error('Failed to get MinerU task result');
@@ -101,24 +113,34 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
101
113
  }, fileSystem);
102
114
  }
103
115
  else {
104
- // Official API: wait for completion
105
116
  const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
106
- parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, {
117
+ const fullZipUrl = result.full_zip_url;
118
+ parsedResult = await resultParser.parseFromUrl(fullZipUrl, taskId, {
107
119
  fileUrl: doc_url,
108
120
  name: finalFileName,
109
121
  folder: workspacePath,
110
122
  }, fileSystem);
123
+ if (fullZipUrl) {
124
+ parsedResult.metadata = parsedResult.metadata ?? {};
125
+ parsedResult.metadata.fullZipUrl = parsedResult.metadata.fullZipUrl ?? fullZipUrl;
126
+ parsedResult.metadata.full_zip_url = parsedResult.metadata.full_zip_url ?? fullZipUrl;
127
+ }
111
128
  }
112
- // Build file artifacts from parsed result
113
129
  const fileArtifacts = [];
114
130
  if (parsedResult.metadata?.assets) {
115
131
  for (const asset of parsedResult.metadata.assets) {
116
132
  if (asset.type === 'file' || asset.type === 'image') {
117
- const fileName = asset.filePath?.split(/[/\\]/).pop() || asset.url?.split('/').pop() || 'file';
133
+ const fileName = asset.filePath?.split(/[/\\]/).pop() ||
134
+ asset.url?.split('/').pop() ||
135
+ 'file';
118
136
  const extension = fileName.split('.').pop()?.toLowerCase() || 'md';
119
137
  const mimeType = asset.type === 'image'
120
- ? (extension === 'png' ? 'image/png' : 'image/jpeg')
121
- : (extension === 'md' ? 'text/markdown' : 'application/json');
138
+ ? extension === 'png'
139
+ ? 'image/png'
140
+ : 'image/jpeg'
141
+ : extension === 'md'
142
+ ? 'text/markdown'
143
+ : 'application/json';
122
144
  fileArtifacts.push({
123
145
  fileName: fileName,
124
146
  filePath: asset.filePath,
@@ -129,11 +151,7 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
129
151
  }
130
152
  }
131
153
  }
132
- // Extract markdown content from chunks
133
- const markdownContent = parsedResult.chunks
134
- ?.map((chunk) => chunk.pageContent)
135
- .join('\n\n') || '';
136
- // Return full markdown (do NOT truncate). If the platform/UI needs a preview, it can truncate client-side.
154
+ const markdownContent = parsedResult.chunks?.map((chunk) => chunk.pageContent).join('\n\n') || '';
137
155
  return [
138
156
  markdownContent,
139
157
  {
@@ -148,9 +166,14 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
148
166
  }
149
167
  }, {
150
168
  name: 'mineru_pdf_parser',
151
- description: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files (images, JSON, etc.).',
169
+ description: 'Convert documents to markdown using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files.',
152
170
  schema: z.object({
153
- doc_url: z.string().min(1).describe('PDF URL (required)'),
171
+ doc_url: z.string().min(1).describe('Document URL (required)'),
172
+ page_ranges: z
173
+ .string()
174
+ .optional()
175
+ .nullable()
176
+ .describe('Page ranges like "2,4-6" or "2--2"'),
154
177
  }),
155
178
  responseFormat: 'content_and_artifact',
156
179
  });
@@ -2,15 +2,10 @@ import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
2
2
  import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
3
3
  import { ConfigService } from '@nestjs/config';
4
4
  import { MinerUResultParserService } from './result-parser.service.js';
5
- /**
6
- * Configuration for MinerU Toolset
7
- */
8
5
  export interface MinerUToolsetConfig {
9
- /**
10
- * MinerU API options stored in toolset credentials
11
- */
12
6
  apiUrl?: string;
13
7
  apiKey?: string;
8
+ extraFormats?: string | string[];
14
9
  fileSystem?: XpFileSystem;
15
10
  configService?: ConfigService;
16
11
  resultParser?: MinerUResultParserService;
@@ -20,31 +15,11 @@ export interface MinerUToolsetConfig {
20
15
  language?: 'en' | 'ch';
21
16
  modelVersion?: 'pipeline' | 'vlm';
22
17
  }
23
- /**
24
- * MinerU Toolset implementation
25
- * Provides PDF to markdown conversion tool using MinerU service
26
- */
27
18
  export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
28
19
  private readonly config;
29
20
  tools: any[];
30
- /**
31
- * Constructor for MinerU Toolset
32
- * Accepts config which contains credentials and dependencies
33
- * Note: Using 'as any' for params because TBuiltinToolsetParams requires system-provided
34
- * properties (tenantId, env) that are added at runtime
35
- */
36
21
  constructor(config: MinerUToolsetConfig);
37
- /**
38
- * Validate credentials for MinerU toolset
39
- * Note: During authorization phase, credentials may be incomplete.
40
- * configService and resultParser are runtime dependencies injected by the strategy.
41
- * We don't validate anything here to allow authorization to proceed.
42
- */
43
- _validateCredentials(credentials: MinerUToolsetConfig): Promise<void>;
44
- /**
45
- * Initialize tools for MinerU toolset
46
- * Creates the PDF parser tool with necessary dependencies
47
- */
22
+ _validateCredentials(_credentials: MinerUToolsetConfig): Promise<void>;
48
23
  initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
49
24
  }
50
25
  //# sourceMappingURL=mineru.toolset.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IAGzC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAEpC,KAAK,EAAE,GAAG,EAAE,CAAM;IAE3B;;;;;OAKG;gBACS,MAAM,EAAE,mBAAmB;IAgBvC;;;;;OAKG;IACY,oBAAoB,CAAC,WAAW,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAKpF;;;OAGG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA+DzF"}
1
+ {"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IACjC,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IACpC,KAAK,EAAE,GAAG,EAAE,CAAM;gBAEf,MAAM,EAAE,mBAAmB;IAiBxB,oBAAoB,CAAC,YAAY,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAItE,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA+CzF"}
@@ -1,87 +1,40 @@
1
1
  import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
2
2
  import { buildMinerUTool } from './mineru.tool.js';
3
- /**
4
- * MinerU Toolset implementation
5
- * Provides PDF to markdown conversion tool using MinerU service
6
- */
7
3
  export class MinerUToolset extends BuiltinToolset {
8
- /**
9
- * Constructor for MinerU Toolset
10
- * Accepts config which contains credentials and dependencies
11
- * Note: Using 'as any' for params because TBuiltinToolsetParams requires system-provided
12
- * properties (tenantId, env) that are added at runtime
13
- */
14
4
  constructor(config) {
15
5
  super('mineru', undefined, config);
16
- // Ensure `tools` exists even if upstream BuiltinToolset typings differ across versions.
17
6
  this.tools = [];
18
7
  this.config = config;
19
- // Log config received in constructor (mask apiKey for security)
20
8
  const configForLog = { ...config };
21
9
  if (configForLog.apiKey) {
22
- configForLog.apiKey = configForLog.apiKey.length > 8
23
- ? `${configForLog.apiKey.substring(0, 4)}...${configForLog.apiKey.substring(configForLog.apiKey.length - 4)}`
24
- : '***';
10
+ configForLog.apiKey =
11
+ configForLog.apiKey.length > 8
12
+ ? `${configForLog.apiKey.substring(0, 4)}...${configForLog.apiKey.substring(configForLog.apiKey.length - 4)}`
13
+ : '***';
25
14
  }
26
- // Use base class logger (protected access)
27
15
  if ('logger' in this && this.logger) {
28
16
  this.logger.log(`[MinerU] MinerUToolset constructor received config: ${JSON.stringify(configForLog, null, 2)}`);
29
17
  }
30
18
  }
31
- /**
32
- * Validate credentials for MinerU toolset
33
- * Note: During authorization phase, credentials may be incomplete.
34
- * configService and resultParser are runtime dependencies injected by the strategy.
35
- * We don't validate anything here to allow authorization to proceed.
36
- */
37
- async _validateCredentials(credentials) {
38
- // No validation needed during authorization phase
39
- // API key validity will be enforced by MinerU server when tool is used
19
+ async _validateCredentials(_credentials) {
20
+ // No validation during authorization phase.
40
21
  }
41
- /**
42
- * Initialize tools for MinerU toolset
43
- * Creates the PDF parser tool with necessary dependencies
44
- */
45
22
  async initTools() {
46
- const { configService, resultParser, apiUrl, apiKey, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion } = this.config;
47
- // Log config before destructuring
48
- const configKeys = Object.keys(this.config);
49
- const hasApiKey = 'apiKey' in this.config;
50
- const apiKeyValue = this.config.apiKey;
51
- const maskedApiKey = apiKeyValue
52
- ? (apiKeyValue.length > 8 ? `${apiKeyValue.substring(0, 4)}...${apiKeyValue.substring(apiKeyValue.length - 4)}` : '***')
53
- : 'missing';
54
- // Use base class logger (protected access)
55
- if ('logger' in this && this.logger) {
56
- this.logger.log(`[MinerU] MinerUToolset.initTools() - config keys: ${configKeys.join(', ')}, hasApiKey: ${hasApiKey}, apiKey: ${maskedApiKey}`);
57
- this.logger.log(`[MinerU] MinerUToolset.initTools() - destructured apiKey: ${apiKey ? (apiKey.length > 8 ? `${apiKey.substring(0, 4)}...${apiKey.substring(apiKey.length - 4)}` : '***') : 'missing'}`);
58
- }
23
+ const { configService, resultParser, apiUrl, apiKey, extraFormats, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion, } = this.config;
59
24
  if (!configService || !resultParser) {
60
25
  throw new Error('ConfigService and MinerUResultParserService are required');
61
26
  }
62
- // Use configuration from authorization page
63
- // apiUrl: use provided value or default to official server URL
64
27
  const finalApiUrl = apiUrl || 'https://mineru.net/api/v4';
65
- // Convert string enum values to boolean (compatible with 'true'/'false' strings and boolean values)
66
- // Use provided values from authorization page, or default to true
67
28
  const finalIsOcr = isOcr === 'true' || isOcr === true;
68
29
  const finalEnableFormula = enableFormula === 'true' || enableFormula === true;
69
30
  const finalEnableTable = enableTable === 'true' || enableTable === true;
70
- // Use provided values from authorization page, or use defaults
71
31
  const finalLanguage = language || 'ch';
72
32
  const finalModelVersion = modelVersion || 'pipeline';
73
- // Log what we're passing to buildMinerUTool
74
- const maskedFinalApiKey = apiKey
75
- ? (apiKey.length > 8 ? `${apiKey.substring(0, 4)}...${apiKey.substring(apiKey.length - 4)}` : '***')
76
- : 'missing';
77
- // Use base class logger (protected access)
78
- if ('logger' in this && this.logger) {
79
- this.logger.log(`[MinerU] MinerUToolset.initTools() - passing to buildMinerUTool: apiUrl=${finalApiUrl}, apiKey=${maskedFinalApiKey}`);
80
- }
81
33
  this.tools = [
82
34
  buildMinerUTool(configService, resultParser, {
83
35
  apiUrl: finalApiUrl,
84
- apiKey, // apiKey is required and validated in authorization page
36
+ apiKey,
37
+ extraFormats,
85
38
  }, fileSystem, {
86
39
  isOcr: finalIsOcr,
87
40
  enableFormula: finalEnableFormula,
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
4
4
  import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
5
5
  export declare class MinerUResultParserService {
6
6
  private readonly logger;
7
- parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
7
+ parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
8
8
  id?: string;
9
9
  chunks: Document<ChunkMetadata>[];
10
10
  metadata: MinerUDocumentMetadata;
11
11
  }>;
12
- parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
12
+ parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
13
13
  id?: string;
14
14
  chunks: Document<ChunkMetadata>[];
15
15
  metadata: MinerUDocumentMetadata;
@@ -1 +1 @@
1
- {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAqGI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CA4DH"}
1
+ {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAsFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
@@ -21,6 +21,7 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
21
21
  const metadata = {
22
22
  parser: MinerU,
23
23
  taskId,
24
+ fullZipUrl,
24
25
  };
25
26
  // 2. Unzip the file
26
27
  const zipEntries = [];
@@ -36,61 +37,43 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
36
37
  zipEntries.push({ entryName: entry.path, data });
37
38
  const fileName = entry.path;
38
39
  const filePath = join(document.folder || '', entry.path);
39
- // If platform didn't provide filesystem permission, still parse markdown but skip persisting files.
40
- // This avoids runtime crashes like: "Cannot read properties of undefined (reading 'writeFile')".
41
- if (fileSystem) {
42
- const url = await fileSystem.writeFile(filePath, data);
43
- pathMap.set(fileName, url);
44
- // Write images to local file system
45
- if (fileName.startsWith('images/')) {
46
- assets.push({
47
- type: 'image',
48
- url: url,
49
- filePath: filePath,
50
- });
51
- }
52
- else if (fileName.endsWith('layout.json')) {
53
- layoutJson = JSON.parse(data.toString('utf-8'));
54
- metadata.mineruBackend = layoutJson?._backend;
55
- metadata.mineruVersion = layoutJson?._version_name;
56
- assets.push({
57
- type: 'file',
58
- url,
59
- filePath: filePath,
60
- });
61
- }
62
- else if (fileName.endsWith('content_list.json')) {
63
- assets.push({
64
- type: 'file',
65
- url,
66
- filePath: filePath,
67
- });
68
- }
69
- else if (fileName.endsWith('full.md')) {
70
- fullMd = data.toString('utf-8');
71
- assets.push({
72
- type: 'file',
73
- url,
74
- filePath: filePath,
75
- });
76
- }
77
- else if (fileName.endsWith('origin.pdf')) {
78
- metadata.originPdfUrl = fileName;
79
- }
40
+ const url = await fileSystem.writeFile(filePath, data);
41
+ pathMap.set(fileName, url);
42
+ // Write images to local file system
43
+ if (fileName.startsWith('images/')) {
44
+ assets.push({
45
+ type: 'image',
46
+ url: url,
47
+ filePath: filePath,
48
+ });
49
+ }
50
+ else if (fileName.endsWith('layout.json')) {
51
+ layoutJson = JSON.parse(data.toString('utf-8'));
52
+ metadata.mineruBackend = layoutJson?._backend;
53
+ metadata.mineruVersion = layoutJson?._version_name;
54
+ assets.push({
55
+ type: 'file',
56
+ url,
57
+ filePath: filePath,
58
+ });
59
+ }
60
+ else if (fileName.endsWith('content_list.json')) {
61
+ assets.push({
62
+ type: 'file',
63
+ url,
64
+ filePath: filePath,
65
+ });
66
+ }
67
+ else if (fileName.endsWith('full.md')) {
68
+ fullMd = data.toString('utf-8');
69
+ assets.push({
70
+ type: 'file',
71
+ url,
72
+ filePath: filePath,
73
+ });
80
74
  }
81
- else {
82
- // Still extract key metadata & markdown without writing to filesystem
83
- if (fileName.endsWith('layout.json')) {
84
- layoutJson = JSON.parse(data.toString('utf-8'));
85
- metadata.mineruBackend = layoutJson?._backend;
86
- metadata.mineruVersion = layoutJson?._version_name;
87
- }
88
- else if (fileName.endsWith('full.md')) {
89
- fullMd = data.toString('utf-8');
90
- }
91
- else if (fileName.endsWith('origin.pdf')) {
92
- metadata.originPdfUrl = fileName;
93
- }
75
+ else if (fileName.endsWith('origin.pdf')) {
76
+ metadata.originPdfUrl = fileName;
94
77
  }
95
78
  }
96
79
  metadata.assets = assets;
@@ -119,24 +102,13 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
119
102
  const pathMap = new Map();
120
103
  for (const image of result.images) {
121
104
  const filePath = join(document.folder || '', 'images', image.name);
122
- if (fileSystem) {
123
- const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
124
- pathMap.set(`images/${image.name}`, url);
125
- assets.push({
126
- type: 'image',
127
- url: url,
128
- filePath: filePath,
129
- });
130
- }
131
- else {
132
- // Fallback: keep images as data URLs so markdown can still render without filesystem permission
133
- pathMap.set(`images/${image.name}`, image.dataUrl);
134
- assets.push({
135
- type: 'image',
136
- url: image.dataUrl,
137
- filePath: filePath,
138
- });
139
- }
105
+ const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
106
+ pathMap.set(`images/${image.name}`, url);
107
+ assets.push({
108
+ type: 'image',
109
+ url: url,
110
+ filePath: filePath,
111
+ });
140
112
  }
141
113
  if (result.sourceUrl) {
142
114
  assets.push({
@@ -85,6 +85,17 @@ export declare class MinerUTransformerStrategy implements IDocumentTransformerSt
85
85
  enum: string[];
86
86
  default: string;
87
87
  };
88
+ pageRanges: {
89
+ type: string;
90
+ title: {
91
+ en_US: string;
92
+ zh_Hans: string;
93
+ };
94
+ description: {
95
+ en_US: string;
96
+ zh_Hans: string;
97
+ };
98
+ };
88
99
  };
89
100
  required: any[];
90
101
  };
@@ -1 +1 @@
1
- {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA8C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAEjG,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAsDzD"}
1
+ {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA0C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAE7F,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAmF1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAiEzD"}
@@ -5,13 +5,13 @@ import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
5
5
  import { isNil, omitBy, pick } from 'lodash-es';
6
6
  import { MinerUClient } from './mineru.client.js';
7
7
  import { MinerUResultParserService } from './result-parser.service.js';
8
- import { icon, MinerUIntegration, MinerUTransformer } from './types.js';
8
+ import { icon, MinerU } from './types.js';
9
9
  let MinerUTransformerStrategy = class MinerUTransformerStrategy {
10
10
  constructor() {
11
11
  this.permissions = [
12
12
  {
13
13
  type: 'integration',
14
- service: MinerUIntegration,
14
+ service: MinerU,
15
15
  description: 'Access to MinerU system integrations'
16
16
  },
17
17
  {
@@ -21,7 +21,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
21
21
  }
22
22
  ];
23
23
  this.meta = {
24
- name: MinerUTransformer,
24
+ name: MinerU,
25
25
  label: {
26
26
  en_US: 'MinerU',
27
27
  zh_Hans: 'MinerU'
@@ -99,6 +99,17 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
99
99
  },
100
100
  enum: ['pipeline', 'vlm'],
101
101
  default: 'pipeline'
102
+ },
103
+ pageRanges: {
104
+ type: 'string',
105
+ title: {
106
+ en_US: 'Page Ranges',
107
+ zh_Hans: '页码范围'
108
+ },
109
+ description: {
110
+ en_US: 'Page ranges like "2,4-6" or "2--2" (official API only).',
111
+ zh_Hans: '页码范围,例如 "2,4-6" 或 "2--2"(仅官方 API)。'
112
+ }
102
113
  }
103
114
  },
104
115
  required: []
@@ -111,6 +122,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
111
122
  async transformDocuments(documents, config) {
112
123
  const mineru = new MinerUClient(this.configService, config.permissions);
113
124
  const parsedResults = [];
125
+ const integrationOptions = config.permissions?.integration?.options;
114
126
  for await (const document of documents) {
115
127
  if (mineru.serverType === 'self-hosted') {
116
128
  const { taskId } = await mineru.createTask({
@@ -125,8 +137,12 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
125
137
  });
126
138
  const result = mineru.getSelfHostedTask(taskId);
127
139
  const parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, config.permissions.fileSystem);
128
- parsedResult.id = document.id;
129
- parsedResults.push(parsedResult);
140
+ // Convert parsedResult to IKnowledgeDocument format
141
+ parsedResults.push({
142
+ id: document.id,
143
+ chunks: parsedResult.chunks,
144
+ metadata: parsedResult.metadata
145
+ });
130
146
  }
131
147
  else {
132
148
  const { taskId } = await mineru.createTask({
@@ -136,13 +152,19 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
136
152
  enableTable: true,
137
153
  language: 'ch',
138
154
  modelVersion: 'vlm',
139
- ...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion']), isNil)
155
+ pageRanges: config.pageRanges,
156
+ extraFormats: integrationOptions?.extraFormats,
157
+ ...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion', 'pageRanges']), isNil)
140
158
  });
141
159
  // Waiting for completion
142
160
  const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
143
161
  const parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, config.permissions.fileSystem);
144
- parsedResult.id = document.id;
145
- parsedResults.push(parsedResult);
162
+ // Convert parsedResult to IKnowledgeDocument format
163
+ parsedResults.push({
164
+ id: document.id,
165
+ chunks: parsedResult.chunks,
166
+ metadata: parsedResult.metadata
167
+ });
146
168
  }
147
169
  }
148
170
  return parsedResults;
@@ -158,6 +180,6 @@ __decorate([
158
180
  ], MinerUTransformerStrategy.prototype, "configService", void 0);
159
181
  MinerUTransformerStrategy = __decorate([
160
182
  Injectable(),
161
- DocumentTransformerStrategy(MinerUTransformer)
183
+ DocumentTransformerStrategy(MinerU)
162
184
  ], MinerUTransformerStrategy);
163
185
  export { MinerUTransformerStrategy };