@helloxiaohu/plugin-mineru 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -258
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -8
- package/dist/lib/integration.strategy.d.ts.map +1 -1
- package/dist/lib/integration.strategy.js +20 -5
- package/dist/lib/mineru-toolset.strategy.d.ts +16 -32
- package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -1
- package/dist/lib/mineru-toolset.strategy.js +18 -63
- package/dist/lib/mineru.client.d.ts +1 -5
- package/dist/lib/mineru.client.d.ts.map +1 -1
- package/dist/lib/mineru.client.js +165 -56
- package/dist/lib/mineru.tool.d.ts +5 -7
- package/dist/lib/mineru.tool.d.ts.map +1 -1
- package/dist/lib/mineru.tool.js +60 -37
- package/dist/lib/mineru.toolset.d.ts +2 -27
- package/dist/lib/mineru.toolset.d.ts.map +1 -1
- package/dist/lib/mineru.toolset.js +9 -56
- package/dist/lib/result-parser.service.d.ts +2 -2
- package/dist/lib/result-parser.service.d.ts.map +1 -1
- package/dist/lib/result-parser.service.js +44 -72
- package/dist/lib/transformer-mineru.strategy.d.ts +11 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -1
- package/dist/lib/transformer-mineru.strategy.js +31 -9
- package/dist/lib/types.d.ts +23 -40
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js +22 -35
- package/package.json +10 -18
package/dist/lib/mineru.tool.js
CHANGED
|
@@ -3,28 +3,36 @@ import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
|
3
3
|
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { MinerUClient } from './mineru.client.js';
|
|
6
|
-
import {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
import { MinerU } from './types.js';
|
|
7
|
+
function normalizeExtraFormats(value) {
|
|
8
|
+
if (!value) {
|
|
9
|
+
return undefined;
|
|
10
|
+
}
|
|
11
|
+
if (Array.isArray(value)) {
|
|
12
|
+
const formats = value.map((item) => String(item).trim()).filter(Boolean);
|
|
13
|
+
return formats.length ? formats : undefined;
|
|
14
|
+
}
|
|
15
|
+
const formats = value
|
|
16
|
+
.split(',')
|
|
17
|
+
.map((item) => item.trim())
|
|
18
|
+
.filter(Boolean);
|
|
19
|
+
return formats.length ? formats : undefined;
|
|
20
|
+
}
|
|
11
21
|
export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
|
|
12
22
|
return tool(async (input) => {
|
|
13
23
|
try {
|
|
14
|
-
const { doc_url } = input;
|
|
15
|
-
|
|
16
|
-
|
|
24
|
+
const { doc_url, page_ranges } = input;
|
|
25
|
+
console.debug('[MinerU] tool invoked with input', {
|
|
26
|
+
doc_url,
|
|
27
|
+
extraKeys: Object.keys(input || {}).filter((key) => key !== 'doc_url'),
|
|
28
|
+
});
|
|
17
29
|
if (!doc_url) {
|
|
18
30
|
throw new Error('doc_url is required');
|
|
19
31
|
}
|
|
20
|
-
// Get workspace context from current task
|
|
21
32
|
const currentState = getCurrentTaskInput();
|
|
22
33
|
const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
|
|
23
|
-
// Use configuration from authorization page (passed via options and defaults parameters)
|
|
24
|
-
// These values come from the authorization page configuration and are set when the tool is created
|
|
25
34
|
const finalApiUrl = options?.apiUrl || 'https://mineru.net/api/v4';
|
|
26
|
-
const finalApiKey = options?.apiKey;
|
|
27
|
-
// Log effective config (mask key) to确认是否拿到了授权页的凭据
|
|
35
|
+
const finalApiKey = options?.apiKey;
|
|
28
36
|
const maskedKey = finalApiKey && finalApiKey.length > 8
|
|
29
37
|
? `${finalApiKey.slice(0, 4)}***${finalApiKey.slice(-4)}`
|
|
30
38
|
: finalApiKey
|
|
@@ -42,42 +50,45 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
|
|
|
42
50
|
modelVersion: defaults?.modelVersion,
|
|
43
51
|
},
|
|
44
52
|
});
|
|
45
|
-
// Use configuration values from authorization page (passed via defaults parameter)
|
|
46
|
-
// Convert string enum values ('true'/'false') to boolean, or use boolean values directly
|
|
47
|
-
// If undefined, default to true
|
|
48
53
|
const finalIsOcr = defaults?.isOcr === undefined
|
|
49
54
|
? true
|
|
50
|
-
:
|
|
55
|
+
: typeof defaults.isOcr === 'string'
|
|
56
|
+
? defaults.isOcr === 'true'
|
|
57
|
+
: defaults.isOcr === true;
|
|
51
58
|
const finalEnableFormula = defaults?.enableFormula === undefined
|
|
52
59
|
? true
|
|
53
|
-
:
|
|
60
|
+
: typeof defaults.enableFormula === 'string'
|
|
61
|
+
? defaults.enableFormula === 'true'
|
|
62
|
+
: defaults.enableFormula === true;
|
|
54
63
|
const finalEnableTable = defaults?.enableTable === undefined
|
|
55
64
|
? true
|
|
56
|
-
:
|
|
65
|
+
: typeof defaults.enableTable === 'string'
|
|
66
|
+
? defaults.enableTable === 'true'
|
|
67
|
+
: defaults.enableTable === true;
|
|
57
68
|
const finalLanguage = defaults?.language || 'ch';
|
|
58
69
|
const finalModelVersion = defaults?.modelVersion || 'pipeline';
|
|
70
|
+
const finalExtraFormats = normalizeExtraFormats(options?.extraFormats);
|
|
59
71
|
const effectiveOptions = {
|
|
60
72
|
apiUrl: finalApiUrl,
|
|
61
73
|
apiKey: finalApiKey,
|
|
74
|
+
extraFormats: finalExtraFormats,
|
|
62
75
|
};
|
|
63
76
|
const integration = {
|
|
64
|
-
provider:
|
|
77
|
+
provider: MinerU,
|
|
65
78
|
options: effectiveOptions,
|
|
66
79
|
};
|
|
67
80
|
const mineruClient = new MinerUClient(configService, {
|
|
68
81
|
fileSystem,
|
|
69
82
|
integration,
|
|
70
83
|
});
|
|
71
|
-
// Determine file name from URL
|
|
72
84
|
let finalFileName = 'document.pdf';
|
|
73
85
|
try {
|
|
74
86
|
const parsed = new URL(doc_url);
|
|
75
87
|
finalFileName = parsed.pathname.split('/').pop() || 'document.pdf';
|
|
76
88
|
}
|
|
77
89
|
catch {
|
|
78
|
-
//
|
|
90
|
+
// Ignore URL parsing errors.
|
|
79
91
|
}
|
|
80
|
-
// Create MinerU task
|
|
81
92
|
const { taskId } = await mineruClient.createTask({
|
|
82
93
|
url: doc_url,
|
|
83
94
|
fileName: finalFileName,
|
|
@@ -86,10 +97,11 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
|
|
|
86
97
|
enableTable: finalEnableTable,
|
|
87
98
|
language: finalLanguage,
|
|
88
99
|
modelVersion: finalModelVersion,
|
|
100
|
+
pageRanges: page_ranges ?? undefined,
|
|
101
|
+
extraFormats: finalExtraFormats,
|
|
89
102
|
});
|
|
90
103
|
let parsedResult;
|
|
91
104
|
if (mineruClient.serverType === 'self-hosted') {
|
|
92
|
-
// Self-hosted: get result immediately
|
|
93
105
|
const taskResult = mineruClient.getSelfHostedTask(taskId);
|
|
94
106
|
if (!taskResult) {
|
|
95
107
|
throw new Error('Failed to get MinerU task result');
|
|
@@ -101,24 +113,34 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
|
|
|
101
113
|
}, fileSystem);
|
|
102
114
|
}
|
|
103
115
|
else {
|
|
104
|
-
// Official API: wait for completion
|
|
105
116
|
const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
|
|
106
|
-
|
|
117
|
+
const fullZipUrl = result.full_zip_url;
|
|
118
|
+
parsedResult = await resultParser.parseFromUrl(fullZipUrl, taskId, {
|
|
107
119
|
fileUrl: doc_url,
|
|
108
120
|
name: finalFileName,
|
|
109
121
|
folder: workspacePath,
|
|
110
122
|
}, fileSystem);
|
|
123
|
+
if (fullZipUrl) {
|
|
124
|
+
parsedResult.metadata = parsedResult.metadata ?? {};
|
|
125
|
+
parsedResult.metadata.fullZipUrl = parsedResult.metadata.fullZipUrl ?? fullZipUrl;
|
|
126
|
+
parsedResult.metadata.full_zip_url = parsedResult.metadata.full_zip_url ?? fullZipUrl;
|
|
127
|
+
}
|
|
111
128
|
}
|
|
112
|
-
// Build file artifacts from parsed result
|
|
113
129
|
const fileArtifacts = [];
|
|
114
130
|
if (parsedResult.metadata?.assets) {
|
|
115
131
|
for (const asset of parsedResult.metadata.assets) {
|
|
116
132
|
if (asset.type === 'file' || asset.type === 'image') {
|
|
117
|
-
const fileName = asset.filePath?.split(/[/\\]/).pop() ||
|
|
133
|
+
const fileName = asset.filePath?.split(/[/\\]/).pop() ||
|
|
134
|
+
asset.url?.split('/').pop() ||
|
|
135
|
+
'file';
|
|
118
136
|
const extension = fileName.split('.').pop()?.toLowerCase() || 'md';
|
|
119
137
|
const mimeType = asset.type === 'image'
|
|
120
|
-
?
|
|
121
|
-
|
|
138
|
+
? extension === 'png'
|
|
139
|
+
? 'image/png'
|
|
140
|
+
: 'image/jpeg'
|
|
141
|
+
: extension === 'md'
|
|
142
|
+
? 'text/markdown'
|
|
143
|
+
: 'application/json';
|
|
122
144
|
fileArtifacts.push({
|
|
123
145
|
fileName: fileName,
|
|
124
146
|
filePath: asset.filePath,
|
|
@@ -129,11 +151,7 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
|
|
|
129
151
|
}
|
|
130
152
|
}
|
|
131
153
|
}
|
|
132
|
-
|
|
133
|
-
const markdownContent = parsedResult.chunks
|
|
134
|
-
?.map((chunk) => chunk.pageContent)
|
|
135
|
-
.join('\n\n') || '';
|
|
136
|
-
// Return full markdown (do NOT truncate). If the platform/UI needs a preview, it can truncate client-side.
|
|
154
|
+
const markdownContent = parsedResult.chunks?.map((chunk) => chunk.pageContent).join('\n\n') || '';
|
|
137
155
|
return [
|
|
138
156
|
markdownContent,
|
|
139
157
|
{
|
|
@@ -148,9 +166,14 @@ export function buildMinerUTool(configService, resultParser, options, fileSystem
|
|
|
148
166
|
}
|
|
149
167
|
}, {
|
|
150
168
|
name: 'mineru_pdf_parser',
|
|
151
|
-
description: 'Convert
|
|
169
|
+
description: 'Convert documents to markdown using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files.',
|
|
152
170
|
schema: z.object({
|
|
153
|
-
doc_url: z.string().min(1).describe('
|
|
171
|
+
doc_url: z.string().min(1).describe('Document URL (required)'),
|
|
172
|
+
page_ranges: z
|
|
173
|
+
.string()
|
|
174
|
+
.optional()
|
|
175
|
+
.nullable()
|
|
176
|
+
.describe('Page ranges like "2,4-6" or "2--2"'),
|
|
154
177
|
}),
|
|
155
178
|
responseFormat: 'content_and_artifact',
|
|
156
179
|
});
|
|
@@ -2,15 +2,10 @@ import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
|
|
|
2
2
|
import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
3
3
|
import { ConfigService } from '@nestjs/config';
|
|
4
4
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
-
/**
|
|
6
|
-
* Configuration for MinerU Toolset
|
|
7
|
-
*/
|
|
8
5
|
export interface MinerUToolsetConfig {
|
|
9
|
-
/**
|
|
10
|
-
* MinerU API options stored in toolset credentials
|
|
11
|
-
*/
|
|
12
6
|
apiUrl?: string;
|
|
13
7
|
apiKey?: string;
|
|
8
|
+
extraFormats?: string | string[];
|
|
14
9
|
fileSystem?: XpFileSystem;
|
|
15
10
|
configService?: ConfigService;
|
|
16
11
|
resultParser?: MinerUResultParserService;
|
|
@@ -20,31 +15,11 @@ export interface MinerUToolsetConfig {
|
|
|
20
15
|
language?: 'en' | 'ch';
|
|
21
16
|
modelVersion?: 'pipeline' | 'vlm';
|
|
22
17
|
}
|
|
23
|
-
/**
|
|
24
|
-
* MinerU Toolset implementation
|
|
25
|
-
* Provides PDF to markdown conversion tool using MinerU service
|
|
26
|
-
*/
|
|
27
18
|
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
|
|
28
19
|
private readonly config;
|
|
29
20
|
tools: any[];
|
|
30
|
-
/**
|
|
31
|
-
* Constructor for MinerU Toolset
|
|
32
|
-
* Accepts config which contains credentials and dependencies
|
|
33
|
-
* Note: Using 'as any' for params because TBuiltinToolsetParams requires system-provided
|
|
34
|
-
* properties (tenantId, env) that are added at runtime
|
|
35
|
-
*/
|
|
36
21
|
constructor(config: MinerUToolsetConfig);
|
|
37
|
-
|
|
38
|
-
* Validate credentials for MinerU toolset
|
|
39
|
-
* Note: During authorization phase, credentials may be incomplete.
|
|
40
|
-
* configService and resultParser are runtime dependencies injected by the strategy.
|
|
41
|
-
* We don't validate anything here to allow authorization to proceed.
|
|
42
|
-
*/
|
|
43
|
-
_validateCredentials(credentials: MinerUToolsetConfig): Promise<void>;
|
|
44
|
-
/**
|
|
45
|
-
* Initialize tools for MinerU toolset
|
|
46
|
-
* Creates the PDF parser tool with necessary dependencies
|
|
47
|
-
*/
|
|
22
|
+
_validateCredentials(_credentials: MinerUToolsetConfig): Promise<void>;
|
|
48
23
|
initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
|
|
49
24
|
}
|
|
50
25
|
//# sourceMappingURL=mineru.toolset.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE
|
|
1
|
+
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IACjC,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IACpC,KAAK,EAAE,GAAG,EAAE,CAAM;gBAEf,MAAM,EAAE,mBAAmB;IAiBxB,oBAAoB,CAAC,YAAY,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAItE,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA+CzF"}
|
|
@@ -1,87 +1,40 @@
|
|
|
1
1
|
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
|
|
2
2
|
import { buildMinerUTool } from './mineru.tool.js';
|
|
3
|
-
/**
|
|
4
|
-
* MinerU Toolset implementation
|
|
5
|
-
* Provides PDF to markdown conversion tool using MinerU service
|
|
6
|
-
*/
|
|
7
3
|
export class MinerUToolset extends BuiltinToolset {
|
|
8
|
-
/**
|
|
9
|
-
* Constructor for MinerU Toolset
|
|
10
|
-
* Accepts config which contains credentials and dependencies
|
|
11
|
-
* Note: Using 'as any' for params because TBuiltinToolsetParams requires system-provided
|
|
12
|
-
* properties (tenantId, env) that are added at runtime
|
|
13
|
-
*/
|
|
14
4
|
constructor(config) {
|
|
15
5
|
super('mineru', undefined, config);
|
|
16
|
-
// Ensure `tools` exists even if upstream BuiltinToolset typings differ across versions.
|
|
17
6
|
this.tools = [];
|
|
18
7
|
this.config = config;
|
|
19
|
-
// Log config received in constructor (mask apiKey for security)
|
|
20
8
|
const configForLog = { ...config };
|
|
21
9
|
if (configForLog.apiKey) {
|
|
22
|
-
configForLog.apiKey =
|
|
23
|
-
|
|
24
|
-
|
|
10
|
+
configForLog.apiKey =
|
|
11
|
+
configForLog.apiKey.length > 8
|
|
12
|
+
? `${configForLog.apiKey.substring(0, 4)}...${configForLog.apiKey.substring(configForLog.apiKey.length - 4)}`
|
|
13
|
+
: '***';
|
|
25
14
|
}
|
|
26
|
-
// Use base class logger (protected access)
|
|
27
15
|
if ('logger' in this && this.logger) {
|
|
28
16
|
this.logger.log(`[MinerU] MinerUToolset constructor received config: ${JSON.stringify(configForLog, null, 2)}`);
|
|
29
17
|
}
|
|
30
18
|
}
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
* Note: During authorization phase, credentials may be incomplete.
|
|
34
|
-
* configService and resultParser are runtime dependencies injected by the strategy.
|
|
35
|
-
* We don't validate anything here to allow authorization to proceed.
|
|
36
|
-
*/
|
|
37
|
-
async _validateCredentials(credentials) {
|
|
38
|
-
// No validation needed during authorization phase
|
|
39
|
-
// API key validity will be enforced by MinerU server when tool is used
|
|
19
|
+
async _validateCredentials(_credentials) {
|
|
20
|
+
// No validation during authorization phase.
|
|
40
21
|
}
|
|
41
|
-
/**
|
|
42
|
-
* Initialize tools for MinerU toolset
|
|
43
|
-
* Creates the PDF parser tool with necessary dependencies
|
|
44
|
-
*/
|
|
45
22
|
async initTools() {
|
|
46
|
-
const { configService, resultParser, apiUrl, apiKey, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion } = this.config;
|
|
47
|
-
// Log config before destructuring
|
|
48
|
-
const configKeys = Object.keys(this.config);
|
|
49
|
-
const hasApiKey = 'apiKey' in this.config;
|
|
50
|
-
const apiKeyValue = this.config.apiKey;
|
|
51
|
-
const maskedApiKey = apiKeyValue
|
|
52
|
-
? (apiKeyValue.length > 8 ? `${apiKeyValue.substring(0, 4)}...${apiKeyValue.substring(apiKeyValue.length - 4)}` : '***')
|
|
53
|
-
: 'missing';
|
|
54
|
-
// Use base class logger (protected access)
|
|
55
|
-
if ('logger' in this && this.logger) {
|
|
56
|
-
this.logger.log(`[MinerU] MinerUToolset.initTools() - config keys: ${configKeys.join(', ')}, hasApiKey: ${hasApiKey}, apiKey: ${maskedApiKey}`);
|
|
57
|
-
this.logger.log(`[MinerU] MinerUToolset.initTools() - destructured apiKey: ${apiKey ? (apiKey.length > 8 ? `${apiKey.substring(0, 4)}...${apiKey.substring(apiKey.length - 4)}` : '***') : 'missing'}`);
|
|
58
|
-
}
|
|
23
|
+
const { configService, resultParser, apiUrl, apiKey, extraFormats, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion, } = this.config;
|
|
59
24
|
if (!configService || !resultParser) {
|
|
60
25
|
throw new Error('ConfigService and MinerUResultParserService are required');
|
|
61
26
|
}
|
|
62
|
-
// Use configuration from authorization page
|
|
63
|
-
// apiUrl: use provided value or default to official server URL
|
|
64
27
|
const finalApiUrl = apiUrl || 'https://mineru.net/api/v4';
|
|
65
|
-
// Convert string enum values to boolean (compatible with 'true'/'false' strings and boolean values)
|
|
66
|
-
// Use provided values from authorization page, or default to true
|
|
67
28
|
const finalIsOcr = isOcr === 'true' || isOcr === true;
|
|
68
29
|
const finalEnableFormula = enableFormula === 'true' || enableFormula === true;
|
|
69
30
|
const finalEnableTable = enableTable === 'true' || enableTable === true;
|
|
70
|
-
// Use provided values from authorization page, or use defaults
|
|
71
31
|
const finalLanguage = language || 'ch';
|
|
72
32
|
const finalModelVersion = modelVersion || 'pipeline';
|
|
73
|
-
// Log what we're passing to buildMinerUTool
|
|
74
|
-
const maskedFinalApiKey = apiKey
|
|
75
|
-
? (apiKey.length > 8 ? `${apiKey.substring(0, 4)}...${apiKey.substring(apiKey.length - 4)}` : '***')
|
|
76
|
-
: 'missing';
|
|
77
|
-
// Use base class logger (protected access)
|
|
78
|
-
if ('logger' in this && this.logger) {
|
|
79
|
-
this.logger.log(`[MinerU] MinerUToolset.initTools() - passing to buildMinerUTool: apiUrl=${finalApiUrl}, apiKey=${maskedFinalApiKey}`);
|
|
80
|
-
}
|
|
81
33
|
this.tools = [
|
|
82
34
|
buildMinerUTool(configService, resultParser, {
|
|
83
35
|
apiUrl: finalApiUrl,
|
|
84
|
-
apiKey,
|
|
36
|
+
apiKey,
|
|
37
|
+
extraFormats,
|
|
85
38
|
}, fileSystem, {
|
|
86
39
|
isOcr: finalIsOcr,
|
|
87
40
|
enableFormula: finalEnableFormula,
|
|
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
|
4
4
|
import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
|
|
5
5
|
export declare class MinerUResultParserService {
|
|
6
6
|
private readonly logger;
|
|
7
|
-
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
7
|
+
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
|
|
8
8
|
id?: string;
|
|
9
9
|
chunks: Document<ChunkMetadata>[];
|
|
10
10
|
metadata: MinerUDocumentMetadata;
|
|
11
11
|
}>;
|
|
12
|
-
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
12
|
+
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
|
|
13
13
|
id?: string;
|
|
14
14
|
chunks: Document<ChunkMetadata>[];
|
|
15
15
|
metadata: MinerUDocumentMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,
|
|
1
|
+
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAsFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
|
|
@@ -21,6 +21,7 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
21
21
|
const metadata = {
|
|
22
22
|
parser: MinerU,
|
|
23
23
|
taskId,
|
|
24
|
+
fullZipUrl,
|
|
24
25
|
};
|
|
25
26
|
// 2. Unzip the file
|
|
26
27
|
const zipEntries = [];
|
|
@@ -36,61 +37,43 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
36
37
|
zipEntries.push({ entryName: entry.path, data });
|
|
37
38
|
const fileName = entry.path;
|
|
38
39
|
const filePath = join(document.folder || '', entry.path);
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
url,
|
|
74
|
-
filePath: filePath,
|
|
75
|
-
});
|
|
76
|
-
}
|
|
77
|
-
else if (fileName.endsWith('origin.pdf')) {
|
|
78
|
-
metadata.originPdfUrl = fileName;
|
|
79
|
-
}
|
|
40
|
+
const url = await fileSystem.writeFile(filePath, data);
|
|
41
|
+
pathMap.set(fileName, url);
|
|
42
|
+
// Write images to local file system
|
|
43
|
+
if (fileName.startsWith('images/')) {
|
|
44
|
+
assets.push({
|
|
45
|
+
type: 'image',
|
|
46
|
+
url: url,
|
|
47
|
+
filePath: filePath,
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
else if (fileName.endsWith('layout.json')) {
|
|
51
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
52
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
53
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
54
|
+
assets.push({
|
|
55
|
+
type: 'file',
|
|
56
|
+
url,
|
|
57
|
+
filePath: filePath,
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
else if (fileName.endsWith('content_list.json')) {
|
|
61
|
+
assets.push({
|
|
62
|
+
type: 'file',
|
|
63
|
+
url,
|
|
64
|
+
filePath: filePath,
|
|
65
|
+
});
|
|
66
|
+
}
|
|
67
|
+
else if (fileName.endsWith('full.md')) {
|
|
68
|
+
fullMd = data.toString('utf-8');
|
|
69
|
+
assets.push({
|
|
70
|
+
type: 'file',
|
|
71
|
+
url,
|
|
72
|
+
filePath: filePath,
|
|
73
|
+
});
|
|
80
74
|
}
|
|
81
|
-
else {
|
|
82
|
-
|
|
83
|
-
if (fileName.endsWith('layout.json')) {
|
|
84
|
-
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
85
|
-
metadata.mineruBackend = layoutJson?._backend;
|
|
86
|
-
metadata.mineruVersion = layoutJson?._version_name;
|
|
87
|
-
}
|
|
88
|
-
else if (fileName.endsWith('full.md')) {
|
|
89
|
-
fullMd = data.toString('utf-8');
|
|
90
|
-
}
|
|
91
|
-
else if (fileName.endsWith('origin.pdf')) {
|
|
92
|
-
metadata.originPdfUrl = fileName;
|
|
93
|
-
}
|
|
75
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
76
|
+
metadata.originPdfUrl = fileName;
|
|
94
77
|
}
|
|
95
78
|
}
|
|
96
79
|
metadata.assets = assets;
|
|
@@ -119,24 +102,13 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
119
102
|
const pathMap = new Map();
|
|
120
103
|
for (const image of result.images) {
|
|
121
104
|
const filePath = join(document.folder || '', 'images', image.name);
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
else {
|
|
132
|
-
// Fallback: keep images as data URLs so markdown can still render without filesystem permission
|
|
133
|
-
pathMap.set(`images/${image.name}`, image.dataUrl);
|
|
134
|
-
assets.push({
|
|
135
|
-
type: 'image',
|
|
136
|
-
url: image.dataUrl,
|
|
137
|
-
filePath: filePath,
|
|
138
|
-
});
|
|
139
|
-
}
|
|
105
|
+
const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
|
|
106
|
+
pathMap.set(`images/${image.name}`, url);
|
|
107
|
+
assets.push({
|
|
108
|
+
type: 'image',
|
|
109
|
+
url: url,
|
|
110
|
+
filePath: filePath,
|
|
111
|
+
});
|
|
140
112
|
}
|
|
141
113
|
if (result.sourceUrl) {
|
|
142
114
|
assets.push({
|
|
@@ -85,6 +85,17 @@ export declare class MinerUTransformerStrategy implements IDocumentTransformerSt
|
|
|
85
85
|
enum: string[];
|
|
86
86
|
default: string;
|
|
87
87
|
};
|
|
88
|
+
pageRanges: {
|
|
89
|
+
type: string;
|
|
90
|
+
title: {
|
|
91
|
+
en_US: string;
|
|
92
|
+
zh_Hans: string;
|
|
93
|
+
};
|
|
94
|
+
description: {
|
|
95
|
+
en_US: string;
|
|
96
|
+
zh_Hans: string;
|
|
97
|
+
};
|
|
98
|
+
};
|
|
88
99
|
};
|
|
89
100
|
required: any[];
|
|
90
101
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,
|
|
1
|
+
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA0C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAE7F,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAmF1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAiEzD"}
|
|
@@ -5,13 +5,13 @@ import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
|
|
|
5
5
|
import { isNil, omitBy, pick } from 'lodash-es';
|
|
6
6
|
import { MinerUClient } from './mineru.client.js';
|
|
7
7
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
|
-
import { icon,
|
|
8
|
+
import { icon, MinerU } from './types.js';
|
|
9
9
|
let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
10
10
|
constructor() {
|
|
11
11
|
this.permissions = [
|
|
12
12
|
{
|
|
13
13
|
type: 'integration',
|
|
14
|
-
service:
|
|
14
|
+
service: MinerU,
|
|
15
15
|
description: 'Access to MinerU system integrations'
|
|
16
16
|
},
|
|
17
17
|
{
|
|
@@ -21,7 +21,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
21
21
|
}
|
|
22
22
|
];
|
|
23
23
|
this.meta = {
|
|
24
|
-
name:
|
|
24
|
+
name: MinerU,
|
|
25
25
|
label: {
|
|
26
26
|
en_US: 'MinerU',
|
|
27
27
|
zh_Hans: 'MinerU'
|
|
@@ -99,6 +99,17 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
99
99
|
},
|
|
100
100
|
enum: ['pipeline', 'vlm'],
|
|
101
101
|
default: 'pipeline'
|
|
102
|
+
},
|
|
103
|
+
pageRanges: {
|
|
104
|
+
type: 'string',
|
|
105
|
+
title: {
|
|
106
|
+
en_US: 'Page Ranges',
|
|
107
|
+
zh_Hans: '页码范围'
|
|
108
|
+
},
|
|
109
|
+
description: {
|
|
110
|
+
en_US: 'Page ranges like "2,4-6" or "2--2" (official API only).',
|
|
111
|
+
zh_Hans: '页码范围,例如 "2,4-6" 或 "2--2"(仅官方 API)。'
|
|
112
|
+
}
|
|
102
113
|
}
|
|
103
114
|
},
|
|
104
115
|
required: []
|
|
@@ -111,6 +122,7 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
111
122
|
async transformDocuments(documents, config) {
|
|
112
123
|
const mineru = new MinerUClient(this.configService, config.permissions);
|
|
113
124
|
const parsedResults = [];
|
|
125
|
+
const integrationOptions = config.permissions?.integration?.options;
|
|
114
126
|
for await (const document of documents) {
|
|
115
127
|
if (mineru.serverType === 'self-hosted') {
|
|
116
128
|
const { taskId } = await mineru.createTask({
|
|
@@ -125,8 +137,12 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
125
137
|
});
|
|
126
138
|
const result = mineru.getSelfHostedTask(taskId);
|
|
127
139
|
const parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, config.permissions.fileSystem);
|
|
128
|
-
parsedResult
|
|
129
|
-
parsedResults.push(
|
|
140
|
+
// Convert parsedResult to IKnowledgeDocument format
|
|
141
|
+
parsedResults.push({
|
|
142
|
+
id: document.id,
|
|
143
|
+
chunks: parsedResult.chunks,
|
|
144
|
+
metadata: parsedResult.metadata
|
|
145
|
+
});
|
|
130
146
|
}
|
|
131
147
|
else {
|
|
132
148
|
const { taskId } = await mineru.createTask({
|
|
@@ -136,13 +152,19 @@ let MinerUTransformerStrategy = class MinerUTransformerStrategy {
|
|
|
136
152
|
enableTable: true,
|
|
137
153
|
language: 'ch',
|
|
138
154
|
modelVersion: 'vlm',
|
|
139
|
-
|
|
155
|
+
pageRanges: config.pageRanges,
|
|
156
|
+
extraFormats: integrationOptions?.extraFormats,
|
|
157
|
+
...omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion', 'pageRanges']), isNil)
|
|
140
158
|
});
|
|
141
159
|
// Waiting for completion
|
|
142
160
|
const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
|
|
143
161
|
const parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, config.permissions.fileSystem);
|
|
144
|
-
parsedResult
|
|
145
|
-
parsedResults.push(
|
|
162
|
+
// Convert parsedResult to IKnowledgeDocument format
|
|
163
|
+
parsedResults.push({
|
|
164
|
+
id: document.id,
|
|
165
|
+
chunks: parsedResult.chunks,
|
|
166
|
+
metadata: parsedResult.metadata
|
|
167
|
+
});
|
|
146
168
|
}
|
|
147
169
|
}
|
|
148
170
|
return parsedResults;
|
|
@@ -158,6 +180,6 @@ __decorate([
|
|
|
158
180
|
], MinerUTransformerStrategy.prototype, "configService", void 0);
|
|
159
181
|
MinerUTransformerStrategy = __decorate([
|
|
160
182
|
Injectable(),
|
|
161
|
-
DocumentTransformerStrategy(
|
|
183
|
+
DocumentTransformerStrategy(MinerU)
|
|
162
184
|
], MinerUTransformerStrategy);
|
|
163
185
|
export { MinerUTransformerStrategy };
|