@helloxiaohu/plugin-mineru 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -258
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -8
- package/dist/lib/integration.strategy.d.ts.map +1 -1
- package/dist/lib/integration.strategy.js +20 -5
- package/dist/lib/mineru.client.d.ts +1 -5
- package/dist/lib/mineru.client.d.ts.map +1 -1
- package/dist/lib/mineru.client.js +165 -56
- package/dist/lib/mineru.plugin.d.ts.map +1 -1
- package/dist/lib/mineru.plugin.js +0 -2
- package/dist/lib/result-parser.service.d.ts +2 -2
- package/dist/lib/result-parser.service.d.ts.map +1 -1
- package/dist/lib/result-parser.service.js +44 -72
- package/dist/lib/transformer-mineru.strategy.d.ts +11 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -1
- package/dist/lib/transformer-mineru.strategy.js +31 -9
- package/dist/lib/types.d.ts +23 -40
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js +22 -35
- package/package.json +10 -18
- package/dist/lib/mineru-toolset.strategy.d.ts +0 -234
- package/dist/lib/mineru-toolset.strategy.d.ts.map +0 -1
- package/dist/lib/mineru-toolset.strategy.js +0 -306
- package/dist/lib/mineru.tool.d.ts +0 -35
- package/dist/lib/mineru.tool.d.ts.map +0 -1
- package/dist/lib/mineru.tool.js +0 -157
- package/dist/lib/mineru.toolset.d.ts +0 -50
- package/dist/lib/mineru.toolset.d.ts.map +0 -1
- package/dist/lib/mineru.toolset.js +0 -95
|
@@ -1,306 +0,0 @@
|
|
|
1
|
-
import { __decorate, __metadata, __param } from "tslib";
|
|
2
|
-
import { Injectable, forwardRef, Inject } from '@nestjs/common';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { ToolsetStrategy, } from '@xpert-ai/plugin-sdk';
|
|
5
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
6
|
-
import { MinerUToolset } from './mineru.toolset.js';
|
|
7
|
-
import { MinerU, icon } from './types.js';
|
|
8
|
-
import { buildMinerUTool } from './mineru.tool.js';
|
|
9
|
-
/**
 * Builds the config-schema entry for one of the on/off parsing switches.
 *
 * The switches are modelled as the string enum 'true' / 'false' (not booleans) and
 * all of them default to 'true'; only the localized title and description differ.
 *
 * @param {{en_US: string, zh_Hans: string}} title - Localized field title.
 * @param {{en_US: string, zh_Hans: string}} description - Localized field description.
 * @returns {object} A fresh schema fragment (never shared between fields).
 */
function buildToggleProperty(title, description) {
    return {
        type: 'string',
        title,
        description,
        enum: ['true', 'false'],
        default: 'true',
        'x-ui': {
            enumLabels: {
                'true': {
                    en_US: 'Enabled',
                    zh_Hans: '启用',
                },
                'false': {
                    en_US: 'Disabled',
                    zh_Hans: '禁用',
                },
            },
        },
    };
}

/**
 * ToolsetStrategy for the MinerU PDF parser tool.
 * Registers MinerU as a toolset that can be used in agent workflows.
 */
let MinerUToolsetStrategy = class MinerUToolsetStrategy {
    /**
     * @param configService - Injected configuration service, forwarded to the toolset and tools.
     * @param resultParser - Injected MinerU result parser, forwarded to the toolset and tools.
     */
    constructor(configService, resultParser) {
        this.configService = configService;
        this.resultParser = resultParser;
        /**
         * Metadata for the MinerU toolset (labels, icon and credential/config schema).
         */
        this.meta = {
            author: 'Xpert AI',
            tags: ['pdf', 'markdown', 'parser', 'ocr', 'mineru', 'document', 'extraction'],
            name: MinerU,
            label: {
                en_US: 'MinerU PDF Parser',
                zh_Hans: 'MinerU PDF 解析器',
            },
            description: {
                en_US: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction.',
                zh_Hans: '使用 MinerU 将 PDF 文件转换为 Markdown 格式。支持 OCR、公式识别和表格提取。',
            },
            icon: {
                // Both shapes are provided so that different platform icon resolvers can find the icon:
                // - builtin-provider icon endpoints may look for `type` / `value`
                // - toolset registries may look for `svg`
                type: 'svg',
                value: icon,
                svg: icon,
                color: '#14b8a6',
            },
            configSchema: {
                type: 'object',
                properties: {
                    /**
                     * NOTE:
                     * MinerU is intentionally a "self-contained" toolset that stores its own API
                     * credentials instead of relying on the platform IntegrationPermission flow.
                     *
                     * Reason: during the built-in toolset authorization step the platform may send
                     * `credentials = null`, and the backend may then access `credentials.integration`,
                     * causing a 500 (`Cannot read properties of null (reading 'integration')`).
                     * Declaring the API fields here makes the authorization UI render them and always
                     * submit an object.
                     */
                    apiUrl: {
                        type: 'string',
                        title: {
                            en_US: 'Base URL',
                            zh_Hans: 'Base URL',
                        },
                        description: {
                            en_US: 'MinerU API base url. Official: https://mineru.net/api/v4',
                            zh_Hans: 'MinerU 服务地址。官方: https://mineru.net/api/v4',
                        },
                        default: 'https://mineru.net/api/v4',
                        // apiUrl is optional (it has a default), so it is not listed in `required`.
                    },
                    apiKey: {
                        type: 'string',
                        title: {
                            en_US: 'API Key',
                            zh_Hans: 'API Key',
                        },
                        description: {
                            en_US: 'The API Key of the MinerU server (required)',
                            zh_Hans: 'MinerU 服务令牌(必填)',
                        },
                        'x-ui': {
                            component: 'secretInput',
                            label: 'API Key',
                            placeholder: 'MinerU API Key',
                            revealable: true,
                            maskSymbol: '*',
                            persist: true,
                        },
                    },
                    // Default parsing settings. The three switches below are string enums
                    // ('true' / 'false') rather than booleans.
                    isOcr: buildToggleProperty({
                        en_US: 'Enable OCR',
                        zh_Hans: '启用 OCR',
                    }, {
                        en_US: 'Enable OCR for image-based PDFs',
                        zh_Hans: '为基于图像的 PDF 启用 OCR',
                    }),
                    enableFormula: buildToggleProperty({
                        en_US: 'Enable Formula Recognition',
                        zh_Hans: '启用公式识别',
                    }, {
                        en_US: 'Enable formula recognition',
                        zh_Hans: '启用公式识别',
                    }),
                    enableTable: buildToggleProperty({
                        en_US: 'Enable Table Recognition',
                        zh_Hans: '启用表格识别',
                    }, {
                        en_US: 'Enable table recognition',
                        zh_Hans: '启用表格识别',
                    }),
                    language: {
                        type: 'string',
                        title: {
                            en_US: 'Document Language',
                            zh_Hans: '文档语言',
                        },
                        description: {
                            en_US: 'Document language: "en" for English, "ch" for Chinese (default: "ch")',
                            zh_Hans: '文档语言:"en" 表示英语,"ch" 表示中文(默认:"ch")',
                        },
                        enum: ['en', 'ch'],
                        default: 'ch',
                        'x-ui': {
                            enumLabels: {
                                'en': {
                                    en_US: 'en',
                                    zh_Hans: '英文',
                                },
                                'ch': {
                                    en_US: 'ch',
                                    zh_Hans: '中文',
                                },
                            },
                        },
                    },
                    modelVersion: {
                        type: 'string',
                        title: {
                            en_US: 'Model Version',
                            zh_Hans: '模型版本',
                        },
                        description: {
                            en_US: 'Model version: "pipeline" or "vlm" (default: "pipeline")',
                            zh_Hans: '模型版本:"pipeline" 或 "vlm"(默认:"pipeline")',
                        },
                        enum: ['pipeline', 'vlm'],
                        default: 'pipeline',
                        'x-ui': {
                            enumLabels: {
                                'pipeline': {
                                    en_US: 'pipeline',
                                    zh_Hans: 'pipeline',
                                },
                                'vlm': {
                                    en_US: 'vlm',
                                    zh_Hans: 'vlm',
                                },
                            },
                        },
                    },
                },
                required: ['apiKey'],
            },
        };
        /**
         * Permissions required by the MinerU toolset.
         */
        this.permissions = [
            {
                type: 'filesystem',
                operations: ['read', 'write', 'list'],
                scope: [],
            },
        ];
    }
    /**
     * Validate the toolset configuration.
     *
     * A missing config is accepted (nothing to validate yet). A config object without
     * an `apiKey` is rejected. The method is `async` so that the failure is always
     * delivered as a rejected promise, never as a synchronous throw from a
     * Promise-returning method.
     *
     * @param config - Toolset configuration, may be null/undefined.
     * @returns {Promise<void>} Resolves when the config is acceptable.
     * @throws {Error} (as a rejection) 'MinerU apiKey is required' when `config.apiKey` is falsy.
     */
    async validateConfig(config) {
        if (config && !config.apiKey) {
            throw new Error('MinerU apiKey is required');
        }
    }
    /**
     * Create a MinerU toolset instance.
     *
     * `config` may be null/undefined, a flat settings object, or an object carrying the
     * settings under a `credentials` property. Lookup priority:
     * `config.credentials` > `config` itself > empty object.
     *
     * @param config - Toolset record or flat settings; may be null/undefined.
     * @returns {Promise<MinerUToolset>} Toolset built from the settings plus the injected services.
     */
    async create(config) {
        const toolset = (config && typeof config === 'object' && 'credentials' in config)
            ? config
            : null;
        const creds = toolset?.credentials ?? config ?? {};
        const { apiUrl, apiKey, isOcr, enableFormula, enableTable, language, modelVersion } = creds;
        return new MinerUToolset({
            apiUrl,
            apiKey,
            isOcr,
            enableFormula,
            enableTable,
            language,
            modelVersion,
            // Runtime dependencies the toolset needs to build its tool.
            configService: this.configService,
            resultParser: this.resultParser,
        });
    }
    /**
     * Create the tool list exposed by this strategy.
     *
     * IMPORTANT: the console UI requires builtin providers to expose at least one tool so
     * users can enable it (otherwise it fails the "Enable at least one tool" validation).
     * The tool returned here is built without credentials and is meant for listing,
     * preview and toggling; the credential-aware tool is built by the toolset instance
     * returned from `create()`.
     *
     * @returns {Array} A single-element array containing the MinerU parser tool.
     */
    createTools() {
        // Defaults used if no parsing settings are supplied.
        const listingDefaults = {
            isOcr: true,
            enableFormula: true,
            enableTable: true,
            language: 'ch',
            modelVersion: 'pipeline',
        };
        return [
            // No credentials and no file system at listing time.
            buildMinerUTool(this.configService, this.resultParser, undefined, undefined, listingDefaults),
        ];
    }
};
|
|
298
|
-
MinerUToolsetStrategy = __decorate([
|
|
299
|
-
Injectable(),
|
|
300
|
-
ToolsetStrategy(MinerU),
|
|
301
|
-
__param(0, Inject(forwardRef(() => ConfigService))),
|
|
302
|
-
__param(1, Inject(MinerUResultParserService)),
|
|
303
|
-
__metadata("design:paramtypes", [ConfigService,
|
|
304
|
-
MinerUResultParserService])
|
|
305
|
-
], MinerUToolsetStrategy);
|
|
306
|
-
export { MinerUToolsetStrategy };
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import { XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
2
|
-
import { z } from 'zod';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
-
import { MinerUIntegrationOptions } from './types.js';
|
|
6
|
-
/**
 * Parsing settings handed to the MinerU tool as its defaults.
 * Every field is optional.
 */
export interface MinerUToolDefaults {
    /** OCR switch, given either as a boolean or as a string. */
    isOcr?: boolean | string;
    /** Formula-recognition switch, given either as a boolean or as a string. */
    enableFormula?: boolean | string;
    /** Table-recognition switch, given either as a boolean or as a string. */
    enableTable?: boolean | string;
    /** Document language code. */
    language?: 'en' | 'ch';
    /** MinerU model variant. */
    modelVersion?: 'pipeline' | 'vlm';
}
|
|
16
|
-
/**
 * Build the MinerU PDF parser tool.
 * The tool converts PDF files to markdown format using the MinerU service.
 *
 * @param configService - Configuration service used by the tool.
 * @param resultParser - Parser for MinerU results.
 * @param options - Optional MinerU integration options.
 * @param fileSystem - Optional file system handle.
 * @param defaults - Optional default parsing settings.
 * @returns A structured tool whose input is `{ doc_url }` and whose output is an array of
 *          strings and `{ files, taskId, metadata }` objects.
 */
export declare function buildMinerUTool(
    configService: ConfigService,
    resultParser: MinerUResultParserService,
    options?: MinerUIntegrationOptions,
    fileSystem?: XpFileSystem,
    defaults?: MinerUToolDefaults
): import("@langchain/core/tools").DynamicStructuredTool<
    z.ZodObject<
        {
            doc_url: z.ZodString;
        },
        "strip",
        z.ZodTypeAny,
        {
            doc_url?: string;
        },
        {
            doc_url?: string;
        }
    >,
    {
        doc_url?: string;
    },
    {
        doc_url?: string;
    },
    (string | {
        files: any[];
        taskId: string;
        metadata: any;
    })[]
>;
|
|
35
|
-
//# sourceMappingURL=mineru.tool.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.tool.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.tool.ts"],"names":[],"mappings":"AAEA,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAqB,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEzE;;GAEG;AACH,MAAM,WAAW,kBAAkB;IAEjC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAC7B,aAAa,EAAE,aAAa,EAC5B,YAAY,EAAE,yBAAyB,EACvC,OAAO,CAAC,EAAE,wBAAwB,EAClC,UAAU,CAAC,EAAE,YAAY,EACzB,QAAQ,CAAC,EAAE,kBAAkB;;;;;;;;;;;;;;MAmL9B"}
|
package/dist/lib/mineru.tool.js
DELETED
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
import { tool } from '@langchain/core/tools';
|
|
2
|
-
import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
3
|
-
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
|
-
import { z } from 'zod';
|
|
5
|
-
import { MinerUClient } from './mineru.client.js';
|
|
6
|
-
import { MinerUIntegration } from './types.js';
|
|
7
|
-
/** MinerU endpoint used when no `apiUrl` is configured. */
const DEFAULT_MINERU_API_URL = 'https://mineru.net/api/v4';

/** File name used when none can be derived from the document URL. */
const DEFAULT_DOCUMENT_NAME = 'document.pdf';

/** MIME types for image assets, keyed by lower-case extension. Unknown extensions fall back to JPEG. */
const IMAGE_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
    ['svg', 'image/svg+xml'],
]);

/** MIME types for non-image assets, keyed by lower-case extension. Unknown extensions fall back to JSON. */
const FILE_MIME_TYPES = new Map([
    ['md', 'text/markdown'],
    ['json', 'application/json'],
    ['txt', 'text/plain'],
    ['html', 'text/html'],
    ['csv', 'text/csv'],
]);

/**
 * Normalizes an on/off setting to a boolean.
 * `undefined` means "not configured" and resolves to `true`; strings are on only when
 * exactly 'true'; any other value is on only when it is the boolean `true`.
 */
function resolveBooleanSetting(value) {
    if (value === undefined) {
        return true;
    }
    return typeof value === 'string' ? value === 'true' : value === true;
}

/**
 * Produces a log-safe representation of the API key: first/last four characters for
 * keys longer than eight characters, 'provided' for shorter keys, 'missing' otherwise.
 */
function maskApiKey(apiKey) {
    if (!apiKey) {
        return 'missing';
    }
    return apiKey.length > 8 ? `${apiKey.slice(0, 4)}***${apiKey.slice(-4)}` : 'provided';
}

/**
 * Derives a file name from the last path segment of the document URL.
 * Falls back to 'document.pdf' when the URL cannot be parsed or has no last segment.
 */
function fileNameFromUrl(docUrl) {
    try {
        return new URL(docUrl).pathname.split('/').pop() || DEFAULT_DOCUMENT_NAME;
    }
    catch {
        // Not an absolute URL: a file name is still needed, so use the generic one.
        return DEFAULT_DOCUMENT_NAME;
    }
}

/**
 * Converts one parsed asset into the file-artifact record returned by the tool.
 * The name comes from the local path, then the URL, then the literal 'file'. The
 * extension is the text after the last dot ('md' when there is none), and the MIME
 * type is looked up from the extension according to the asset type.
 */
function toFileArtifact(asset) {
    const fileName = asset.filePath?.split(/[/\\]/).pop() || asset.url?.split('/').pop() || 'file';
    const dotIndex = fileName.lastIndexOf('.');
    // A name without a dot has no extension; do not mistake the whole name for one.
    const extension = (dotIndex === -1 ? '' : fileName.slice(dotIndex + 1).toLowerCase()) || 'md';
    const mimeType = asset.type === 'image'
        ? (IMAGE_MIME_TYPES.get(extension) ?? 'image/jpeg')
        : (FILE_MIME_TYPES.get(extension) ?? 'application/json');
    return {
        fileName,
        filePath: asset.filePath,
        fileUrl: asset.url,
        mimeType,
        extension,
    };
}

/**
 * Build the MinerU PDF parser tool.
 * The tool converts a PDF (given by URL) to markdown using the MinerU service.
 *
 * @param configService - Configuration service passed to the MinerU client.
 * @param resultParser - Service that turns a MinerU result into chunks and asset metadata.
 * @param options - Optional credentials (`apiUrl`, `apiKey`); `apiUrl` defaults to the official endpoint.
 * @param fileSystem - Optional file system handle passed to the client and the parser.
 * @param defaults - Optional parsing settings (`isOcr`, `enableFormula`, `enableTable`,
 *                   `language`, `modelVersion`); unset switches are on, language defaults
 *                   to 'ch' and model version to 'pipeline'.
 * @returns A tool named 'mineru_pdf_parser' that takes `{ doc_url }` and returns
 *          `[markdown, { files, taskId, metadata }]`.
 *          Any failure inside the tool is rethrown as
 *          `Error('MinerU processing failed: …')` with the original error as `cause`.
 */
export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
    return tool(async (input) => {
        try {
            const { doc_url } = input;
            // Only doc_url and the names of any extra keys are logged, never their values.
            console.debug('[MinerU] tool invoked with input', { doc_url, extraKeys: Object.keys(input || {}).filter((k) => k !== 'doc_url') });
            if (!doc_url) {
                throw new Error('doc_url is required');
            }
            // Workspace folder from the current task state, with a fixed fallback.
            const currentState = getCurrentTaskInput();
            const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
            // Credentials supplied when the tool was built.
            const finalApiUrl = options?.apiUrl || DEFAULT_MINERU_API_URL;
            const finalApiKey = options?.apiKey;
            // Log the effective config (with the key masked) to confirm whether the
            // credentials from the authorization page were received.
            console.debug('[MinerU] buildMinerUTool config', {
                fromOptions: Boolean(options),
                apiUrl: finalApiUrl,
                apiKey: maskApiKey(finalApiKey),
                defaults: {
                    isOcr: defaults?.isOcr,
                    enableFormula: defaults?.enableFormula,
                    enableTable: defaults?.enableTable,
                    language: defaults?.language,
                    modelVersion: defaults?.modelVersion,
                },
            });
            const mineruClient = new MinerUClient(configService, {
                fileSystem,
                integration: {
                    provider: MinerUIntegration,
                    options: {
                        apiUrl: finalApiUrl,
                        apiKey: finalApiKey,
                    },
                },
            });
            const finalFileName = fileNameFromUrl(doc_url);
            const { taskId } = await mineruClient.createTask({
                url: doc_url,
                fileName: finalFileName,
                isOcr: resolveBooleanSetting(defaults?.isOcr),
                enableFormula: resolveBooleanSetting(defaults?.enableFormula),
                enableTable: resolveBooleanSetting(defaults?.enableTable),
                language: defaults?.language || 'ch',
                modelVersion: defaults?.modelVersion || 'pipeline',
            });
            const documentInfo = {
                fileUrl: doc_url,
                name: finalFileName,
                folder: workspacePath,
            };
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                // Self-hosted: the result is read back from the client right away.
                const taskResult = mineruClient.getSelfHostedTask(taskId);
                if (!taskResult) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(taskResult, taskId, documentInfo, fileSystem);
            }
            else {
                // Official API: wait for completion (5 minute limit, 5 second interval).
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, documentInfo, fileSystem);
            }
            // Only 'file' and 'image' assets are exposed as artifacts.
            const fileArtifacts = (parsedResult.metadata?.assets || [])
                .filter((asset) => asset.type === 'file' || asset.type === 'image')
                .map(toFileArtifact);
            const markdownContent = parsedResult.chunks
                ?.map((chunk) => chunk.pageContent)
                .join('\n\n') || '';
            // Return the full markdown (do NOT truncate); a UI that needs a preview can truncate client-side.
            return [
                markdownContent,
                {
                    files: fileArtifacts,
                    taskId,
                    metadata: parsedResult.metadata,
                },
            ];
        }
        catch (error) {
            // Keep the original error (and its stack) reachable through `cause`.
            throw new Error(`MinerU processing failed: ${getErrorMessage(error)}`, { cause: error });
        }
    }, {
        name: 'mineru_pdf_parser',
        description: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files (images, JSON, etc.).',
        schema: z.object({
            doc_url: z.string().min(1).describe('PDF URL (required)'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
|
|
2
|
-
import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
-
/**
 * Settings and runtime dependencies accepted by the MinerU toolset.
 * Every field is optional.
 */
export interface MinerUToolsetConfig {
    /** MinerU API base URL, stored in the toolset credentials. */
    apiUrl?: string;
    /** MinerU API key, stored in the toolset credentials. */
    apiKey?: string;
    /** File system handle. */
    fileSystem?: XpFileSystem;
    /** Configuration service dependency. */
    configService?: ConfigService;
    /** MinerU result parser dependency. */
    resultParser?: MinerUResultParserService;
    /** OCR switch, given either as a boolean or as a string. */
    isOcr?: boolean | string;
    /** Formula-recognition switch, given either as a boolean or as a string. */
    enableFormula?: boolean | string;
    /** Table-recognition switch, given either as a boolean or as a string. */
    enableTable?: boolean | string;
    /** Document language code. */
    language?: 'en' | 'ch';
    /** MinerU model variant. */
    modelVersion?: 'pipeline' | 'vlm';
}
|
|
23
|
-
/**
 * MinerU toolset.
 * Exposes the PDF-to-markdown conversion tool backed by the MinerU service.
 */
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
    private readonly config;
    tools: any[];
    /**
     * Creates the toolset from a config that carries both credentials and dependencies.
     * Note: the implementation uses 'as any' for the base-class params because
     * TBuiltinToolsetParams requires system-provided properties (tenantId, env) that are
     * added at runtime.
     */
    constructor(config: MinerUToolsetConfig);
    /**
     * Credential validation hook.
     * Credentials may be incomplete during the authorization phase, and configService /
     * resultParser are runtime dependencies injected by the strategy, so nothing is
     * validated here; this lets authorization proceed.
     */
    _validateCredentials(credentials: MinerUToolsetConfig): Promise<void>;
    /**
     * Builds the toolset's tools: the PDF parser tool, wired with its dependencies.
     */
    initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
}
|
|
50
|
-
//# sourceMappingURL=mineru.toolset.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IAGzC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAEpC,KAAK,EAAE,GAAG,EAAE,CAAM;IAE3B;;;;;OAKG;gBACS,MAAM,EAAE,mBAAmB;IAgBvC;;;;;OAKG;IACY,oBAAoB,CAAC,WAAW,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAKpF;;;OAGG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA+DzF"}
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
|
|
2
|
-
import { buildMinerUTool } from './mineru.tool.js';
|
|
3
|
-
/**
 * Produces a log-safe form of an API key: first/last four characters for keys longer
 * than eight characters, '***' for shorter keys, 'missing' when there is no key.
 */
function maskApiKeyForLog(apiKey) {
    if (!apiKey) {
        return 'missing';
    }
    return apiKey.length > 8
        ? `${apiKey.substring(0, 4)}...${apiKey.substring(apiKey.length - 4)}`
        : '***';
}

/**
 * Normalizes an on/off setting to a boolean.
 * An unset value (undefined/null) counts as enabled; otherwise the setting is enabled
 * only for the string 'true' or the boolean `true`.
 */
function isSettingEnabled(value) {
    if (value === undefined || value === null) {
        return true;
    }
    return value === 'true' || value === true;
}

/**
 * MinerU Toolset implementation.
 * Provides the PDF to markdown conversion tool using the MinerU service.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * @param config - Credentials (`apiUrl`, `apiKey`), parsing settings and runtime
     *                 dependencies (`configService`, `resultParser`, `fileSystem`).
     *
     * Note: the second base-class argument is left undefined because the system-provided
     * params (tenantId, env) are added at runtime.
     */
    constructor(config) {
        super('mineru', undefined, config);
        // Ensure `tools` exists even if upstream BuiltinToolset typings differ across versions.
        this.tools = [];
        this.config = config;
        // Log only the scalar settings, with the API key masked. The injected services and
        // the file system are deliberately left out: serializing them could expose
        // unrelated configuration or fail on circular references.
        const { apiUrl, apiKey, isOcr, enableFormula, enableTable, language, modelVersion } = config ?? {};
        const configForLog = {
            apiUrl,
            apiKey: maskApiKeyForLog(apiKey),
            isOcr,
            enableFormula,
            enableTable,
            language,
            modelVersion,
        };
        // The logger comes from the base class and may not be present.
        if ('logger' in this && this.logger) {
            this.logger.log(`[MinerU] MinerUToolset constructor received config: ${JSON.stringify(configForLog, null, 2)}`);
        }
    }
    /**
     * Validate credentials for the MinerU toolset.
     *
     * Intentionally a no-op: credentials may be incomplete during the authorization
     * phase, and `configService` / `resultParser` are runtime dependencies injected by
     * the strategy. API key validity is left to the MinerU server when the tool is used.
     *
     * @param credentials - Ignored.
     * @returns {Promise<void>}
     */
    async _validateCredentials(credentials) {
        // Nothing to validate here (see above).
    }
    /**
     * Initialize the tools for this toolset.
     *
     * Builds the PDF parser tool from the stored config: `apiUrl` falls back to the
     * official endpoint, unset on/off switches are treated as enabled, `language`
     * falls back to 'ch' and `modelVersion` to 'pipeline'.
     *
     * @returns {Promise<Array>} The tool list, also stored on `this.tools`.
     * @throws {Error} When `configService` or `resultParser` is missing from the config.
     */
    async initTools() {
        const { configService, resultParser, apiUrl, apiKey, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion } = this.config;
        const maskedApiKey = maskApiKeyForLog(apiKey);
        // The logger comes from the base class and may not be present.
        const canLog = 'logger' in this && Boolean(this.logger);
        if (canLog) {
            this.logger.log(`[MinerU] MinerUToolset.initTools() - config keys: ${Object.keys(this.config).join(', ')}, hasApiKey: ${'apiKey' in this.config}, apiKey: ${maskedApiKey}`);
            this.logger.log(`[MinerU] MinerUToolset.initTools() - destructured apiKey: ${maskedApiKey}`);
        }
        if (!configService || !resultParser) {
            throw new Error('ConfigService and MinerUResultParserService are required');
        }
        const finalApiUrl = apiUrl || 'https://mineru.net/api/v4';
        if (canLog) {
            this.logger.log(`[MinerU] MinerUToolset.initTools() - passing to buildMinerUTool: apiUrl=${finalApiUrl}, apiKey=${maskedApiKey}`);
        }
        this.tools = [
            buildMinerUTool(configService, resultParser, {
                apiUrl: finalApiUrl,
                apiKey,
            }, fileSystem, {
                // Unset switches must stay enabled rather than silently turning
                // OCR / formula / table recognition off.
                isOcr: isSettingEnabled(isOcr),
                enableFormula: isSettingEnabled(enableFormula),
                enableTable: isSettingEnabled(enableTable),
                language: language || 'ch',
                modelVersion: modelVersion || 'pipeline',
            }),
        ];
        return this.tools;
    }
}
|