@helloxiaohu/plugin-mineru 0.0.20 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -101
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -8
- package/dist/lib/integration.strategy.d.ts.map +1 -1
- package/dist/lib/integration.strategy.js +19 -4
- package/dist/lib/mineru.client.d.ts.map +1 -1
- package/dist/lib/mineru.client.js +157 -11
- package/dist/lib/mineru.plugin.d.ts.map +1 -1
- package/dist/lib/mineru.plugin.js +0 -2
- package/dist/lib/result-parser.service.d.ts.map +1 -1
- package/dist/lib/result-parser.service.js +1 -0
- package/dist/lib/transformer-mineru.strategy.d.ts +11 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -1
- package/dist/lib/transformer-mineru.strategy.js +31 -9
- package/dist/lib/types.d.ts +3 -13
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js +22 -35
- package/package.json +54 -62
- package/dist/lib/mineru-toolset.strategy.d.ts +0 -167
- package/dist/lib/mineru-toolset.strategy.d.ts.map +0 -1
- package/dist/lib/mineru-toolset.strategy.js +0 -216
- package/dist/lib/mineru.tool.d.ts +0 -70
- package/dist/lib/mineru.tool.d.ts.map +0 -1
- package/dist/lib/mineru.tool.js +0 -145
- package/dist/lib/mineru.toolset.d.ts +0 -51
- package/dist/lib/mineru.toolset.d.ts.map +0 -1
- package/dist/lib/mineru.toolset.js +0 -52
|
@@ -1,216 +0,0 @@
|
|
|
1
|
-
import { __decorate, __metadata, __param } from "tslib";
|
|
2
|
-
import { Injectable, forwardRef, Inject } from '@nestjs/common';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { ToolsetStrategy, } from '@xpert-ai/plugin-sdk';
|
|
5
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
6
|
-
import { MinerUToolset } from './mineru.toolset.js';
|
|
7
|
-
import { MinerU, icon } from './types.js';
|
|
8
|
-
/**
 * Toolset strategy for the MinerU PDF parser.
 *
 * Holds the toolset metadata (labels, icon, JSON config schema), declares the
 * permissions the toolset asks for, validates a submitted configuration and
 * builds `MinerUToolset` instances that carry the injected dependencies.
 */
let MinerUToolsetStrategy = class MinerUToolsetStrategy {
    /**
     * @param configService - stored and handed to every toolset built by `create()`
     * @param resultParser - stored and handed to every toolset built by `create()`
     */
    constructor(configService, resultParser) {
        this.configService = configService;
        this.resultParser = resultParser;
        /**
         * Metadata for MinerU toolset
         */
        this.meta = {
            author: 'Xpert AI',
            tags: ['pdf', 'markdown', 'parser', 'ocr', 'mineru', 'document', 'extraction'],
            name: MinerU,
            label: {
                en_US: 'MinerU PDF Parser',
                zh_Hans: 'MinerU PDF 解析器',
            },
            description: {
                en_US: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction.',
                zh_Hans: '使用 MinerU 将 PDF 文件转换为 Markdown 格式。支持 OCR、公式识别和表格提取。',
            },
            icon: {
                svg: icon,
                color: '#14b8a6',
            },
            configSchema: {
                type: 'object',
                properties: {
                    /**
                     * NOTE:
                     * We intentionally keep MinerU as a "self-contained" toolset that stores its own API credentials,
                     * instead of relying on the platform IntegrationPermission flow.
                     *
                     * Reason: during the built-in toolset authorization step, the platform may send `credentials = null`,
                     * and backend may access `credentials.integration`, causing a 500 (`Cannot read properties of null (reading 'integration')`).
                     * Defining API fields directly ensures the authorization UI renders fields and always submits an object.
                     */
                    apiUrl: {
                        type: 'string',
                        title: {
                            en_US: 'Base URL',
                            zh_Hans: 'Base URL',
                        },
                        description: {
                            en_US: 'MinerU API base url. Official: https://api.mineru.dev',
                            zh_Hans: 'MinerU 服务地址。官方: https://api.mineru.dev',
                        },
                        default: 'https://api.mineru.dev',
                    },
                    apiKey: {
                        type: 'string',
                        title: {
                            en_US: 'API Key',
                            zh_Hans: 'API Key',
                        },
                        description: {
                            en_US: 'The API Key of the MinerU server (required for official API)',
                            zh_Hans: 'MinerU 服务令牌(官方 API 必填)',
                        },
                        'x-ui': {
                            component: 'secretInput',
                            label: 'API Key',
                            placeholder: 'MinerU API Key',
                            revealable: true,
                            maskSymbol: '*',
                            persist: true,
                        },
                    },
                    serverType: {
                        type: 'string',
                        title: {
                            en_US: 'Server Type',
                            zh_Hans: '服务类型',
                        },
                        description: {
                            en_US: 'Select MinerU service type: official API or self-hosted',
                            zh_Hans: '选择 MinerU 服务类型:官方 API 或自部署',
                        },
                        enum: ['official', 'self-hosted'],
                        default: 'official',
                    },
                    // Default parsing settings (optional, can be overridden when calling the tool)
                    isOcr: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable OCR',
                            zh_Hans: '启用 OCR',
                        },
                        description: {
                            en_US: 'Enable OCR for image-based PDFs (default: true)',
                            zh_Hans: '为基于图像的 PDF 启用 OCR(默认:true)',
                        },
                        default: true,
                    },
                    enableFormula: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Formula Recognition',
                            zh_Hans: '启用公式识别',
                        },
                        description: {
                            en_US: 'Enable formula recognition (default: true)',
                            zh_Hans: '启用公式识别(默认:true)',
                        },
                        default: true,
                    },
                    enableTable: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Table Recognition',
                            zh_Hans: '启用表格识别',
                        },
                        description: {
                            en_US: 'Enable table recognition (default: true)',
                            zh_Hans: '启用表格识别(默认:true)',
                        },
                        default: true,
                    },
                    language: {
                        type: 'string',
                        title: {
                            en_US: 'Document Language',
                            zh_Hans: '文档语言',
                        },
                        description: {
                            en_US: 'Document language: "en" for English, "ch" for Chinese (default: "ch")',
                            zh_Hans: '文档语言:"en" 表示英语,"ch" 表示中文(默认:"ch")',
                        },
                        enum: ['en', 'ch'],
                        default: 'ch',
                    },
                    modelVersion: {
                        type: 'string',
                        title: {
                            en_US: 'Model Version',
                            zh_Hans: '模型版本',
                        },
                        description: {
                            en_US: 'Model version: "pipeline" or "vlm" (default: "pipeline")',
                            zh_Hans: '模型版本:"pipeline" 或 "vlm"(默认:"pipeline")',
                        },
                        enum: ['pipeline', 'vlm'],
                        default: 'pipeline',
                    },
                },
                required: ['serverType'], // apiKey required only for official, validated in validateConfig
            },
        };
        /**
         * Permissions required by MinerU toolset
         */
        this.permissions = [
            {
                type: 'filesystem',
                operations: ['read', 'write', 'list'],
                scope: [],
            },
        ];
    }
    /**
     * Validate toolset configuration.
     *
     * Declared `async` so that a failed validation always surfaces as a
     * rejected promise. The success path has always returned a promise; the
     * failure path used to throw synchronously, which callers relying on
     * `.catch()` would miss.
     *
     * @param config - submitted toolset configuration; a null/undefined/falsy
     *   value is accepted without further checks
     * @returns a promise that resolves with no value when the config is acceptable
     * @throws {Error} (as a rejection) when `serverType` is 'official' — the
     *   default when unset — and `apiKey` is missing, empty or only whitespace
     */
    async validateConfig(config) {
        if (!config) {
            return;
        }
        const serverType = config.serverType ?? 'official';
        // A key made only of whitespace cannot authenticate, so treat it as absent.
        const apiKey = typeof config.apiKey === 'string' ? config.apiKey.trim() : config.apiKey;
        if (serverType === 'official' && !apiKey) {
            throw new Error('MinerU apiKey is required for official serverType');
        }
    }
    /**
     * Create MinerU toolset instance.
     * Note: config may be null/undefined during authorization phase, in which
     * case the toolset receives only the injected dependencies.
     *
     * @param config - toolset configuration; its properties are copied, never mutated
     * @returns a new `MinerUToolset` built from the config plus `configService`
     *   and `resultParser`
     */
    async create(config) {
        const configWithDependencies = {
            ...(config || {}),
            // Injected after the spread so the strategy's own dependencies win
            // over same-named properties in the submitted config.
            configService: this.configService,
            resultParser: this.resultParser,
        };
        return new MinerUToolset(configWithDependencies);
    }
    /**
     * Create tools for MinerU toolset.
     * Tools are created dynamically in MinerUToolset.initTools()
     * based on the toolset credentials/configuration, so none are returned here.
     *
     * @returns always an empty array
     */
    createTools() {
        return [];
    }
};
|
|
208
|
-
MinerUToolsetStrategy = __decorate([
|
|
209
|
-
Injectable(),
|
|
210
|
-
ToolsetStrategy(MinerU),
|
|
211
|
-
__param(0, Inject(forwardRef(() => ConfigService))),
|
|
212
|
-
__param(1, Inject(MinerUResultParserService)),
|
|
213
|
-
__metadata("design:paramtypes", [ConfigService,
|
|
214
|
-
MinerUResultParserService])
|
|
215
|
-
], MinerUToolsetStrategy);
|
|
216
|
-
export { MinerUToolsetStrategy };
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
import { XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
2
|
-
import { z } from 'zod';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
-
import { MinerUIntegrationOptions } from './types.js';
|
|
6
|
-
/**
 * Toolset-level fallbacks for the parsing options of the MinerU tool.
 * Every member is optional.
 */
export interface MinerUToolDefaults {
    /** Fallback for the tool's `isOcr` input. */
    isOcr?: boolean;
    /** Fallback for the tool's `enableFormula` input. */
    enableFormula?: boolean;
    /** Fallback for the tool's `enableTable` input. */
    enableTable?: boolean;
    /** Fallback for the tool's `language` input. */
    language?: 'en' | 'ch';
    /** Fallback for the tool's `modelVersion` input. */
    modelVersion?: 'pipeline' | 'vlm';
}
|
|
16
|
-
/**
 * Build the MinerU PDF parser tool, which converts PDF files to markdown
 * using the MinerU service.
 *
 * @param configService - configuration service handed to the tool
 * @param resultParser - parser service handed to the tool
 * @param options - optional MinerU integration options
 * @param fileSystem - optional file system handed to the tool
 * @param defaults - optional toolset-level parsing defaults
 * @returns a structured tool whose input accepts eight optional, nullable
 *   fields and whose output is an array of strings and
 *   `{ files, taskId, metadata }` objects
 */
export declare function buildMinerUTool(
    configService: ConfigService,
    resultParser: MinerUResultParserService,
    options?: MinerUIntegrationOptions,
    fileSystem?: XpFileSystem,
    defaults?: MinerUToolDefaults
): import("@langchain/core/tools").DynamicStructuredTool<z.ZodObject<{
    fileUrl: z.ZodNullable<z.ZodOptional<z.ZodString>>;
    filePath: z.ZodNullable<z.ZodOptional<z.ZodString>>;
    fileName: z.ZodNullable<z.ZodOptional<z.ZodString>>;
    isOcr: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    enableFormula: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    enableTable: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    language: z.ZodNullable<z.ZodOptional<z.ZodEnum<["en", "ch"]>>>;
    modelVersion: z.ZodNullable<z.ZodOptional<z.ZodEnum<["pipeline", "vlm"]>>>;
}, "strip", z.ZodTypeAny, {
    filePath?: string;
    fileUrl?: string;
    isOcr?: boolean;
    enableFormula?: boolean;
    enableTable?: boolean;
    language?: "en" | "ch";
    modelVersion?: "vlm" | "pipeline";
    fileName?: string;
}, {
    filePath?: string;
    fileUrl?: string;
    isOcr?: boolean;
    enableFormula?: boolean;
    enableTable?: boolean;
    language?: "en" | "ch";
    modelVersion?: "vlm" | "pipeline";
    fileName?: string;
}>, {
    filePath?: string;
    fileUrl?: string;
    isOcr?: boolean;
    enableFormula?: boolean;
    enableTable?: boolean;
    language?: "en" | "ch";
    modelVersion?: "vlm" | "pipeline";
    fileName?: string;
}, {
    filePath?: string;
    fileUrl?: string;
    isOcr?: boolean;
    enableFormula?: boolean;
    enableTable?: boolean;
    language?: "en" | "ch";
    modelVersion?: "vlm" | "pipeline";
    fileName?: string;
}, (string | {
    files: any[];
    taskId: string;
    metadata: any;
})[]>;
|
|
70
|
-
//# sourceMappingURL=mineru.tool.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.tool.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.tool.ts"],"names":[],"mappings":"AAEA,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAqB,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEzE;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAC7B,aAAa,EAAE,aAAa,EAC5B,YAAY,EAAE,yBAAyB,EACvC,OAAO,CAAC,EAAE,wBAAwB,EAClC,UAAU,CAAC,EAAE,YAAY,EACzB,QAAQ,CAAC,EAAE,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAkK9B"}
|
package/dist/lib/mineru.tool.js
DELETED
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
import { tool } from '@langchain/core/tools';
|
|
2
|
-
import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
3
|
-
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
|
-
import { z } from 'zod';
|
|
5
|
-
import { MinerUClient } from './mineru.client.js';
|
|
6
|
-
import { MinerUIntegration } from './types.js';
|
|
7
|
-
/**
 * MIME types for assets of type 'image', keyed by lower-case extension.
 * Extensions that are not listed keep the previous fallback, 'image/jpeg'.
 */
const MINERU_IMAGE_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
    ['svg', 'image/svg+xml'],
]);
/**
 * MIME types for assets of type 'file', keyed by lower-case extension.
 * Extensions that are not listed keep the previous fallback, 'application/json'.
 */
const MINERU_FILE_MIME_TYPES = new Map([
    ['md', 'text/markdown'],
    ['json', 'application/json'],
    ['txt', 'text/plain'],
    ['html', 'text/html'],
    ['pdf', 'application/pdf'],
]);
/**
 * Derive a document name from the last segment of the URL path or, failing
 * that, of the local path. Falls back to 'document.pdf' when the URL cannot
 * be parsed or the last segment is empty.
 */
function inferMinerUFileName(fileUrl, filePath) {
    if (fileUrl) {
        try {
            return new URL(fileUrl).pathname.split('/').pop() || 'document.pdf';
        }
        catch {
            // Not a parseable URL: there is no path to take a name from.
            return 'document.pdf';
        }
    }
    if (filePath) {
        return filePath.split(/[/\\]/).pop() || 'document.pdf';
    }
    return 'document.pdf';
}
/**
 * Describe one parsed asset as a file artifact: name, locations, extension
 * and MIME type.
 */
function toMinerUFileArtifact(asset) {
    const assetName = asset.filePath?.split(/[/\\]/).pop() || asset.url?.split('/').pop() || 'file';
    // Only text after the last dot is an extension. A name without any dot has
    // none and gets the 'md' fallback instead of being used as its own extension.
    const dotIndex = assetName.lastIndexOf('.');
    const extension = (dotIndex >= 0 ? assetName.slice(dotIndex + 1).toLowerCase() : '') || 'md';
    const mimeType = asset.type === 'image'
        ? (MINERU_IMAGE_MIME_TYPES.get(extension) ?? 'image/jpeg')
        : (MINERU_FILE_MIME_TYPES.get(extension) ?? 'application/json');
    return {
        fileName: assetName,
        filePath: asset.filePath,
        fileUrl: asset.url,
        mimeType,
        extension,
    };
}
/**
 * Build MinerU PDF parser tool.
 * This tool converts PDF files to markdown format using MinerU service.
 *
 * Each invocation creates a MinerU task for the given URL or path, obtains
 * the parsed result (directly for a self-hosted client, after waiting for
 * completion otherwise) and returns a two-element array: a text summary with
 * at most the first 1000 characters of the markdown, and an artifact object
 * `{ files, taskId, metadata }`.
 *
 * @param configService - passed to the `MinerUClient` constructor
 * @param resultParser - provides `parseLocalTask` and `parseFromUrl`
 * @param options - MinerU API options; when falsy the client gets no integration
 * @param fileSystem - passed to the client and to the result parser
 * @param defaults - toolset-level parsing defaults, used when an input field
 *   is null or undefined; each option falls back further to a built-in value
 *   (true, true, true, 'ch', 'pipeline')
 * @returns the tool created by `tool()` with name 'mineru_pdf_parser'
 * @throws {Error} from the tool call, prefixed 'MinerU processing failed: ',
 *   with the original error attached as `cause`
 */
export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
    return tool(async (input) => {
        try {
            const { fileUrl, filePath, fileName, isOcr, enableFormula, enableTable, language, modelVersion } = input;
            // Use defaults from toolset config if not provided in input
            const finalIsOcr = isOcr ?? defaults?.isOcr ?? true;
            const finalEnableFormula = enableFormula ?? defaults?.enableFormula ?? true;
            const finalEnableTable = enableTable ?? defaults?.enableTable ?? true;
            const finalLanguage = language ?? defaults?.language ?? 'ch';
            const finalModelVersion = modelVersion ?? defaults?.modelVersion ?? 'pipeline';
            if (!fileUrl && !filePath) {
                throw new Error('Either fileUrl or filePath must be provided');
            }
            // Get workspace context from current task
            const currentState = getCurrentTaskInput();
            const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
            // Create MinerU client with API options stored in toolset credentials
            const integration = options
                ? {
                    provider: MinerUIntegration,
                    options,
                }
                : undefined;
            const mineruClient = new MinerUClient(configService, {
                fileSystem,
                integration,
            });
            // Determine file name if not provided
            const finalFileName = fileName || inferMinerUFileName(fileUrl, filePath);
            // Create MinerU task
            const { taskId } = await mineruClient.createTask({
                url: fileUrl,
                filePath: filePath,
                fileName: finalFileName,
                isOcr: finalIsOcr,
                enableFormula: finalEnableFormula,
                enableTable: finalEnableTable,
                language: finalLanguage,
                modelVersion: finalModelVersion,
            });
            // Both parser entry points receive the same description of the source document.
            const sourceDocument = {
                fileUrl,
                filePath,
                name: finalFileName,
                folder: workspacePath,
            };
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                // Self-hosted: get result immediately
                const taskResult = mineruClient.getSelfHostedTask(taskId);
                if (!taskResult) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(taskResult, taskId, sourceDocument, fileSystem);
            }
            else {
                // Official API: wait for completion.
                // NOTE(review): the numeric arguments look like a 5-minute timeout and a
                // 5-second poll interval — confirm against MinerUClient.waitForTask.
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, sourceDocument, fileSystem);
            }
            // Build file artifacts from parsed result; only 'file' and 'image' assets are reported
            const fileArtifacts = (parsedResult.metadata?.assets ?? [])
                .filter((asset) => asset.type === 'file' || asset.type === 'image')
                .map(toMinerUFileArtifact);
            // Extract markdown content from chunks
            const markdownContent = parsedResult.chunks
                ?.map((chunk) => chunk.pageContent)
                .join('\n\n') || '';
            return [
                `PDF processed successfully by MinerU.\n\nTask ID: ${taskId}\n\nMarkdown Content:\n${markdownContent.substring(0, 1000)}${markdownContent.length > 1000 ? '...' : ''}`,
                {
                    files: fileArtifacts,
                    taskId,
                    metadata: parsedResult.metadata,
                },
            ];
        }
        catch (error) {
            // Keep the original error reachable for debugging instead of flattening it to a message.
            throw new Error(`MinerU processing failed: ${getErrorMessage(error)}`, { cause: error });
        }
    }, {
        name: 'mineru_pdf_parser',
        description: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files (images, JSON, etc.).',
        schema: z.object({
            fileUrl: z.string().optional().nullable().describe('URL of the PDF file to process'),
            filePath: z.string().optional().nullable().describe('Local file path of the PDF file'),
            fileName: z.string().optional().nullable().describe('Name of the PDF file'),
            isOcr: z.boolean().optional().nullable().describe('Enable OCR for image-based PDFs (default: true)'),
            enableFormula: z.boolean().optional().nullable().describe('Enable formula recognition (default: true)'),
            enableTable: z.boolean().optional().nullable().describe('Enable table recognition (default: true)'),
            language: z.enum(['en', 'ch']).optional().nullable().describe('Document language: "en" for English, "ch" for Chinese (default: "ch")'),
            modelVersion: z.enum(['pipeline', 'vlm']).optional().nullable().describe('Model version: "pipeline" or "vlm" (default: "pipeline")'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
|
|
2
|
-
import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
3
|
-
import { ConfigService } from '@nestjs/config';
|
|
4
|
-
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
-
import { MinerUIntegrationOptions } from './types.js';
|
|
6
|
-
/**
 * Configuration object accepted by `MinerUToolset`.
 * Every member is optional.
 */
export interface MinerUToolsetConfig {
    /**
     * MinerU API options stored in toolset credentials
     */
    apiUrl?: string;
    apiKey?: string;
    serverType?: MinerUIntegrationOptions['serverType'];

    /** Runtime collaborators supplied alongside the credentials. */
    fileSystem?: XpFileSystem;
    configService?: ConfigService;
    resultParser?: MinerUResultParserService;

    /** Parsing options. */
    isOcr?: boolean;
    enableFormula?: boolean;
    enableTable?: boolean;
    language?: 'en' | 'ch';
    modelVersion?: 'pipeline' | 'vlm';
}
|
|
25
|
-
/**
 * MinerU toolset: a builtin toolset that provides the PDF-to-markdown
 * conversion tool backed by the MinerU service.
 */
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
    private readonly config;
    /**
     * Build a toolset from a config holding both credentials and dependencies.
     * Note: the implementation uses 'as any' for params because
     * TBuiltinToolsetParams requires system-provided properties (tenantId, env)
     * that are added at runtime.
     */
    constructor(config: MinerUToolsetConfig);
    /**
     * Credential validation hook.
     * During the authorization phase credentials may be incomplete, and
     * configService / resultParser are runtime dependencies injected by the
     * strategy, so nothing is validated here and authorization can proceed.
     */
    _validateCredentials(credentials: MinerUToolsetConfig): Promise<void>;
    /**
     * Initialize the toolset's tools: creates the PDF parser tool with the
     * dependencies it needs.
     */
    initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
}
|
|
51
|
-
//# sourceMappingURL=mineru.toolset.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEtD;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,wBAAwB,CAAC,YAAY,CAAC,CAAC;IACpD,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IAEzC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C;;;;;OAKG;gBACS,MAAM,EAAE,mBAAmB;IAKvC;;;;;OAKG;IACY,oBAAoB,CAAC,WAAW,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAKpF;;;OAGG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA0BzF"}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
|
|
2
|
-
import { buildMinerUTool } from './mineru.tool.js';
|
|
3
|
-
/**
 * MinerU toolset: exposes a single PDF-to-markdown tool backed by the MinerU
 * service.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * @param config - credentials, parsing defaults and runtime dependencies;
     *   forwarded to the base class and kept on the instance.
     * Note: Using 'as any' for params because TBuiltinToolsetParams requires
     * system-provided properties (tenantId, env) that are added at runtime.
     */
    constructor(config) {
        super('mineru', undefined, config);
        this.config = config;
    }
    /**
     * Credential validation hook; deliberately a no-op.
     * During the authorization phase credentials may be incomplete, and
     * configService / resultParser only arrive at runtime from the strategy.
     */
    async _validateCredentials(credentials) {
        // Nothing to check here: API key validity will be enforced by the
        // MinerU server when the tool is used.
    }
    /**
     * Create the PDF parser tool, store it on `this.tools` and return it.
     *
     * @returns the array holding the single tool
     * @throws {Error} when `configService` or `resultParser` is missing from the config
     */
    async initTools() {
        const settings = this.config;
        if (!(settings.configService && settings.resultParser)) {
            throw new Error('ConfigService and MinerUResultParserService are required');
        }
        // Connection details for the MinerU API; the server type defaults to the official service.
        const apiOptions = {
            apiUrl: settings.apiUrl,
            apiKey: settings.apiKey,
            serverType: settings.serverType ?? 'official',
        };
        // Parsing defaults applied by the tool when a call leaves an option unset.
        const parsingDefaults = {
            isOcr: settings.isOcr,
            enableFormula: settings.enableFormula,
            enableTable: settings.enableTable,
            language: settings.language,
            modelVersion: settings.modelVersion,
        };
        const pdfParserTool = buildMinerUTool(settings.configService, settings.resultParser, apiOptions, settings.fileSystem, parsingDefaults);
        this.tools = [pdfParserTool];
        return this.tools;
    }
}
|