@chenchaolong/plugin-mineru-chen 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +39 -0
- package/dist/lib/parse.tool.d.ts +63 -0
- package/dist/lib/parse.tool.d.ts.map +1 -0
- package/dist/lib/parse.tool.js +437 -0
- package/dist/lib/plugin.d.ts +16 -0
- package/dist/lib/plugin.d.ts.map +1 -0
- package/dist/lib/plugin.js +46 -0
- package/dist/lib/strategy.d.ts +42 -0
- package/dist/lib/strategy.d.ts.map +1 -0
- package/dist/lib/strategy.js +62 -0
- package/dist/lib/toolset.d.ts +16 -0
- package/dist/lib/toolset.d.ts.map +1 -0
- package/dist/lib/toolset.js +22 -0
- package/dist/lib/types.d.ts +28 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +23 -0
- package/package.json +42 -0
package/README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# MinerU Plugin
|
|
2
|
+
|
|
3
|
+
## 简介
|
|
4
|
+
|
|
5
|
+
MinerU 是一个将文件转换为机器可读格式(如 markdown、JSON)的工具,可以方便地提取为任意格式。
|
|
6
|
+
|
|
7
|
+
MinerU 是一个文档解析器,可以解析复杂的文档数据,适用于任何下游 LLM 用例(RAG、智能体等)。
|
|
8
|
+
|
|
9
|
+
[GitHub - opendatalab/MinerU](https://github.com/opendatalab/MinerU)
|
|
10
|
+
|
|
11
|
+
## 主要特性
|
|
12
|
+
|
|
13
|
+
- 移除页眉、页脚、脚注、页码等,确保语义连贯性
|
|
14
|
+
- 以人类可读的顺序输出文本,适用于单列、多列和复杂布局
|
|
15
|
+
- 保留原始文档的结构,包括标题、段落、列表等
|
|
16
|
+
- 提取图片、图片描述、表格、表格标题和脚注
|
|
17
|
+
- 自动识别并将文档中的公式转换为 LaTeX 格式
|
|
18
|
+
- 自动识别并将文档中的表格转换为 HTML 格式
|
|
19
|
+
- 自动检测扫描 PDF 和乱码 PDF 并启用 OCR 功能
|
|
20
|
+
- OCR 支持检测和识别 84 种语言
|
|
21
|
+
- 支持多种输出格式,如多模态和 NLP Markdown、按阅读顺序排序的 JSON,以及丰富的中间格式
|
|
22
|
+
- 支持多种可视化结果,包括布局可视化和跨度可视化,用于高效确认输出质量
|
|
23
|
+
- 支持在纯 CPU 环境中运行,也支持 GPU(CUDA)/NPU(CANN)/MPS 加速
|
|
24
|
+
- 兼容 Windows、Linux 和 Mac 平台
|
|
25
|
+
|
|
26
|
+
## 支持的输入文件类型
|
|
27
|
+
|
|
28
|
+
**PDF, DOC, DOCX, PPT, PPTX, PNG, JPG, JPEG**
|
|
29
|
+
|
|
30
|
+
## 使用方法
|
|
31
|
+
|
|
32
|
+
### 配置步骤
|
|
33
|
+
|
|
34
|
+
1. 配置 MinerU 插件参数:
|
|
35
|
+
- Base URL: MinerU API 服务的基础 URL(例如:`https://mineru.net`)
|
|
36
|
+
- Token: [从 MinerU 获取您的 API token](https://mineru.net/apiManage/token)
|
|
37
|
+
|
|
38
|
+
2. 保存配置
|
|
39
|
+
|
|
40
|
+
### 输入参数
|
|
41
|
+
|
|
42
|
+
| 参数 | 类型 | 必填 | 示例 | 描述 |
|
|
43
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
44
|
+
| enable_formula | bool | 否 | true | 是否启用公式识别,默认为 true |
|
|
45
|
+
| enable_table | bool | 否 | true | 是否启用表格识别,默认为 true |
|
|
46
|
+
| language | string | 否 | ch | 指定文档语言,默认为 auto,当为 auto 时,模型将自动识别文档语言,其他可选值列表请参见:[PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/ppocr/blog/multi_languages.html#5) |
|
|
47
|
+
| enable_ocr | bool | 否 | false | 是否启用 OCR 功能,默认为 false |
|
|
48
|
+
| extra_formats | [string] | 否 | ["docx","html"] | Markdown 和 json 是默认的导出格式,无需设置。此参数仅支持 docx、html、latex 三种格式中的一种或多种 |
|
|
49
|
+
| model_version | string | 否 | pipeline | MinerU 模型版本;选项:pipeline 或 vlm,默认为 pipeline |
|
|
50
|
+
|
|
51
|
+
### 输出
|
|
52
|
+
|
|
53
|
+
插件为每个处理的文件提供以下输出:
|
|
54
|
+
|
|
55
|
+
- **text**: 解析后的 Markdown 文本
|
|
56
|
+
- **files**: 额外导出格式的文件(html、docx、latex)和提取的图片
|
|
57
|
+
- **json**: 解析后的内容列表
|
|
58
|
+
|
|
59
|
+
### 支持的导出格式
|
|
60
|
+
|
|
61
|
+
默认情况下,插件会输出 Markdown 和 JSON 格式。通过 `extra_formats` 参数,还可以获取以下额外格式:
|
|
62
|
+
|
|
63
|
+
- **HTML**: 格式化的 HTML 文档
|
|
64
|
+
- **DOCX**: Microsoft Word 文档格式
|
|
65
|
+
- **LaTeX**: LaTeX 源代码格式
|
|
66
|
+
|
|
67
|
+
这些额外格式的文件将存储在输出的 `files` 中。
|
|
68
|
+
|
|
69
|
+
## 技术细节
|
|
70
|
+
|
|
71
|
+
### API 端点
|
|
72
|
+
|
|
73
|
+
插件使用 MinerU 官方 API,主要端点包括:
|
|
74
|
+
|
|
75
|
+
- `POST /api/v4/file-urls/batch`: 创建解析任务
|
|
76
|
+
- `GET /api/v4/extract-results/batch/{batch_id}`: 获取解析结果
|
|
77
|
+
|
|
78
|
+
### 处理流程
|
|
79
|
+
|
|
80
|
+
1. 上传文件到 MinerU API 获取的上传 URL
|
|
81
|
+
2. 创建解析任务
|
|
82
|
+
3. 轮询获取解析结果
|
|
83
|
+
4. 下载并解压 ZIP 文件
|
|
84
|
+
5. 提取 Markdown、图片、JSON 等格式的内容
|
|
85
|
+
6. 替换 Markdown 中的图片路径为可访问的 URL
|
|
86
|
+
|
|
87
|
+
## 许可证
|
|
88
|
+
|
|
89
|
+
请参考项目根目录的 LICENSE 文件。
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Generated declaration file for the plugin entry point (dist/index.js).
// NOTE(review): this file is compiler output; regenerate rather than hand-edit.
import type { XpertPlugin } from '@xpert-ai/plugin-sdk';
import { z } from 'zod';
// The plugin currently accepts an empty configuration object.
declare const ConfigSchema: z.ZodObject<{}, "strip", z.ZodTypeAny, {}, {}>;
// Default export consumed by the Xpert plugin host.
declare const plugin: XpertPlugin<z.infer<typeof ConfigSchema>>;
export default plugin;
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAIvD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAYvB,QAAA,MAAM,YAAY,gDAAe,CAAA;AAEjC,QAAA,MAAM,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,YAAY,CAAC,CA4BrD,CAAA;AAED,eAAe,MAAM,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
import { z } from 'zod';
import { MinerUPlugin } from './lib/plugin.js';
import { icon } from './lib/types.js';
// Resolve this module's directory so package.json can be located relative to dist/.
const __filename = fileURLToPath(import.meta.url);
const dir_name = dirname(__filename);
// Name/version are read from package.json so metadata has a single source of truth.
const packageJson = JSON.parse(readFileSync(join(dir_name, '../package.json'), 'utf8'));
// The plugin currently takes no configuration options.
const ConfigSchema = z.object({});
/**
 * Plugin entry object consumed by the Xpert plugin host: static metadata,
 * the (empty) config schema, and lifecycle hooks that register the NestJS
 * module and log start/stop events.
 */
const plugin = {
    meta: {
        name: packageJson.name,
        version: packageJson.version,
        category: 'tools',
        icon: {
            type: 'svg',
            value: icon,
        },
        displayName: 'MinerU',
        description: 'MinerU is a tool that converts FILES into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.',
        keywords: ['mineru', 'pdf', 'parser', 'document', 'markdown', 'json'],
        author: 'XpertAI Team',
    },
    config: {
        schema: ConfigSchema,
    },
    // Registers MinerUPlugin as a global module with the host.
    register(ctx) {
        ctx.logger.log('register mineru plugin');
        return { module: MinerUPlugin, global: true };
    },
    async onStart(ctx) {
        ctx.logger.log('mineru plugin started');
    },
    async onStop(ctx) {
        ctx.logger.log('mineru plugin stopped');
    },
};
export default plugin;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { Credentials } from './types.js';
|
|
3
|
+
/**
|
|
4
|
+
* Build parse file tool
|
|
5
|
+
* @param credentials - MinerU API credentials (base_url and token)
|
|
6
|
+
*/
|
|
7
|
+
export declare function buildParseFileTool(credentials: Credentials): import("@langchain/core/tools").DynamicStructuredTool<z.ZodObject<{
|
|
8
|
+
fileUrl: z.ZodNullable<z.ZodOptional<z.ZodString>>;
|
|
9
|
+
filePath: z.ZodNullable<z.ZodOptional<z.ZodString>>;
|
|
10
|
+
file: z.ZodNullable<z.ZodOptional<z.ZodAny>>;
|
|
11
|
+
content: z.ZodNullable<z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodType<Buffer<ArrayBufferLike>, z.ZodTypeDef, Buffer<ArrayBufferLike>>, z.ZodType<Uint8Array<ArrayBuffer>, z.ZodTypeDef, Uint8Array<ArrayBuffer>>]>>>;
|
|
12
|
+
enable_formula: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
|
|
13
|
+
enable_table: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
|
|
14
|
+
language: z.ZodNullable<z.ZodOptional<z.ZodString>>;
|
|
15
|
+
enable_ocr: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
|
|
16
|
+
extra_formats: z.ZodNullable<z.ZodOptional<z.ZodString>>;
|
|
17
|
+
model_version: z.ZodNullable<z.ZodOptional<z.ZodEnum<["pipeline", "vlm"]>>>;
|
|
18
|
+
}, "strip", z.ZodTypeAny, {
|
|
19
|
+
filePath?: string;
|
|
20
|
+
fileUrl?: string;
|
|
21
|
+
file?: any;
|
|
22
|
+
content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
|
|
23
|
+
enable_formula?: boolean;
|
|
24
|
+
enable_table?: boolean;
|
|
25
|
+
language?: string;
|
|
26
|
+
enable_ocr?: boolean;
|
|
27
|
+
extra_formats?: string;
|
|
28
|
+
model_version?: "vlm" | "pipeline";
|
|
29
|
+
}, {
|
|
30
|
+
filePath?: string;
|
|
31
|
+
fileUrl?: string;
|
|
32
|
+
file?: any;
|
|
33
|
+
content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
|
|
34
|
+
enable_formula?: boolean;
|
|
35
|
+
enable_table?: boolean;
|
|
36
|
+
language?: string;
|
|
37
|
+
enable_ocr?: boolean;
|
|
38
|
+
extra_formats?: string;
|
|
39
|
+
model_version?: "vlm" | "pipeline";
|
|
40
|
+
}>, {
|
|
41
|
+
filePath?: string;
|
|
42
|
+
fileUrl?: string;
|
|
43
|
+
file?: any;
|
|
44
|
+
content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
|
|
45
|
+
enable_formula?: boolean;
|
|
46
|
+
enable_table?: boolean;
|
|
47
|
+
language?: string;
|
|
48
|
+
enable_ocr?: boolean;
|
|
49
|
+
extra_formats?: string;
|
|
50
|
+
model_version?: "vlm" | "pipeline";
|
|
51
|
+
}, {
|
|
52
|
+
filePath?: string;
|
|
53
|
+
fileUrl?: string;
|
|
54
|
+
file?: any;
|
|
55
|
+
content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
|
|
56
|
+
enable_formula?: boolean;
|
|
57
|
+
enable_table?: boolean;
|
|
58
|
+
language?: string;
|
|
59
|
+
enable_ocr?: boolean;
|
|
60
|
+
extra_formats?: string;
|
|
61
|
+
model_version?: "vlm" | "pipeline";
|
|
62
|
+
}, any[]>;
|
|
63
|
+
//# sourceMappingURL=parse.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parse.tool.d.ts","sourceRoot":"","sources":["../../src/lib/parse.tool.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAIvB,OAAO,EAAE,WAAW,EAAwF,MAAM,YAAY,CAAA;AA0V9H;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;UA2K1D"}
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
import { tool } from '@langchain/core/tools';
|
|
2
|
+
import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
3
|
+
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import * as path from 'path';
|
|
6
|
+
import * as fs from 'fs/promises';
|
|
7
|
+
import JSZip from 'jszip';
|
|
8
|
+
import { SUPPORTED_EXTENSIONS, SUPPORTED_IMAGE_EXTENSIONS, MAX_RETRIES } from './types.js';
|
|
9
|
+
/**
 * Resolve the MIME type for a file from its extension.
 * Unknown extensions fall back to the generic binary type.
 */
function getMimeType(fileName) {
    const extension = path.extname(fileName).toLowerCase().slice(1);
    const byExtension = {
        pdf: 'application/pdf',
        doc: 'application/msword',
        docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        ppt: 'application/vnd.ms-powerpoint',
        pptx: 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        png: 'image/png',
        jpg: 'image/jpeg',
        jpeg: 'image/jpeg',
        md: 'text/markdown',
        json: 'application/json',
        html: 'text/html',
        tex: 'application/x-tex',
    };
    return byExtension[extension] ?? 'application/octet-stream';
}
|
|
30
|
+
/**
 * Join one or more path segments onto an API base URL.
 * Falsy segments are skipped; the base URL's existing path is preserved.
 */
function buildApiUrl(baseUrl, ...paths) {
    const target = new URL(baseUrl);
    const suffix = paths.filter(Boolean).join('/');
    if (suffix) {
        target.pathname = path.posix.join(target.pathname, suffix);
    }
    return target.toString();
}
|
|
41
|
+
/**
 * Build the HTTP headers for authenticated MinerU API requests
 * (bearer token auth with a JSON body).
 */
function getHeaders(credentials) {
    const headers = {};
    headers['Authorization'] = `Bearer ${credentials.token}`;
    headers['Content-Type'] = 'application/json';
    return headers;
}
|
|
50
|
+
/**
 * Ensure the file's extension is one MinerU can parse.
 * @throws Error when the extension is not listed in SUPPORTED_EXTENSIONS.
 */
function validateFileType(filename) {
    const ext = path.extname(filename).toLowerCase();
    if (SUPPORTED_EXTENSIONS.includes(ext)) {
        return;
    }
    throw new Error(`File extension ${ext} is not supported. Supported extensions: ${SUPPORTED_EXTENSIONS.join(', ')}`);
}
|
|
59
|
+
/**
 * Decode a base64-encoded image and persist it into the workspace.
 *
 * The MIME type and extension of the returned descriptor are derived from the
 * target file name instead of being hard-coded to JPEG, so PNG/GIF/WebP
 * artifacts are labelled correctly. Names with no recognized extension keep
 * the previous jpeg fallback.
 *
 * @param encodedImageData base64 payload, with or without a "data:...;base64," prefix
 * @param fileName name to save the image under (relative to workspacePath)
 * @param workspacePath directory the image is written into
 * @param baseUrl base URL used to build the publicly accessible file URL
 * @returns descriptor { mimeType, fileName, filePath, fileUrl, extension }
 * @throws Error when decoding or writing fails
 */
async function processBase64Image(encodedImageData, fileName, workspacePath, baseUrl) {
    try {
        // Extract base64 data (remove data:image/...;base64, prefix if present)
        const base64Data = encodedImageData.includes(',')
            ? encodedImageData.split(',')[1]
            : encodedImageData;
        const imageBytes = Buffer.from(base64Data, 'base64');
        // Save image to workspace
        const filePath = path.join(workspacePath, fileName);
        await fs.mkdir(path.dirname(filePath), { recursive: true });
        await fs.writeFile(filePath, imageBytes);
        const fileUrl = new URL(encodeURIComponent(fileName), baseUrl).href;
        // Derive image metadata from the extension; default to jpeg for unknown names.
        const imageMimeTypes = {
            png: 'image/png',
            jpg: 'image/jpeg',
            jpeg: 'image/jpeg',
            gif: 'image/gif',
            webp: 'image/webp',
        };
        const ext = path.extname(fileName).toLowerCase().slice(1);
        return {
            mimeType: imageMimeTypes[ext] || 'image/jpeg',
            fileName,
            filePath,
            fileUrl,
            extension: imageMimeTypes[ext] ? ext : 'jpeg',
        };
    }
    catch (error) {
        throw new Error(`Failed to decode base64 image ${fileName}: ${getErrorMessage(error)}`);
    }
}
|
|
86
|
+
/**
 * Rewrite MinerU's relative image references ("images/<name>") in Markdown
 * so they point at the workspace file URLs.
 *
 * Uses split/join so that EVERY occurrence of a reference is rewritten;
 * String.prototype.replace with a string pattern replaces only the first
 * occurrence, which left documents that reference the same image more than
 * once partially broken.
 *
 * @param mdContent Markdown text produced by MinerU
 * @param images image descriptors carrying fileName and (optional) fileUrl
 * @returns Markdown with all resolvable image paths rewritten
 */
function replaceMdImgPath(mdContent, images) {
    let result = mdContent;
    for (const image of images) {
        if (image.fileUrl) {
            // split/join replaces all occurrences, not just the first.
            result = result.split(`images/${image.fileName}`).join(image.fileUrl);
        }
    }
    return result;
}
|
|
98
|
+
/**
 * Poll the MinerU batch endpoint until the first extract result reaches the
 * 'done' state, waiting 5 s between attempts, up to MAX_RETRIES attempts.
 *
 * NOTE(review): a 'failed' state throws inside the try block, so it is caught
 * by the outer catch and retried unless this is the last attempt — a
 * permanently failed parse is re-polled; confirm whether that is intended.
 *
 * @param credentials MinerU credentials (base_url, token)
 * @param batchId batch identifier returned by the task-creation call
 * @returns the first extract result once its state is 'done'
 * @throws Error on terminal failure or when MAX_RETRIES is exhausted
 */
async function pollGetParseResult(credentials, batchId) {
    const url = buildApiUrl(credentials.base_url, 'api/v4/extract-results/batch', batchId);
    const headers = getHeaders(credentials);
    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        try {
            const response = await fetch(url, { headers });
            if (!response.ok) {
                // Non-2xx: only fatal on the last attempt, otherwise back off and retry.
                if (attempt === MAX_RETRIES - 1) {
                    throw new Error(`Failed to get parse result, status: ${response.status}`);
                }
                await new Promise(resolve => setTimeout(resolve, 5000));
                continue;
            }
            const data = (await response.json());
            const extractResults = data.data?.extract_result || [];
            if (extractResults.length === 0) {
                // No results yet — keep polling.
                await new Promise(resolve => setTimeout(resolve, 5000));
                continue;
            }
            // Only the first result is inspected (one file per batch is uploaded).
            const extractResult = extractResults[0];
            const state = extractResult.state;
            if (state === 'done') {
                return extractResult;
            }
            else if (state === 'failed') {
                const errMsg = extractResult.err_msg || 'Unknown error';
                throw new Error(`Parse failed, reason: ${errMsg}`);
            }
            // Still processing, wait and retry
            await new Promise(resolve => setTimeout(resolve, 5000));
        }
        catch (error) {
            if (attempt === MAX_RETRIES - 1) {
                throw error;
            }
            console.warn(`Poll attempt ${attempt + 1} failed:`, error);
            await new Promise(resolve => setTimeout(resolve, 5000));
        }
    }
    throw new Error('Polling timeout reached without getting completed result');
}
|
|
142
|
+
/**
 * Route one entry of the MinerU result archive into the accumulated content.
 *
 * Images under "images/" are written to the workspace and recorded in
 * content.images; .md/.json/.html/.docx/.tex payloads are stored on the
 * corresponding content fields. Errors are logged and swallowed so one bad
 * entry does not abort the whole extraction.
 *
 * NOTE(review): image descriptors are hard-coded to image/jpeg regardless of
 * the real extension — confirm MinerU always emits JPEGs.
 * NOTE(review): the layout.json exclusion compares the full relative path,
 * so a nested "sub/layout.json" would NOT be excluded.
 *
 * @param zipEntry JSZip entry object
 * @param fileName entry path relative to the archive root
 * @param content mutable accumulator { mdContent, contentList, images, ... }
 * @param workspacePath directory extracted images are written into
 * @param baseUrl base URL used to build accessible file URLs
 */
async function processZipFile(zipEntry, fileName, content, workspacePath, baseUrl) {
    try {
        const fileContent = await zipEntry.async('nodebuffer');
        if (fileName.startsWith('images/') && SUPPORTED_IMAGE_EXTENSIONS.some(ext => fileName.endsWith(ext))) {
            // Flatten images into the workspace root (archive subdirs are dropped).
            const baseName = path.basename(fileName);
            const filePath = path.join(workspacePath, baseName);
            await fs.mkdir(path.dirname(filePath), { recursive: true });
            await fs.writeFile(filePath, fileContent);
            const fileUrl = new URL(encodeURIComponent(baseName), baseUrl).href;
            content.images.push({
                mimeType: 'image/jpeg',
                fileName: baseName,
                filePath,
                fileUrl,
                extension: 'jpeg',
            });
        }
        else if (fileName.endsWith('.md')) {
            content.mdContent = fileContent.toString('utf-8');
        }
        else if (fileName.endsWith('.json') && fileName !== 'layout.json') {
            try {
                const jsonContent = JSON.parse(fileContent.toString('utf-8'));
                content.contentList.push(jsonContent);
            }
            catch (error) {
                // Malformed JSON entries are skipped, not fatal.
                console.error(`Failed to parse JSON file ${fileName}:`, error);
            }
        }
        else if (fileName.endsWith('.html')) {
            content.htmlContent = fileContent.toString('utf-8');
        }
        else if (fileName.endsWith('.docx')) {
            // Binary payload — kept as a Buffer.
            content.docxContent = fileContent;
        }
        else if (fileName.endsWith('.tex')) {
            content.latexContent = fileContent.toString('utf-8');
        }
    }
    catch (error) {
        // Best-effort: log and continue with the remaining archive entries.
        console.error(`Failed to process file ${fileName}:`, error);
    }
}
|
|
188
|
+
/**
 * Download a MinerU result archive and unpack its contents.
 *
 * All non-directory entries are processed concurrently via processZipFile,
 * then image references in the extracted Markdown are rewritten to
 * workspace URLs.
 *
 * @param url full URL of the result zip
 * @param workspacePath directory extracted files are written into
 * @param baseUrl base URL used to build accessible file URLs
 * @returns accumulated content { mdContent, contentList, images, ... }
 * @throws Error when the download fails
 */
async function downloadAndExtractZip(url, workspacePath, baseUrl) {
    const res = await fetch(url);
    if (!res.ok) {
        throw new Error(`Failed to download zip file: ${res.statusText}`);
    }
    const archive = await JSZip.loadAsync(Buffer.from(await res.arrayBuffer()));
    const extracted = {
        mdContent: '',
        contentList: [],
        images: [],
    };
    // Kick off processing for every file entry in parallel.
    const pending = [];
    archive.forEach((entryPath, entry) => {
        if (entry.dir) {
            return;
        }
        pending.push(processZipFile(entry, entryPath, extracted, workspacePath, baseUrl));
    });
    await Promise.all(pending);
    // Point markdown image references at the extracted workspace files.
    extracted.mdContent = replaceMdImgPath(extracted.mdContent, extracted.images);
    return extracted;
}
|
|
216
|
+
/**
 * Run one file through the MinerU API: create a batch task, upload the file
 * to the pre-signed URL, poll for completion, then download and unpack the
 * result archive.
 *
 * NOTE(review): the `file` parameter is accepted but never used here.
 * NOTE(review): `extra_formats` given as a malformed JSON string will make
 * JSON.parse throw out of this function.
 * NOTE(review): language defaults to 'auto' here, while the README documents
 * a default of 'ch' — confirm which is intended.
 *
 * @param credentials MinerU credentials (base_url, token)
 * @param file original file object (currently unused)
 * @param fileContent file bytes to upload
 * @param fileName name sent to the API (extension drives parsing)
 * @param toolParameters user-facing options (formula/table/ocr/language/...)
 * @param workspacePath directory extracted artifacts are written into
 * @param baseUrl base URL used to build accessible file URLs
 * @returns { markdown, images, contentList, fullZipUrl }
 * @throws Error on any API, upload, polling or download failure
 */
async function parseFile(credentials, file, fileContent, fileName, toolParameters, workspacePath, baseUrl) {
    const headers = getHeaders(credentials);
    // Create parsing task
    const data = {
        enable_formula: toolParameters.enable_formula ?? true,
        enable_table: toolParameters.enable_table ?? true,
        language: toolParameters.language || 'auto',
        model_version: toolParameters.model_version || 'pipeline',
        // Accept either a JSON-encoded string (from the tool schema) or an array.
        extra_formats: typeof toolParameters.extra_formats === 'string'
            ? JSON.parse(toolParameters.extra_formats || '[]')
            : toolParameters.extra_formats || [],
        files: [
            {
                name: fileName,
                is_ocr: toolParameters.enable_ocr ?? false,
            },
        ],
    };
    const taskUrl = buildApiUrl(credentials.base_url, 'api/v4/file-urls/batch');
    const response = await fetch(taskUrl, {
        method: 'POST',
        headers,
        body: JSON.stringify(data),
    });
    if (!response.ok) {
        const text = await response.text();
        throw new Error(`Apply upload url failed. Status: ${response.status}, result: ${text}`);
    }
    const result = (await response.json());
    // code === 0 is the MinerU API success indicator.
    if (result.code === 0) {
        const batchId = result.data.batch_id;
        const urls = result.data.file_urls;
        // Upload file to the first (and only) pre-signed URL.
        const uploadResponse = await fetch(urls[0], {
            method: 'PUT',
            body: fileContent,
        });
        if (!uploadResponse.ok) {
            throw new Error(`Failed to upload file: ${uploadResponse.statusText}`);
        }
        // Poll for result
        const extractResult = await pollGetParseResult(credentials, batchId);
        const fullZipUrl = extractResult.full_zip_url;
        if (!fullZipUrl) {
            throw new Error('No zip URL found in parse result');
        }
        // Download and extract zip
        const zipContent = await downloadAndExtractZip(fullZipUrl, workspacePath, baseUrl);
        return {
            markdown: zipContent.mdContent,
            images: zipContent.images,
            contentList: zipContent.contentList,
            fullZipUrl,
        };
    }
    else {
        throw new Error(`Apply upload url failed, reason: ${result.msg || 'Unknown error'}`);
    }
}
|
|
278
|
+
/**
 * Verify the configured base_url/token by issuing a lightweight POST against
 * the task-creation endpoint, aborting after 10 seconds.
 * @throws Error when the endpoint rejects the request.
 */
async function validateCredentials(credentials) {
    const endpoint = buildApiUrl(credentials.base_url, 'api/v4/file-urls/batch');
    const requestHeaders = getHeaders(credentials);
    const res = await fetch(endpoint, {
        method: 'POST',
        headers: requestHeaders,
        body: JSON.stringify({}),
        signal: AbortSignal.timeout(10000),
    });
    if (res.ok) {
        return;
    }
    throw new Error('Please check your base_url and token');
}
|
|
294
|
+
/**
 * Build the LangChain 'parse-file' tool backed by the MinerU API.
 *
 * Input resolution order: fileUrl (download) → filePath (local read) →
 * file.content → content (base64 string / Buffer / Uint8Array). The resolved
 * file is validated, parsed via MinerU, and returned as
 * [markdown, { files: [...] }] (responseFormat 'content_and_artifact').
 *
 * @param credentials MinerU API credentials (base_url and token)
 * @returns a DynamicStructuredTool instance
 */
export function buildParseFileTool(credentials) {
    return tool(async (input) => {
        try {
            const { file, fileUrl, filePath, content, enable_formula, enable_table, language, enable_ocr, extra_formats, model_version, } = input;
            // Validate credentials are available
            if (!credentials?.base_url) {
                throw new Error('base_url is not configured. Please configure MinerU toolset with base_url and token.');
            }
            if (!credentials?.token) {
                throw new Error('token is not configured. Please configure MinerU toolset with base_url and token.');
            }
            // Validate credentials against the live service before doing any work.
            try {
                await validateCredentials(credentials);
            }
            catch (error) {
                const errorMsg = error?.message || String(error);
                // Re-word connection-level failures so users see the configured URL.
                if (errorMsg.includes('fetch failed') || errorMsg.includes('ECONNREFUSED')) {
                    throw new Error(`Cannot connect to MinerU service at ${credentials.base_url}. ` +
                        `Please ensure MinerU service is running and accessible. ` +
                        `Error: ${errorMsg}`);
                }
                throw error;
            }
            // Get file content — first matching source wins.
            let fileContent;
            let fileName;
            if (fileUrl) {
                // Download file from URL (preferred input method)
                const response = await fetch(fileUrl);
                if (!response.ok) {
                    throw new Error(`Failed to download file: ${response.statusText}`);
                }
                const arrayBuffer = await response.arrayBuffer();
                fileContent = Buffer.from(arrayBuffer);
                // Extract file name from URL pathname, fallback to file object if provided
                fileName = file?.fileName || file?.filename || path.basename(new URL(fileUrl).pathname) || 'file';
            }
            else if (filePath) {
                fileContent = await fs.readFile(filePath);
                fileName = file?.fileName || path.basename(filePath) || 'file';
            }
            else if (file?.content) {
                fileContent = Buffer.isBuffer(file.content)
                    ? file.content
                    : Buffer.from(file.content);
                fileName = file.fileName || file.filename || 'file';
            }
            else if (content) {
                // A raw string is assumed to be base64-encoded file data.
                fileContent = Buffer.isBuffer(content)
                    ? content
                    : typeof content === 'string'
                        ? Buffer.from(content, 'base64')
                        : Buffer.from(content);
                fileName = file?.fileName || 'file';
            }
            else {
                throw new Error('File is required');
            }
            // Validate file type
            validateFileType(fileName);
            // Get workspace paths from the current task state, with safe defaults.
            let workspacePath = '/tmp/xpert';
            let baseUrl = 'http://localhost:3000';
            try {
                const currentState = getCurrentTaskInput();
                workspacePath = currentState?.['sys']?.['volume'] ?? workspacePath;
                baseUrl = currentState?.['sys']?.['workspace_url'] ?? baseUrl;
            }
            catch (error) {
                // If getCurrentTaskInput fails (e.g., in test environment), use defaults
                console.warn('getCurrentTaskInput failed, using default paths:', error);
            }
            // Parse file using MinerU API
            const result = await parseFile(credentials, file, fileContent, fileName, { enable_formula, enable_table, language, enable_ocr, extra_formats, model_version }, workspacePath, baseUrl);
            // Build response with artifacts. NOTE(review): the markdown descriptor is
            // added here but the .md file itself is only written to disk further below.
            const artifacts = {
                files: [
                    // Markdown file
                    {
                        mimeType: 'text/markdown',
                        fileName: `${path.parse(fileName).name}.md`,
                        filePath: path.join(workspacePath, `${path.parse(fileName).name}.md`),
                        fileUrl: new URL(encodeURIComponent(`${path.parse(fileName).name}.md`), baseUrl).href,
                        extension: 'md',
                    },
                    // Images
                    ...result.images,
                ],
            };
            // Add JSON content list if available
            if (result.contentList.length > 0) {
                const jsonFileName = `${path.parse(fileName).name}.json`;
                const jsonPath = path.join(workspacePath, jsonFileName);
                await fs.writeFile(jsonPath, JSON.stringify(result.contentList, null, 2));
                artifacts.files.push({
                    mimeType: 'application/json',
                    fileName: jsonFileName,
                    filePath: jsonPath,
                    fileUrl: new URL(encodeURIComponent(jsonFileName), baseUrl).href,
                    extension: 'json',
                });
            }
            // Save markdown to file
            const mdFileName = `${path.parse(fileName).name}.md`;
            const mdPath = path.join(workspacePath, mdFileName);
            await fs.writeFile(mdPath, result.markdown, 'utf-8');
            // content_and_artifact response: [text content, artifact object].
            return [
                result.markdown,
                artifacts,
            ];
        }
        catch (error) {
            const errorMsg = getErrorMessage(error);
            // Provide more helpful error messages for common issues
            if (errorMsg.includes('fetch failed') || errorMsg.includes('ECONNREFUSED')) {
                throw new Error(`Cannot connect to MinerU service. ` +
                    `Please ensure MinerU service is running and accessible at the configured base_url. ` +
                    `Original error: ${errorMsg}`);
            }
            throw new Error(`Error parsing file: ${errorMsg}`);
        }
    }, {
        name: 'parse-file',
        description: 'A tool for parsing text, tables, and images, supporting multiple formats such as pdf, pptx, docx, etc. supporting multiple languages such as English, Chinese, etc.',
        schema: z.object({
            fileUrl: z.string().optional().nullable().describe('URL of the file to parse (preferred input method)'),
            filePath: z.string().optional().nullable().describe('Local path of the file to parse'),
            file: z.any().optional().nullable().describe('The file object to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)'),
            content: z.union([z.string(), z.instanceof(Buffer), z.instanceof(Uint8Array)]).optional().nullable().describe('File content'),
            enable_formula: z.boolean().optional().nullable().describe('Whether to enable formula recognition'),
            enable_table: z.boolean().optional().nullable().describe('Whether to enable table recognition'),
            language: z.string().optional().nullable().describe('Specify document language, default is auto'),
            enable_ocr: z.boolean().optional().nullable().describe('Whether to enable OCR recognition'),
            extra_formats: z.string().optional().nullable().describe('Example: ["docx","html"], markdown, json are the default export formats'),
            model_version: z.enum(['pipeline', 'vlm']).optional().nullable().describe('MinerU model version; options: pipeline or vlm, default is pipeline'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { IOnPluginBootstrap, IOnPluginDestroy } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
/**
|
|
3
|
+
* MinerU plugin class that implements plugin lifecycle hooks
|
|
4
|
+
*/
|
|
5
|
+
export declare class MinerUPlugin implements IOnPluginBootstrap, IOnPluginDestroy {
|
|
6
|
+
private logEnabled;
|
|
7
|
+
/**
|
|
8
|
+
* Called when the plugin is being initialized.
|
|
9
|
+
*/
|
|
10
|
+
onPluginBootstrap(): void | Promise<void>;
|
|
11
|
+
/**
|
|
12
|
+
* Called when the plugin is being destroyed.
|
|
13
|
+
*/
|
|
14
|
+
onPluginDestroy(): void | Promise<void>;
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=plugin.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"plugin.d.ts","sourceRoot":"","sources":["../../src/lib/plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAA;AAI9F;;GAEG;AACH,qBAaa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAEvE,OAAO,CAAC,UAAU,CAAO;IAEzB;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKxC"}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
var MinerUPlugin_1;
import { __decorate } from "tslib";
import chalk from 'chalk';
import { XpertServerPlugin } from '@xpert-ai/plugin-sdk';
import { ConfigModule } from '@nestjs/config';
import { MinerUStrategy } from './strategy.js';
/**
 * MinerU plugin class that implements plugin lifecycle hooks
 */
let MinerUPlugin = MinerUPlugin_1 = class MinerUPlugin {
    constructor() {
        // Controls lifecycle console logging. Currently enabled (true).
        // NOTE(review): the original comment claimed logging is disabled by
        // default, which contradicted this value.
        this.logEnabled = true;
    }
    /**
     * Called when the plugin is being initialized.
     */
    onPluginBootstrap() {
        if (this.logEnabled) {
            console.log(chalk.green(`${MinerUPlugin_1.name} is being bootstrapped...`));
        }
    }
    /**
     * Called when the plugin is being destroyed.
     */
    onPluginDestroy() {
        if (this.logEnabled) {
            console.log(chalk.green(`${MinerUPlugin_1.name} is being destroyed...`));
        }
    }
};
MinerUPlugin = MinerUPlugin_1 = __decorate([
    XpertServerPlugin({
        /**
         * An array of modules that will be imported and registered with the plugin.
         */
        imports: [ConfigModule],
        /**
         * An array of Entity classes. The plugin (or ORM) will
         * register these entities for use within the application.
         */
        entities: [],
        providers: [MinerUStrategy],
    })
], MinerUPlugin);
export { MinerUPlugin };
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { BuiltinToolset, IToolsetStrategy } from '@xpert-ai/plugin-sdk';
/**
 * MinerU toolset strategy implementation
 * (type declarations generated from src/lib/strategy.ts).
 */
export declare class MinerUStrategy implements IToolsetStrategy<any> {
    /**
     * Static descriptor for the toolset: identity, localized labels and
     * description (en_US / zh_Hans), icon, and an empty config schema
     * (no user-supplied settings are required).
     */
    meta: {
        author: string;
        tags: string[];
        name: string;
        label: {
            en_US: string;
            zh_Hans: string;
        };
        description: {
            en_US: string;
            zh_Hans: string;
        };
        icon: {
            svg: string;
            color: string;
        };
        configSchema: {
            type: string;
            properties: {};
            required: any[];
        };
    };
    /**
     * Validate configuration. Resolves unconditionally — the implementation
     * performs no checks because credentials are hardcoded in the package.
     */
    validateConfig(config: any): Promise<void>;
    /**
     * Create toolset instance (a MinerUToolset wrapping the given config).
     */
    create(config: any): Promise<BuiltinToolset>;
    /**
     * Create tools for this toolset.
     * Note: When using BuiltinToolset this method may not be used;
     * the implementation returns an empty array and the real tools are
     * built in the toolset's initTools().
     */
    createTools(): any[];
}
//# sourceMappingURL=strategy.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strategy.d.ts","sourceRoot":"","sources":["../../src/lib/strategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAmB,MAAM,sBAAsB,CAAA;AAKxF;;GAEG;AACH,qBAEa,cAAe,YAAW,gBAAgB,CAAC,GAAG,CAAC;IAC1D,IAAI;;;;;;;;;;;;;;;;;;;;;MAuBH;IAED;;OAEG;IACH,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAK1C;;OAEG;IACG,MAAM,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,cAAc,CAAC;IAKlD;;;OAGG;IACH,WAAW;CAKZ"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { __decorate } from "tslib";
import { Injectable } from '@nestjs/common';
import { ToolsetStrategy } from '@xpert-ai/plugin-sdk';
import { MinerU, icon } from './types.js';
import { MinerUToolset } from './toolset.js';
/**
 * Strategy that registers the MinerU document-parsing toolset with the
 * plugin SDK: exposes static metadata and constructs MinerUToolset
 * instances on demand.
 */
let MinerUStrategy = class MinerUStrategy {
    constructor() {
        // Static descriptor consumed by the toolset registry: identity,
        // localized labels/description, icon, and an empty config schema
        // (no user-supplied settings are needed — credentials are baked in).
        const descriptor = {
            author: 'Xpert AI',
            tags: ['mineru', 'pdf', 'parser', 'document', 'tool'],
            name: MinerU,
            label: {
                en_US: 'MinerU',
                zh_Hans: 'MinerU',
            },
            description: {
                en_US: 'MinerU is a tool that converts FILES into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format. MinerU was born during the pre-training process of InternLM. We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.',
                zh_Hans: 'MinerU是一款可以在本地部署的将FILES转化为机器可读格式的工具(如markdown、json),可以很方便地抽取为任意格式。MinerU诞生于书生-浦语的预训练过程中,我们将会集中精力解决科技文献中的符号转化问题,希望在大模型时代为科技发展做出贡献。',
            },
            icon: {
                svg: icon,
                color: '#4a90e2',
            },
            configSchema: {
                type: 'object',
                properties: {},
                required: [],
            },
        };
        this.meta = descriptor;
    }
    /**
     * Validate configuration. Always resolves: credentials are hardcoded
     * in types.js, so there is nothing to check.
     */
    async validateConfig(config) {
        // Intentionally a no-op.
    }
    /**
     * Build a toolset instance. The config object is passed through to the
     * toolset (falling back to an empty object), though the toolset itself
     * relies on hardcoded credentials.
     */
    async create(config) {
        const options = config || {};
        return new MinerUToolset(options);
    }
    /**
     * Interface-required hook. Not used with BuiltinToolset — the actual
     * tools are produced by the toolset's initTools() — so this returns
     * an empty list.
     */
    createTools() {
        return [];
    }
};
// Register as an injectable NestJS provider and bind it to the
// 'mineru' toolset identifier.
MinerUStrategy = __decorate([
    Injectable(),
    ToolsetStrategy(MinerU)
], MinerUStrategy);
export { MinerUStrategy };
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
/**
 * MinerU toolset class that manages all MinerU tools
 * (type declarations generated from src/lib/toolset.ts).
 */
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, any> {
    /**
     * Validate credentials for MinerU toolset.
     * The implementation is a no-op: credentials are hardcoded in types.js.
     */
    _validateCredentials(credentials: any): Promise<void>;
    /**
     * Initialize tools for MinerU toolset — the implementation returns a
     * single parse tool built with the hardcoded default credentials.
     */
    initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
}
//# sourceMappingURL=toolset.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toolset.d.ts","sourceRoot":"","sources":["../../src/lib/toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAA;AAC/E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAA;AAIrD;;GAEG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,GAAG,CAAC;IAC7E;;OAEG;IACY,oBAAoB,CAAC,WAAW,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpE;;OAEG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CAKzF"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
import { buildParseFileTool } from './parse.tool.js';
import { DEFAULT_CREDENTIALS } from './types.js';
/**
 * Toolset wiring for MinerU: exposes the single document-parse tool to
 * the host application.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * Credential validation hook. Always succeeds — the toolset ships with
     * hardcoded credentials (see DEFAULT_CREDENTIALS in types.js), so the
     * argument is ignored.
     */
    async _validateCredentials(credentials) {
        // Intentionally empty: nothing to validate.
    }
    /**
     * Build and cache the tool list. The hardcoded DEFAULT_CREDENTIALS are
     * used; any per-instance configuration is not consulted here.
     */
    async initTools() {
        const parseTool = buildParseFileTool(DEFAULT_CREDENTIALS);
        this.tools = [parseTool];
        return this.tools;
    }
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
// Toolset identifier string ('mineru').
export declare const MinerU = "mineru";
// SVG icon markup, loaded from _assets/icon.svg at module init.
export declare const icon: string;
// File extensions accepted by the parse tool.
export declare const SUPPORTED_EXTENSIONS: string[];
// Subset of extensions treated as images.
export declare const SUPPORTED_IMAGE_EXTENSIONS: string[];
// Upper bound on polling attempts when waiting for a parse task.
export declare const MAX_RETRIES = 50;
export declare const MINERU_BASE_URL = "https://mineru.net";
// SECURITY NOTE(review): this is a live JWT credential hardcoded into the
// published package. It should be revoked/rotated and supplied via
// configuration instead of shipped in source — confirm with the maintainers.
export declare const MINERU_TOKEN = "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI1MjAwMDg2NCIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc2NjA0NzI0OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYjc4MzIxNDItMjA0Yi00MDk1LTk2MjMtNzdhMzhlNTU2NzcyIiwiZW1haWwiOiIiLCJleHAiOjE3NjcyNTY4NDh9.xy9gFita6it7FBWNRrx-EUzpD9Mr5yhJBZ8m9hHOzrO3RfM9E1IE8AlW8lZSV4cxexuNPKisvCpW6c-RfjbHZw";
// Base URL + token bundle consumed by the toolset's initTools().
export declare const DEFAULT_CREDENTIALS: Credentials;
// Shape of the credentials object passed to the parse tool.
export interface Credentials {
    base_url: string;
    token: string;
}
// Result bundle extracted from a MinerU output ZIP archive.
export interface ZipContent {
    mdContent: string;
    contentList: Array<Record<string, any>>;
    images: Array<any>;
    htmlContent?: string;
    docxContent?: Buffer;
    latexContent?: string;
}
// Metadata describing a file submitted for parsing.
export interface TFileInfo {
    mimeType: string;
    fileName: string;
    fileUrl?: string;
    filePath: string;
    extension?: string;
}
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAKA,eAAO,MAAM,MAAM,WAAW,CAAA;AAK9B,eAAO,MAAM,IAAI,QAAkE,CAAA;AAGnF,eAAO,MAAM,oBAAoB,UAAsE,CAAA;AAGvG,eAAO,MAAM,0BAA0B,UAA4B,CAAA;AAGnE,eAAO,MAAM,WAAW,KAAK,CAAA;AAG7B,eAAO,MAAM,eAAe,uBAAuB,CAAA;AACnD,eAAO,MAAM,YAAY,uZAAuZ,CAAA;AAGhb,eAAO,MAAM,mBAAmB,EAAE,WAGjC,CAAA;AAGD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,MAAM,CAAA;IAChB,KAAK,EAAE,MAAM,CAAA;CACd;AAGD,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAA;IACvC,MAAM,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;IAClB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAGD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,CAAC,EAAE,MAAM,CAAA;CACnB"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
// MinerU plugin identifier
export const MinerU = 'mineru';
// SVG icon for the MinerU plugin, loaded from _assets/icon.svg at the
// package root. BUG FIX: package.json publishes only "dist" (its "files"
// whitelist), so ../../_assets/icon.svg is absent from the installed
// package and the original unguarded readFileSync threw at import time,
// breaking every consumer of this module. Fall back to a minimal stub
// instead of throwing; `icon` remains a string either way.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
function loadIconSvg() {
    try {
        return readFileSync(join(__dirname, '../../_assets/icon.svg'), 'utf8');
    }
    catch {
        // Asset missing from the published artifact — use an empty SVG stub.
        return '<svg xmlns="http://www.w3.org/2000/svg"></svg>';
    }
}
export const icon = loadIconSvg();
// File extensions the parse tool accepts
export const SUPPORTED_EXTENSIONS = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.png', '.jpg', '.jpeg'];
// Subset of extensions treated as images
export const SUPPORTED_IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg'];
// Maximum retries for polling a parse task's status
export const MAX_RETRIES = 50;
// Hardcoded MinerU API credentials.
// SECURITY NOTE(review): a live JWT is hardcoded and published with this
// package. It should be revoked/rotated and supplied via configuration
// (e.g. an environment variable) rather than shipped in source.
export const MINERU_BASE_URL = 'https://mineru.net';
export const MINERU_TOKEN = 'eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI1MjAwMDg2NCIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc2NjA0NzI0OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYjc4MzIxNDItMjA0Yi00MDk1LTk2MjMtNzdhMzhlNTU2NzcyIiwiZW1haWwiOiIiLCJleHAiOjE3NjcyNTY4NDh9.xy9gFita6it7FBWNRrx-EUzpD9Mr5yhJBZ8m9hHOzrO3RfM9E1IE8AlW8lZSV4cxexuNPKisvCpW6c-RfjbHZw';
// Default credentials consumed by MinerUToolset.initTools()
export const DEFAULT_CREDENTIALS = {
    base_url: MINERU_BASE_URL,
    token: MINERU_TOKEN,
};
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@chenchaolong/plugin-mineru-chen",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"license": "AGPL-3.0",
|
|
5
|
+
"repository": {
|
|
6
|
+
"type": "git",
|
|
7
|
+
"url": "https://github.com/xpert-ai/xpert-plugins.git"
|
|
8
|
+
},
|
|
9
|
+
"bugs": {
|
|
10
|
+
"url": "https://github.com/xpert-ai/xpert-plugins/issues"
|
|
11
|
+
},
|
|
12
|
+
"type": "module",
|
|
13
|
+
"main": "./dist/index.js",
|
|
14
|
+
"module": "./dist/index.js",
|
|
15
|
+
"types": "./dist/index.d.ts",
|
|
16
|
+
"exports": {
|
|
17
|
+
"./package.json": "./package.json",
|
|
18
|
+
".": {
|
|
19
|
+
"@xpert-plugins-starter/source": "./src/index.ts",
|
|
20
|
+
"types": "./dist/index.d.ts",
|
|
21
|
+
"import": "./dist/index.js",
|
|
22
|
+
"default": "./dist/index.js"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"files": [
|
|
26
|
+
"dist",
|
|
27
|
+
"!**/*.tsbuildinfo"
|
|
28
|
+
],
|
|
29
|
+
"dependencies": {
|
|
30
|
+
"jszip": "^3.10.1",
|
|
31
|
+
"tslib": "^2.3.0"
|
|
32
|
+
},
|
|
33
|
+
"peerDependencies": {
|
|
34
|
+
"@langchain/core": "0.3.72",
|
|
35
|
+
"@langchain/langgraph": "0.4.7",
|
|
36
|
+
"@nestjs/common": "^11.1.6",
|
|
37
|
+
"@nestjs/config": "^4.0.2",
|
|
38
|
+
"@xpert-ai/plugin-sdk": "^3.6.2",
|
|
39
|
+
"chalk": "4.1.2",
|
|
40
|
+
"zod": "3.25.67"
|
|
41
|
+
}
|
|
42
|
+
}
|