@helloxiaohu/plugin-mineru 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/mineru-toolset.strategy.d.ts +218 -0
- package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -0
- package/dist/lib/mineru-toolset.strategy.js +261 -0
- package/dist/lib/mineru.plugin.d.ts.map +1 -1
- package/dist/lib/mineru.plugin.js +2 -0
- package/dist/lib/mineru.tool.d.ts +33 -0
- package/dist/lib/mineru.tool.d.ts.map +1 -0
- package/dist/lib/mineru.tool.js +180 -0
- package/dist/lib/mineru.toolset.d.ts +25 -0
- package/dist/lib/mineru.toolset.d.ts.map +1 -0
- package/dist/lib/mineru.toolset.js +48 -0
- package/dist/lib/result-parser.service.d.ts +2 -2
- package/dist/lib/result-parser.service.d.ts.map +1 -1
- package/dist/lib/result-parser.service.js +69 -43
- package/package.json +1 -1
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { ConfigService } from '@nestjs/config';
|
|
2
|
+
import { BuiltinToolset, IToolsetStrategy, FileSystemPermission, ISchemaSecretField } from '@xpert-ai/plugin-sdk';
|
|
3
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
4
|
+
import { MinerUToolsetConfig } from './mineru.toolset.js';
|
|
5
|
+
/**
 * Declaration of the toolset strategy that exposes the MinerU document
 * parser. It carries the toolset metadata (labels, icon, config schema),
 * the permissions it requests, and factory methods for the toolset and
 * its tools.
 */
export declare class MinerUToolsetStrategy implements IToolsetStrategy<MinerUToolsetConfig> {
    private readonly configService;
    private readonly resultParser;
    /** Static description of the toolset, including its configuration schema. */
    meta: {
        author: string;
        tags: Array<string>;
        name: string;
        label: { en_US: string; zh_Hans: string; };
        description: { en_US: string; zh_Hans: string; };
        icon: {
            type: string;
            value: string;
            svg: string;
            color: string;
        };
        configSchema: {
            type: string;
            properties: {
                /** Base URL of the MinerU API. */
                apiUrl: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    default: string;
                };
                /** API key, rendered through a secret-input UI field. */
                apiKey: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    'x-ui': ISchemaSecretField;
                };
                /** OCR toggle, modelled as a string enum with labelled options. */
                isOcr: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    enum: Array<string>;
                    default: string;
                    'x-ui': {
                        enumLabels: {
                            true: { en_US: string; zh_Hans: string; };
                            false: { en_US: string; zh_Hans: string; };
                        };
                    };
                };
                /** Formula-recognition toggle, same shape as `isOcr`. */
                enableFormula: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    enum: Array<string>;
                    default: string;
                    'x-ui': {
                        enumLabels: {
                            true: { en_US: string; zh_Hans: string; };
                            false: { en_US: string; zh_Hans: string; };
                        };
                    };
                };
                /** Table-recognition toggle, same shape as `isOcr`. */
                enableTable: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    enum: Array<string>;
                    default: string;
                    'x-ui': {
                        enumLabels: {
                            true: { en_US: string; zh_Hans: string; };
                            false: { en_US: string; zh_Hans: string; };
                        };
                    };
                };
                /** Document language selector with `en` and `ch` options. */
                language: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    enum: Array<string>;
                    default: string;
                    'x-ui': {
                        enumLabels: {
                            en: { en_US: string; zh_Hans: string; };
                            ch: { en_US: string; zh_Hans: string; };
                        };
                    };
                };
                /** Model version selector with `pipeline` and `vlm` options. */
                modelVersion: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                    enum: Array<string>;
                    default: string;
                    'x-ui': {
                        enumLabels: {
                            pipeline: { en_US: string; zh_Hans: string; };
                            vlm: { en_US: string; zh_Hans: string; };
                        };
                    };
                };
                /** Free-form setting naming additional output formats. */
                extraFormats: {
                    type: string;
                    title: { en_US: string; zh_Hans: string; };
                    description: { en_US: string; zh_Hans: string; };
                };
            };
            required: Array<string>;
        };
    };
    /** File-system permissions requested by this toolset. */
    readonly permissions: Array<FileSystemPermission>;
    constructor(configService: ConfigService, resultParser: MinerUResultParserService);
    /** Checks a toolset configuration; a null or undefined config is accepted by the signature. */
    validateConfig(config: MinerUToolsetConfig | null | undefined): Promise<void>;
    /** Creates the toolset instance for the given configuration. */
    create(config: any): Promise<BuiltinToolset>;
    /**
     * Creates the tool list. Each tool takes a `doc_url` and an optional,
     * nullable `page_ranges`, and yields strings and/or an object holding
     * `files`, `taskId` and `metadata`.
     */
    createTools(): Array<import("@langchain/core/tools").DynamicStructuredTool<import("zod").ZodObject<{
        doc_url: import("zod").ZodString;
        page_ranges: import("zod").ZodNullable<import("zod").ZodOptional<import("zod").ZodString>>;
    }, "strip", import("zod").ZodTypeAny, {
        doc_url?: string;
        page_ranges?: string;
    }, {
        doc_url?: string;
        page_ranges?: string;
    }>, {
        doc_url?: string;
        page_ranges?: string;
    }, {
        doc_url?: string;
        page_ranges?: string;
    }, Array<string | {
        files: Array<any>;
        taskId: string;
        metadata: any;
    }>>>;
}
|
|
218
|
+
//# sourceMappingURL=mineru-toolset.strategy.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru-toolset.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/mineru-toolset.strategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EACL,cAAc,EACd,gBAAgB,EAEhB,oBAAoB,EACpB,kBAAkB,EACnB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AACvE,OAAO,EAAiB,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AASzE,qBAEa,qBAAsB,YAAW,gBAAgB,CAAC,mBAAmB,CAAC;IA6M/E,OAAO,CAAC,QAAQ,CAAC,aAAa;IAE9B,OAAO,CAAC,QAAQ,CAAC,YAAY;IA9M/B,IAAI;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;4BA4Ca,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAoJjC;IAEF,QAAQ,CAAC,WAAW,yBAMlB;gBAIiB,aAAa,EAAE,aAAa,EAE5B,YAAY,EAAE,yBAAyB;IAG1D,cAAc,CAAC,MAAM,EAAE,mBAAmB,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC;IAWvE,MAAM,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,cAAc,CAAC;IAwBlD,WAAW;;;;;;;;;;;;;;;;;;;;CAiBZ"}
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import { __decorate, __metadata, __param } from "tslib";
|
|
2
|
+
import { Injectable, forwardRef, Inject } from '@nestjs/common';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { ToolsetStrategy, } from '@xpert-ai/plugin-sdk';
|
|
5
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
6
|
+
import { MinerUToolset } from './mineru.toolset.js';
|
|
7
|
+
import { MinerU, icon } from './types.js';
|
|
8
|
+
import { buildMinerUTool } from './mineru.tool.js';
|
|
9
|
+
/**
 * Build the config-schema entry for an on/off setting.
 *
 * The setting is modelled as a string enum (`'true'` / `'false'`) that
 * defaults to `'true'` and carries localized "Enabled" / "Disabled" labels.
 *
 * @param title Localized title object (`en_US`, `zh_Hans`).
 * @param description Localized description object (`en_US`, `zh_Hans`).
 * @returns A fresh schema entry; nothing is shared between calls.
 */
function createMinerUToggleField(title, description) {
    return {
        type: 'string',
        title,
        description,
        enum: ['true', 'false'],
        default: 'true',
        'x-ui': {
            enumLabels: {
                'true': {
                    en_US: 'Enabled',
                    zh_Hans: '启用',
                },
                'false': {
                    en_US: 'Disabled',
                    zh_Hans: '禁用',
                },
            },
        },
    };
}
/**
 * Toolset strategy for the MinerU document parser.
 *
 * Holds the toolset metadata (labels, icon, configuration schema), the
 * file-system permissions it requests, and the factories that create the
 * toolset and its tool.
 */
let MinerUToolsetStrategy = class MinerUToolsetStrategy {
    /**
     * @param configService Injected configuration service, forwarded to the toolset and tool.
     * @param resultParser Injected MinerU result parser, forwarded to the toolset and tool.
     */
    constructor(configService, resultParser) {
        this.configService = configService;
        this.resultParser = resultParser;
        this.meta = {
            author: 'Xpert AI',
            tags: ['pdf', 'markdown', 'parser', 'ocr', 'mineru', 'document', 'extraction'],
            name: MinerU,
            label: {
                en_US: 'MinerU PDF Parser',
                zh_Hans: 'MinerU PDF 解析器',
            },
            description: {
                en_US: 'Convert documents to markdown format using MinerU. Supports OCR, formula recognition, and table extraction.',
                zh_Hans: '使用 MinerU 将文档转换为 Markdown 格式。支持 OCR、公式识别和表格提取。',
            },
            icon: {
                type: 'svg',
                value: icon,
                svg: icon,
                color: '#14b8a6',
            },
            configSchema: {
                type: 'object',
                properties: {
                    apiUrl: {
                        type: 'string',
                        title: {
                            en_US: 'Base URL',
                            zh_Hans: 'Base URL',
                        },
                        description: {
                            en_US: 'MinerU API base url. Official: https://mineru.net/api/v4',
                            zh_Hans: 'MinerU 服务地址。官方: https://mineru.net/api/v4',
                        },
                        default: 'https://mineru.net/api/v4',
                    },
                    apiKey: {
                        type: 'string',
                        title: {
                            en_US: 'API Key',
                            zh_Hans: 'API Key',
                        },
                        description: {
                            en_US: 'The API Key of the MinerU server (required)',
                            zh_Hans: 'MinerU 服务令牌(必填)',
                        },
                        'x-ui': {
                            component: 'secretInput',
                            label: 'API Key',
                            placeholder: 'MinerU API Key',
                            revealable: true,
                            maskSymbol: '*',
                            persist: true,
                        },
                    },
                    isOcr: createMinerUToggleField({
                        en_US: 'Enable OCR',
                        zh_Hans: '启用 OCR',
                    }, {
                        en_US: 'Enable OCR for image-based documents',
                        zh_Hans: '为基于图像的文档启用 OCR',
                    }),
                    enableFormula: createMinerUToggleField({
                        en_US: 'Enable Formula Recognition',
                        zh_Hans: '启用公式识别',
                    }, {
                        en_US: 'Enable formula recognition',
                        zh_Hans: '启用公式识别',
                    }),
                    enableTable: createMinerUToggleField({
                        en_US: 'Enable Table Recognition',
                        zh_Hans: '启用表格识别',
                    }, {
                        en_US: 'Enable table recognition',
                        zh_Hans: '启用表格识别',
                    }),
                    language: {
                        type: 'string',
                        title: {
                            en_US: 'Document Language',
                            zh_Hans: '文档语言',
                        },
                        description: {
                            en_US: 'Document language: "en" for English, "ch" for Chinese (default: "ch")',
                            zh_Hans: '文档语言:"en" 表示英语,"ch" 表示中文(默认:"ch")',
                        },
                        enum: ['en', 'ch'],
                        default: 'ch',
                        'x-ui': {
                            enumLabels: {
                                'en': {
                                    en_US: 'en',
                                    zh_Hans: '英文',
                                },
                                'ch': {
                                    en_US: 'ch',
                                    zh_Hans: '中文',
                                },
                            },
                        },
                    },
                    modelVersion: {
                        type: 'string',
                        title: {
                            en_US: 'Model Version',
                            zh_Hans: '模型版本',
                        },
                        description: {
                            en_US: 'Model version: "pipeline" or "vlm" (default: "pipeline")',
                            zh_Hans: '模型版本:"pipeline" 或 "vlm"(默认:"pipeline")',
                        },
                        enum: ['pipeline', 'vlm'],
                        default: 'pipeline',
                        'x-ui': {
                            enumLabels: {
                                'pipeline': {
                                    en_US: 'pipeline',
                                    zh_Hans: 'pipeline',
                                },
                                'vlm': {
                                    en_US: 'vlm',
                                    zh_Hans: 'vlm',
                                },
                            },
                        },
                    },
                    extraFormats: {
                        type: 'string',
                        title: {
                            en_US: 'Extra Formats',
                            zh_Hans: '额外输出格式',
                        },
                        description: {
                            en_US: 'Optional extra formats, comma-separated (docx, html, latex).',
                            zh_Hans: '可选额外输出格式,逗号分隔(docx、html、latex)。',
                        },
                    },
                },
                required: ['apiKey'],
            },
        };
        this.permissions = [
            {
                type: 'filesystem',
                operations: ['read', 'write', 'list'],
                scope: [],
            },
        ];
    }
    /**
     * Validate a toolset configuration.
     *
     * A missing config is accepted as-is. Declared `async` so that a failed
     * check surfaces as a rejected promise, matching the `Promise<void>`
     * contract, instead of a synchronous throw that `.catch()` callers miss.
     *
     * @param config Configuration to check; may be null or undefined.
     * @returns A promise that resolves when the config is acceptable.
     * @throws {Error} (as a rejection) when a config is given without `apiKey`.
     */
    async validateConfig(config) {
        if (!config) {
            return;
        }
        if (!config.apiKey) {
            throw new Error('MinerU apiKey is required');
        }
    }
    /**
     * Create a `MinerUToolset` from the supplied configuration.
     *
     * Accepts either an object carrying a `credentials` property or the bare
     * settings object itself; the settings are read from whichever is present
     * and combined with the injected services.
     *
     * @param config Toolset record or settings object; may be nullish.
     * @returns The newly constructed toolset.
     */
    async create(config) {
        const toolset = config && typeof config === 'object' && 'credentials' in config
            ? config
            : null;
        // Prefer the nested credentials, then the config itself, then no settings.
        const creds = toolset?.credentials ?? config ?? {};
        // NOTE(review): `fileSystem` is not forwarded here even though the
        // toolset reads it in `initTools` — confirm whether that is intended.
        const configWithDependencies = {
            apiUrl: creds.apiUrl,
            apiKey: creds.apiKey,
            extraFormats: creds.extraFormats,
            isOcr: creds.isOcr,
            enableFormula: creds.enableFormula,
            enableTable: creds.enableTable,
            language: creds.language,
            modelVersion: creds.modelVersion,
            configService: this.configService,
            resultParser: this.resultParser,
        };
        return new MinerUToolset(configWithDependencies);
    }
    /**
     * Create the tool list without integration options or a file system,
     * using fixed defaults (all toggles on, language `ch`, model `pipeline`).
     *
     * @returns A single-element array holding the MinerU tool.
     */
    createTools() {
        return [
            buildMinerUTool(this.configService, this.resultParser, undefined, undefined, {
                isOcr: true,
                enableFormula: true,
                enableTable: true,
                language: 'ch',
                modelVersion: 'pipeline',
            }),
        ];
    }
};
MinerUToolsetStrategy = __decorate([
    Injectable(),
    ToolsetStrategy(MinerU),
    __param(0, Inject(forwardRef(() => ConfigService))),
    __param(1, Inject(MinerUResultParserService)),
    __metadata("design:paramtypes", [ConfigService,
        MinerUResultParserService])
], MinerUToolsetStrategy);
|
|
261
|
+
export { MinerUToolsetStrategy };
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;
|
|
1
|
+
{"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAQ/F,qBAkBa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAExE,OAAO,CAAC,UAAU,CAAQ;IAE1B;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKvC"}
|
|
@@ -7,6 +7,7 @@ import { MinerUTransformerStrategy } from './transformer-mineru.strategy.js';
|
|
|
7
7
|
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
8
|
import { MinerUIntegrationStrategy } from './integration.strategy.js';
|
|
9
9
|
import { MinerUController } from './mineru.controller.js';
|
|
10
|
+
import { MinerUToolsetStrategy } from './mineru-toolset.strategy.js';
|
|
10
11
|
let MinerUPlugin = MinerUPlugin_1 = class MinerUPlugin {
|
|
11
12
|
constructor() {
|
|
12
13
|
// We disable by default additional logging for each event to avoid cluttering the logs
|
|
@@ -41,6 +42,7 @@ MinerUPlugin = MinerUPlugin_1 = __decorate([
|
|
|
41
42
|
MinerUIntegrationStrategy,
|
|
42
43
|
MinerUTransformerStrategy,
|
|
43
44
|
MinerUResultParserService,
|
|
45
|
+
MinerUToolsetStrategy,
|
|
44
46
|
],
|
|
45
47
|
controllers: [
|
|
46
48
|
MinerUController
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
+
import { MinerUIntegrationOptions } from './types.js';
|
|
6
|
+
/**
 * Parsing defaults that can be handed to `buildMinerUTool`.
 * Every field is optional; the three toggles accept either a boolean or a
 * string.
 */
export interface MinerUToolDefaults {
    /** OCR toggle. */
    isOcr?: string | boolean;
    /** Formula-recognition toggle. */
    enableFormula?: string | boolean;
    /** Table-recognition toggle. */
    enableTable?: string | boolean;
    /** Document language code. */
    language?: 'ch' | 'en';
    /** Model version to request. */
    modelVersion?: 'vlm' | 'pipeline';
}
|
|
13
|
+
/**
 * Builds the MinerU document-parsing tool.
 *
 * The tool's input schema has a required `doc_url` string and an optional,
 * nullable `page_ranges` string. Its output is an array whose entries are
 * strings and/or an object holding `files`, `taskId` and `metadata`.
 *
 * @param configService Configuration service handed to the tool.
 * @param resultParser Parser service handed to the tool.
 * @param options Optional integration options.
 * @param fileSystem Optional file system handle.
 * @param defaults Optional parsing defaults.
 */
export declare function buildMinerUTool(
    configService: ConfigService,
    resultParser: MinerUResultParserService,
    options?: MinerUIntegrationOptions,
    fileSystem?: XpFileSystem,
    defaults?: MinerUToolDefaults,
): import("@langchain/core/tools").DynamicStructuredTool<z.ZodObject<{
    doc_url: z.ZodString;
    page_ranges: z.ZodNullable<z.ZodOptional<z.ZodString>>;
}, "strip", z.ZodTypeAny, {
    doc_url?: string;
    page_ranges?: string;
}, {
    doc_url?: string;
    page_ranges?: string;
}>, {
    doc_url?: string;
    page_ranges?: string;
}, {
    doc_url?: string;
    page_ranges?: string;
}, Array<string | {
    files: Array<any>;
    taskId: string;
    metadata: any;
}>>;
|
|
33
|
+
//# sourceMappingURL=mineru.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.tool.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.tool.ts"],"names":[],"mappings":"AAEA,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAU,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAE9D,MAAM,WAAW,kBAAkB;IACjC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAiBD,wBAAgB,eAAe,CAC7B,aAAa,EAAE,aAAa,EAC5B,YAAY,EAAE,yBAAyB,EACvC,OAAO,CAAC,EAAE,wBAAwB,EAClC,UAAU,CAAC,EAAE,YAAY,EACzB,QAAQ,CAAC,EAAE,kBAAkB;;;;;;;;;;;;;;;;;;;MAsM9B"}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { tool } from '@langchain/core/tools';
|
|
2
|
+
import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
3
|
+
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import { MinerUClient } from './mineru.client.js';
|
|
6
|
+
import { MinerU } from './types.js';
|
|
7
|
+
/**
 * Normalize the `extraFormats` setting into a clean list of format names.
 *
 * Accepts either an array of values or a single comma-separated value.
 * Every entry is stringified and trimmed, and blank entries are dropped.
 *
 * @param value Raw setting: an array, a comma-separated string, or nothing.
 * @returns The non-empty list of format names, or `undefined` when the
 *          input is falsy or yields no usable entry.
 */
function normalizeExtraFormats(value) {
    if (!value) {
        return undefined;
    }
    // Stringify a scalar before splitting (the array branch already
    // stringifies its items), so a non-string value coming from untyped
    // configuration cannot throw on `.split`.
    const candidates = Array.isArray(value) ? value : String(value).split(',');
    const formats = candidates.map((item) => String(item).trim()).filter(Boolean);
    return formats.length > 0 ? formats : undefined;
}
|
|
21
|
+
/** MIME types reported for image assets, keyed by lower-case file extension. */
const MINERU_IMAGE_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
    ['svg', 'image/svg+xml'],
]);
/** MIME types reported for non-image file assets, keyed by lower-case file extension. */
const MINERU_FILE_MIME_TYPES = new Map([
    ['md', 'text/markdown'],
    ['json', 'application/json'],
    ['html', 'text/html'],
    ['docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
    ['tex', 'application/x-tex'],
]);
/**
 * Resolve an on/off default that may arrive as a boolean or a string.
 *
 * @param value Raw setting.
 * @returns `true` when the setting is `undefined`, `true`, or the string
 *          `'true'`; `false` for anything else.
 */
function resolveMinerUFlag(value) {
    if (value === undefined) {
        return true;
    }
    return typeof value === 'string' ? value === 'true' : value === true;
}
/**
 * Describe one parsed asset as a file artifact.
 *
 * The file name comes from the asset's `filePath` when present, otherwise
 * from its `url`, otherwise it is `'file'`.
 *
 * @param asset Asset entry with optional `filePath` / `url` and a `type`.
 * @returns `{ fileName, filePath, fileUrl, mimeType, extension }`.
 */
function describeMinerUAsset(asset) {
    const nameFromPath = asset.filePath?.split(/[/\\]/).pop();
    // Cut the query string / fragment off the URL first, so a signed URL such
    // as ".../page.png?token=..." still yields "page.png" and extension "png".
    const nameFromUrl = asset.url?.split(/[?#]/)[0].split('/').pop();
    const fileName = nameFromPath || nameFromUrl || 'file';
    const extension = fileName.split('.').pop()?.toLowerCase() || 'md';
    // Unknown extensions keep the previous fallbacks: JPEG for images and
    // JSON for any other file.
    const mimeType = asset.type === 'image'
        ? (MINERU_IMAGE_MIME_TYPES.get(extension) ?? 'image/jpeg')
        : (MINERU_FILE_MIME_TYPES.get(extension) ?? 'application/json');
    return {
        fileName: fileName,
        filePath: asset.filePath,
        fileUrl: asset.url,
        mimeType: mimeType,
        extension: extension,
    };
}
/**
 * Build the `mineru_pdf_parser` tool.
 *
 * When invoked, the tool creates a MinerU task for `doc_url`, obtains the
 * result (directly for a self-hosted server, otherwise by waiting for the
 * task and parsing the returned zip URL), and returns the joined markdown
 * together with an artifact listing the extracted files.
 *
 * @param configService Configuration service passed to `MinerUClient`.
 * @param resultParser Parser used to turn the task result into chunks and metadata.
 * @param options Optional integration options; `apiUrl`, `apiKey` and `extraFormats` are read.
 * @param fileSystem Optional file system handle passed to the client and the parser.
 * @param defaults Optional parsing defaults (toggles, language, model version).
 * @returns The tool created by `tool(...)` with `content_and_artifact` response format.
 */
export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
    return tool(async (input) => {
        try {
            const { doc_url, page_ranges } = input;
            console.debug('[MinerU] tool invoked with input', {
                doc_url,
                extraKeys: Object.keys(input || {}).filter((key) => key !== 'doc_url'),
            });
            if (!doc_url) {
                throw new Error('doc_url is required');
            }
            const currentState = getCurrentTaskInput();
            const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
            const finalApiUrl = options?.apiUrl || 'https://mineru.net/api/v4';
            const finalApiKey = options?.apiKey;
            // Never log the key itself: show only its first and last four
            // characters when it is long enough, otherwise just whether it is set.
            let maskedKey = 'missing';
            if (finalApiKey) {
                maskedKey = finalApiKey.length > 8
                    ? `${finalApiKey.slice(0, 4)}***${finalApiKey.slice(-4)}`
                    : 'provided';
            }
            console.debug('[MinerU] buildMinerUTool config', {
                fromOptions: Boolean(options),
                apiUrl: finalApiUrl,
                apiKey: maskedKey,
                defaults: {
                    isOcr: defaults?.isOcr,
                    enableFormula: defaults?.enableFormula,
                    enableTable: defaults?.enableTable,
                    language: defaults?.language,
                    modelVersion: defaults?.modelVersion,
                },
            });
            const finalIsOcr = resolveMinerUFlag(defaults?.isOcr);
            const finalEnableFormula = resolveMinerUFlag(defaults?.enableFormula);
            const finalEnableTable = resolveMinerUFlag(defaults?.enableTable);
            const finalLanguage = defaults?.language || 'ch';
            const finalModelVersion = defaults?.modelVersion || 'pipeline';
            const finalExtraFormats = normalizeExtraFormats(options?.extraFormats);
            const effectiveOptions = {
                apiUrl: finalApiUrl,
                apiKey: finalApiKey,
                extraFormats: finalExtraFormats,
            };
            const integration = {
                provider: MinerU,
                options: effectiveOptions,
            };
            const mineruClient = new MinerUClient(configService, {
                fileSystem,
                integration,
            });
            // Derive a file name from the URL path; keep the generic name when
            // the URL cannot be parsed or its path ends with a slash.
            let finalFileName = 'document.pdf';
            try {
                const parsed = new URL(doc_url);
                finalFileName = parsed.pathname.split('/').pop() || 'document.pdf';
            }
            catch {
                // Ignore URL parsing errors.
            }
            const { taskId } = await mineruClient.createTask({
                url: doc_url,
                fileName: finalFileName,
                isOcr: finalIsOcr,
                enableFormula: finalEnableFormula,
                enableTable: finalEnableTable,
                language: finalLanguage,
                modelVersion: finalModelVersion,
                pageRanges: page_ranges ?? undefined,
                extraFormats: finalExtraFormats,
            });
            const documentInfo = {
                fileUrl: doc_url,
                name: finalFileName,
                folder: workspacePath,
            };
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                const taskResult = mineruClient.getSelfHostedTask(taskId);
                if (!taskResult) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(taskResult, taskId, documentInfo, fileSystem);
            }
            else {
                // NOTE(review): the two numbers are presumably the timeout and
                // the polling interval in milliseconds — confirm against MinerUClient.
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                const fullZipUrl = result.full_zip_url;
                parsedResult = await resultParser.parseFromUrl(fullZipUrl, taskId, documentInfo, fileSystem);
                if (fullZipUrl) {
                    // Expose the archive URL under both spellings without
                    // overwriting a value the parser already set.
                    parsedResult.metadata = parsedResult.metadata ?? {};
                    parsedResult.metadata.fullZipUrl = parsedResult.metadata.fullZipUrl ?? fullZipUrl;
                    parsedResult.metadata.full_zip_url = parsedResult.metadata.full_zip_url ?? fullZipUrl;
                }
            }
            const fileArtifacts = [];
            if (parsedResult.metadata?.assets) {
                for (const asset of parsedResult.metadata.assets) {
                    if (asset.type === 'file' || asset.type === 'image') {
                        fileArtifacts.push(describeMinerUAsset(asset));
                    }
                }
            }
            const markdownContent = parsedResult.chunks?.map((chunk) => chunk.pageContent).join('\n\n') || '';
            return [
                markdownContent,
                {
                    files: fileArtifacts,
                    taskId,
                    metadata: parsedResult.metadata,
                },
            ];
        }
        catch (error) {
            throw new Error(`MinerU processing failed: ${getErrorMessage(error)}`);
        }
    }, {
        name: 'mineru_pdf_parser',
        description: 'Convert documents to markdown using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files.',
        schema: z.object({
            doc_url: z.string().min(1).describe('Document URL (required)'),
            page_ranges: z
                .string()
                .optional()
                .nullable()
                .describe('Page ranges like "2,4-6" or "2--2"'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
|
|
2
|
+
import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
+
/**
 * Configuration accepted by `MinerUToolset`: connection settings, parsing
 * options, and the service instances the toolset depends on. Every field
 * is optional.
 */
export interface MinerUToolsetConfig {
    /** Base URL of the MinerU API. */
    apiUrl?: string;
    /** API key for the MinerU server. */
    apiKey?: string;
    /** Extra output formats, as one string or a list of strings. */
    extraFormats?: Array<string> | string;
    /** File system handle handed to the tool. */
    fileSystem?: XpFileSystem;
    /** Configuration service handed to the tool. */
    configService?: ConfigService;
    /** Result parser handed to the tool. */
    resultParser?: MinerUResultParserService;
    /** OCR toggle, as a boolean or a string. */
    isOcr?: string | boolean;
    /** Formula-recognition toggle, as a boolean or a string. */
    enableFormula?: string | boolean;
    /** Table-recognition toggle, as a boolean or a string. */
    enableTable?: string | boolean;
    /** Document language code. */
    language?: 'ch' | 'en';
    /** Model version to request. */
    modelVersion?: 'vlm' | 'pipeline';
}
|
|
18
|
+
/**
 * Declaration of the MinerU toolset: a `BuiltinToolset` specialised for
 * `StructuredToolInterface` tools and `MinerUToolsetConfig` settings.
 */
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
    private readonly config;
    /** Tools held by this toolset. */
    tools: Array<any>;
    constructor(config: MinerUToolsetConfig);
    /** Credential validation hook. */
    _validateCredentials(_credentials: MinerUToolsetConfig): Promise<void>;
    /** Builds the toolset's tools and resolves with them. */
    initTools(): Promise<Array<StructuredToolInterface<ToolSchemaBase, any, any>>>;
}
|
|
25
|
+
//# sourceMappingURL=mineru.toolset.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IACjC,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,KAAK,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,CAAC;IAC/B,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IACpC,KAAK,EAAE,GAAG,EAAE,CAAM;gBAEf,MAAM,EAAE,mBAAmB;IAiBxB,oBAAoB,CAAC,YAAY,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAItE,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA+CzF"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
import { buildMinerUTool } from './mineru.tool.js';
|
|
3
|
+
/**
 * Toolset that exposes the MinerU document parser tool.
 *
 * The configuration carries both plain settings (API URL/key, parsing
 * options) and the service instances the tool needs.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * @param config Toolset settings plus the `configService`,
     *               `resultParser` and optional `fileSystem` instances.
     */
    constructor(config) {
        super('mineru', undefined, config);
        this.tools = [];
        this.config = config;
        const configForLog = { ...config };
        // Log plain settings only. The injected service instances are not
        // settings, and serialising them could dump their internal state
        // into the log or fail on circular references.
        delete configForLog.configService;
        delete configForLog.resultParser;
        delete configForLog.fileSystem;
        if (configForLog.apiKey) {
            // Mask the key: first and last four characters for long keys,
            // a fixed placeholder for short ones.
            configForLog.apiKey =
                configForLog.apiKey.length > 8
                    ? `${configForLog.apiKey.substring(0, 4)}...${configForLog.apiKey.substring(configForLog.apiKey.length - 4)}`
                    : '***';
        }
        if ('logger' in this && this.logger) {
            this.logger.log(`[MinerU] MinerUToolset constructor received config: ${JSON.stringify(configForLog, null, 2)}`);
        }
    }
    /**
     * Credential validation hook; intentionally does nothing.
     *
     * @param _credentials Ignored.
     */
    async _validateCredentials(_credentials) {
        // No validation during authorization phase.
    }
    /**
     * Build the MinerU tool from the stored configuration and keep it in
     * `this.tools`.
     *
     * @returns The tool list (a single MinerU tool).
     * @throws {Error} When `configService` or `resultParser` is missing.
     */
    async initTools() {
        const { configService, resultParser, apiUrl, apiKey, extraFormats, fileSystem, isOcr, enableFormula, enableTable, language, modelVersion, } = this.config;
        if (!configService || !resultParser) {
            throw new Error('ConfigService and MinerUResultParserService are required');
        }
        // An unset toggle means "enabled": the config schema declares 'true'
        // as the default for all three, and buildMinerUTool likewise treats
        // an undefined toggle as enabled. Only an explicit value other than
        // true / 'true' switches a feature off.
        const resolveToggle = (value) => value === undefined || value === null
            ? true
            : value === 'true' || value === true;
        const finalApiUrl = apiUrl || 'https://mineru.net/api/v4';
        const finalIsOcr = resolveToggle(isOcr);
        const finalEnableFormula = resolveToggle(enableFormula);
        const finalEnableTable = resolveToggle(enableTable);
        const finalLanguage = language || 'ch';
        const finalModelVersion = modelVersion || 'pipeline';
        this.tools = [
            buildMinerUTool(configService, resultParser, {
                apiUrl: finalApiUrl,
                apiKey,
                extraFormats,
            }, fileSystem, {
                isOcr: finalIsOcr,
                enableFormula: finalEnableFormula,
                enableTable: finalEnableTable,
                language: finalLanguage,
                modelVersion: finalModelVersion,
            }),
        ];
        return this.tools;
    }
}
|
|
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
|
4
4
|
import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
|
|
5
5
|
export declare class MinerUResultParserService {
|
|
6
6
|
private readonly logger;
|
|
7
|
-
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
7
|
+
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
|
|
8
8
|
id?: string;
|
|
9
9
|
chunks: Document<ChunkMetadata>[];
|
|
10
10
|
metadata: MinerUDocumentMetadata;
|
|
11
11
|
}>;
|
|
12
|
-
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
12
|
+
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
|
|
13
13
|
id?: string;
|
|
14
14
|
chunks: Document<ChunkMetadata>[];
|
|
15
15
|
metadata: MinerUDocumentMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,
|
|
1
|
+
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAmGI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CA2DH"}
|
|
@@ -37,43 +37,59 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
37
37
|
zipEntries.push({ entryName: entry.path, data });
|
|
38
38
|
const fileName = entry.path;
|
|
39
39
|
const filePath = join(document.folder || '', entry.path);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
40
|
+
if (fileSystem) {
|
|
41
|
+
const url = await fileSystem.writeFile(filePath, data);
|
|
42
|
+
pathMap.set(fileName, url);
|
|
43
|
+
// Write images to local file system
|
|
44
|
+
if (fileName.startsWith('images/')) {
|
|
45
|
+
assets.push({
|
|
46
|
+
type: 'image',
|
|
47
|
+
url: url,
|
|
48
|
+
filePath: filePath,
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
else if (fileName.endsWith('layout.json')) {
|
|
52
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
53
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
54
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
55
|
+
assets.push({
|
|
56
|
+
type: 'file',
|
|
57
|
+
url,
|
|
58
|
+
filePath: filePath,
|
|
59
|
+
});
|
|
60
|
+
}
|
|
61
|
+
else if (fileName.endsWith('content_list.json')) {
|
|
62
|
+
assets.push({
|
|
63
|
+
type: 'file',
|
|
64
|
+
url,
|
|
65
|
+
filePath: filePath,
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
else if (fileName.endsWith('full.md')) {
|
|
69
|
+
fullMd = data.toString('utf-8');
|
|
70
|
+
assets.push({
|
|
71
|
+
type: 'file',
|
|
72
|
+
url,
|
|
73
|
+
filePath: filePath,
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
77
|
+
metadata.originPdfUrl = fileName;
|
|
78
|
+
}
|
|
74
79
|
}
|
|
75
|
-
else
|
|
76
|
-
metadata
|
|
80
|
+
else {
|
|
81
|
+
// Still extract key metadata & markdown without writing to filesystem.
|
|
82
|
+
if (fileName.endsWith('layout.json')) {
|
|
83
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
84
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
85
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
86
|
+
}
|
|
87
|
+
else if (fileName.endsWith('full.md')) {
|
|
88
|
+
fullMd = data.toString('utf-8');
|
|
89
|
+
}
|
|
90
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
91
|
+
metadata.originPdfUrl = fileName;
|
|
92
|
+
}
|
|
77
93
|
}
|
|
78
94
|
}
|
|
79
95
|
metadata.assets = assets;
|
|
@@ -102,13 +118,23 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
102
118
|
const pathMap = new Map();
|
|
103
119
|
for (const image of result.images) {
|
|
104
120
|
const filePath = join(document.folder || '', 'images', image.name);
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
121
|
+
if (fileSystem) {
|
|
122
|
+
const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
|
|
123
|
+
pathMap.set(`images/${image.name}`, url);
|
|
124
|
+
assets.push({
|
|
125
|
+
type: 'image',
|
|
126
|
+
url: url,
|
|
127
|
+
filePath: filePath,
|
|
128
|
+
});
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
131
|
+
pathMap.set(`images/${image.name}`, image.dataUrl);
|
|
132
|
+
assets.push({
|
|
133
|
+
type: 'image',
|
|
134
|
+
url: image.dataUrl,
|
|
135
|
+
filePath: filePath,
|
|
136
|
+
});
|
|
137
|
+
}
|
|
112
138
|
}
|
|
113
139
|
if (result.sourceUrl) {
|
|
114
140
|
assets.push({
|