@helloxiaohu/plugin-mineru6 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +40 -0
- package/dist/lib/integration.strategy.d.ts +10 -0
- package/dist/lib/integration.strategy.d.ts.map +1 -0
- package/dist/lib/integration.strategy.js +118 -0
- package/dist/lib/mineru-toolset.strategy.d.ts +221 -0
- package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -0
- package/dist/lib/mineru-toolset.strategy.js +236 -0
- package/dist/lib/mineru.client.d.ts +120 -0
- package/dist/lib/mineru.client.d.ts.map +1 -0
- package/dist/lib/mineru.client.js +456 -0
- package/dist/lib/mineru.controller.d.ts +9 -0
- package/dist/lib/mineru.controller.d.ts.map +1 -0
- package/dist/lib/mineru.controller.js +41 -0
- package/dist/lib/mineru.plugin.d.ts +13 -0
- package/dist/lib/mineru.plugin.d.ts.map +1 -0
- package/dist/lib/mineru.plugin.js +52 -0
- package/dist/lib/mineru.tool.d.ts +75 -0
- package/dist/lib/mineru.tool.d.ts.map +1 -0
- package/dist/lib/mineru.tool.js +141 -0
- package/dist/lib/mineru.toolset.d.ts +51 -0
- package/dist/lib/mineru.toolset.d.ts.map +1 -0
- package/dist/lib/mineru.toolset.js +52 -0
- package/dist/lib/path-meta.d.ts +5 -0
- package/dist/lib/path-meta.d.ts.map +1 -0
- package/dist/lib/path-meta.js +8 -0
- package/dist/lib/result-parser.service.d.ts +18 -0
- package/dist/lib/result-parser.service.d.ts.map +1 -0
- package/dist/lib/result-parser.service.js +171 -0
- package/dist/lib/transformer-mineru.strategy.d.ts +95 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -0
- package/dist/lib/transformer-mineru.strategy.js +163 -0
- package/dist/lib/types.d.ts +53 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +40 -0
- package/package.json +62 -0
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
import { BadRequestException, Logger } from '@nestjs/common';
|
|
2
|
+
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
3
|
+
import axios from 'axios';
|
|
4
|
+
import FormData from 'form-data';
|
|
5
|
+
import { randomUUID } from 'crypto';
|
|
6
|
+
import { basename } from 'path';
|
|
7
|
+
import fs from 'fs';
|
|
8
|
+
import { ENV_MINERU_API_BASE_URL, ENV_MINERU_API_TOKEN, ENV_MINERU_SERVER_TYPE, } from './types.js';
|
|
9
|
+
const DEFAULT_OFFICIAL_BASE_URL = 'https://mineru.net/api/v4';
|
|
10
|
+
/**
 * Client for the MinerU document-parsing service.
 *
 * Two deployment flavours are supported, selected by `serverType`:
 *  - `official`: the hosted MinerU API (asynchronous task lifecycle, bearer token required);
 *  - `self-hosted`: a MinerU server that parses an uploaded file within the request; the parsed
 *    result is cached in `localTasks` under a locally generated task id.
 */
export class MinerUClient {
    /** File-system handle taken from the host permissions, or `undefined` when none was granted. */
    get fileSystem() {
        return this.permissions?.fileSystem;
    }

    /**
     * Resolve server type, base URL and token (integration options first, then configuration)
     * and fail fast on an unusable configuration.
     *
     * @param configService Configuration source exposing `get(key)`.
     * @param permissions Optional host permissions; `integration` and `fileSystem` are read from it.
     * @throws {Error} If no base URL can be resolved, or an official deployment has no token.
     */
    constructor(configService, permissions) {
        this.configService = configService;
        this.permissions = permissions;
        this.logger = new Logger(MinerUClient.name);
        // Results of self-hosted parses keyed by locally generated task id.
        // NOTE(review): nothing in this class evicts entries, so the map (including image data)
        // grows for as long as the client instance is alive.
        this.localTasks = new Map();
        const integration = this.permissions?.integration;
        // serverType must be known before resolveCredentials(), which reads it.
        this.serverType = this.resolveServerType(integration);
        const { baseUrl, token } = this.resolveCredentials(integration);
        if (!baseUrl) {
            throw new Error('MinerU base URL is required');
        }
        this.baseUrl = this.normalizeBaseUrl(baseUrl);
        this.token = token;
        if (this.serverType === 'official' && !this.token) {
            throw new Error('MinerU official API requires an access token');
        }
    }

    /**
     * Create a MinerU extraction task. For self-hosted deployments the file is uploaded immediately
     * and the parsed result cached locally, while official deployments follow the async task lifecycle.
     *
     * @param options Task options; `url` is mandatory for both flavours.
     * @returns {Promise<{taskId: string}>} Identifier of the created (or locally cached) task.
     * @throws {Error} If `options.url` is missing or the underlying request fails.
     */
    async createTask(options) {
        if (!options.url) {
            throw new Error('MinerU createTask requires a document URL');
        }
        if (this.serverType === 'self-hosted') {
            return this.createSelfHostedTask(options);
        }
        return this.createOfficialTask(options);
    }

    /**
     * Create a batch MinerU extraction task. Only supported for official MinerU deployments.
     *
     * @param options Batch options; `files` is a non-empty array of `{ url, isOcr?, dataId?, pageRanges? }`.
     * @returns {Promise<{batchId: string, fileUrls: any}>} Values of `batch_id` and `file_urls` from the API.
     * @throws {Error} On a self-hosted deployment, an empty file list, or an API failure.
     */
    async createBatchTask(options) {
        this.ensureOfficial('createBatchTask');
        if (!Array.isArray(options?.files) || options.files.length === 0) {
            throw new Error('MinerU createBatchTask requires at least one file');
        }
        const url = this.buildApiUrl('extract', 'task', 'batch');
        const body = {
            files: options.files.map((file) => this.buildOfficialFileEntry(file)),
            ...this.buildOfficialSharedOptions(options),
        };
        const data = await this.requestOfficial('createBatchTask', () => axios.post(url, body, { headers: this.getOfficialHeaders() }));
        return { batchId: data.batch_id, fileUrls: data.file_urls };
    }

    /**
     * Look up the cached result of a self-hosted parse.
     *
     * @param taskId Identifier returned by `createTask` on a self-hosted deployment.
     * @returns The cached result, or `undefined` when the id is unknown.
     * @throws {Error} When the client is not configured as self-hosted.
     */
    getSelfHostedTask(taskId) {
        if (this.serverType !== 'self-hosted') {
            throw new Error('getSelfHostedTask is only available for self-hosted MinerU deployments');
        }
        return this.localTasks.get(taskId);
    }

    /**
     * Query official task status or results.
     *
     * @param taskId Task identifier; must be non-empty, otherwise the collection endpoint would be queried.
     * @param options Optional `enableFormula`, `enableTable` and `language` forwarded as query parameters.
     * @returns The `data` member of the API response.
     * @throws {Error} If the id is empty, the request fails, or the API answers with a non-zero `code`.
     */
    async getTaskResult(taskId, options) {
        if (!taskId) {
            throw new Error('MinerU getTaskResult requires a task id');
        }
        const url = this.buildApiUrl('extract', 'task', taskId);
        const params = {};
        if (options?.enableFormula !== undefined) {
            params.enable_formula = options.enableFormula;
        }
        if (options?.enableTable !== undefined) {
            params.enable_table = options.enableTable;
        }
        if (options?.language) {
            params.language = options.language;
        }
        return this.requestOfficial('getTaskResult', () => axios.get(url, { headers: this.getOfficialHeaders(), params }));
    }

    /**
     * Query batch task results. Only supported for official MinerU deployments.
     *
     * @param batchId Batch identifier returned by `createBatchTask`; must be non-empty.
     * @returns The `data` member of the API response.
     * @throws {Error} On a self-hosted deployment, an empty id, or an API failure.
     */
    async getBatchResult(batchId) {
        this.ensureOfficial('getBatchResult');
        if (!batchId) {
            throw new Error('MinerU getBatchResult requires a batch id');
        }
        const url = this.buildApiUrl('extract-results', 'batch', batchId);
        return this.requestOfficial('getBatchResult', () => axios.get(url, { headers: this.getOfficialHeaders() }));
    }

    /**
     * Wait for a task to complete and return the result when available.
     *
     * Polls `getTaskResult` every `intervalMs` until the result carries a download URL / content or
     * reports a done state. A result reporting a failed state aborts immediately instead of
     * polling until the timeout.
     *
     * @param taskId Official task identifier.
     * @param timeoutMs Maximum time to keep polling (default five minutes).
     * @param intervalMs Delay between polls (default five seconds).
     * @returns The first result considered complete.
     * @throws {Error} On self-hosted deployments, on a failed task, or when the timeout elapses.
     */
    async waitForTask(taskId, timeoutMs = 5 * 60 * 1000, intervalMs = 5000) {
        if (this.serverType === 'self-hosted') {
            throw new Error('waitForTask is not supported for self-hosted MinerU deployments');
        }
        const start = Date.now();
        while (true) {
            const result = await this.getTaskResult(taskId);
            this.logger.debug(`MinerU waiting task result: ${JSON.stringify(result)}`);
            // The progress field is read under both names; the official API documents `state`.
            const progress = result?.state ?? result?.status;
            if (result?.full_zip_url || result?.full_url || result?.content || progress === 'done') {
                return result;
            }
            if (progress === 'failed') {
                throw new Error(`MinerU task ${taskId} failed: ${result?.err_msg || 'unknown error'}`);
            }
            if (Date.now() - start > timeoutMs) {
                throw new Error(`MinerU waitForTask timeout after ${timeoutMs} ms`);
            }
            await new Promise((resolve) => setTimeout(resolve, intervalMs));
        }
    }

    /**
     * Guard for features that only exist on the official API.
     * @param feature Name used in the error message.
     * @throws {Error} When the client is not configured as `official`.
     */
    ensureOfficial(feature) {
        if (this.serverType !== 'official') {
            throw new Error(`${feature} is only supported for official MinerU deployments`);
        }
    }

    /**
     * Decide the deployment flavour: an explicit integration option wins, then the
     * configuration value (case-insensitive); anything else falls back to `official`.
     */
    resolveServerType(integration) {
        const integrationType = this.readIntegrationOptions(integration)?.serverType;
        if (integrationType === 'self-hosted' || integrationType === 'official') {
            return integrationType;
        }
        const envValue = this.configService.get(ENV_MINERU_SERVER_TYPE)?.toLowerCase();
        return envValue === 'self-hosted' ? 'self-hosted' : 'official';
    }

    /**
     * Resolve base URL and token. Integration options take precedence over configuration values;
     * only official deployments fall back to the built-in default base URL.
     * Reads `this.serverType`, so it must run after `resolveServerType`.
     *
     * @returns {{baseUrl: string|null|undefined, token: string|undefined}}
     */
    resolveCredentials(integration) {
        const options = this.readIntegrationOptions(integration);
        // The same configuration keys are used for both deployment flavours.
        const baseUrl = options?.apiUrl ||
            this.configService.get(ENV_MINERU_API_BASE_URL) ||
            (this.serverType === 'official' ? DEFAULT_OFFICIAL_BASE_URL : null);
        const token = options?.apiKey || this.configService.get(ENV_MINERU_API_TOKEN);
        return { baseUrl, token };
    }

    /** Return `integration.options`, or `undefined` when absent or falsy. */
    readIntegrationOptions(integration) {
        return integration?.options || undefined;
    }

    /** Strip trailing slashes so path segments can be appended with a single `/`. */
    normalizeBaseUrl(url) {
        return url.replace(/\/+$/, '');
    }

    /**
     * Join path segments onto the base URL. Falsy segments are skipped, every segment is
     * coerced to a string (so numeric ids work) and stripped of leading/trailing slashes.
     * Segments are not URL-encoded; a segment may itself contain `/`.
     */
    buildApiUrl(...segments) {
        const path = segments
            .filter(Boolean)
            .map((segment) => String(segment).replace(/^\/+|\/+$/g, ''))
            .join('/');
        return path ? `${this.baseUrl}/${path}` : this.baseUrl;
    }

    /** Headers for the official API: JSON content type plus bearer token. */
    getOfficialHeaders() {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.token}`,
        };
    }

    /** Headers for a self-hosted server; the bearer token is only sent when one is configured. */
    getSelfHostedHeaders() {
        const headers = {
            accept: 'application/json',
        };
        if (this.token) {
            headers.Authorization = `Bearer ${this.token}`;
        }
        return headers;
    }

    /** Map the per-document options (`url`, `isOcr`, `dataId`, `pageRanges`) to the API field names. */
    buildOfficialFileEntry(source) {
        const entry = { url: source.url };
        if (source.isOcr !== undefined) {
            entry.is_ocr = source.isOcr;
        }
        if (source.dataId) {
            entry.data_id = source.dataId;
        }
        if (source.pageRanges) {
            entry.page_ranges = source.pageRanges;
        }
        return entry;
    }

    /** Map the options shared by single and batch task creation to the API field names. */
    buildOfficialSharedOptions(options) {
        const shared = {};
        if (options.enableFormula !== undefined) {
            shared.enable_formula = options.enableFormula;
        }
        if (options.enableTable !== undefined) {
            shared.enable_table = options.enableTable;
        }
        if (options.language) {
            shared.language = options.language;
        }
        if (options.modelVersion) {
            shared.model_version = options.modelVersion;
        }
        if (options.extraFormats) {
            shared.extra_formats = options.extraFormats;
        }
        if (options.callbackUrl) {
            shared.callback = options.callbackUrl;
        }
        if (options.seed) {
            shared.seed = options.seed;
        }
        return shared;
    }

    /**
     * Run one official API request, unwrap its `{ code, msg, data }` envelope, and log + rethrow failures.
     *
     * @param operation Name used in the error and log messages.
     * @param send Callback performing the axios request.
     * @returns The `data` member of the envelope.
     * @throws {Error} When the request fails or `code` is not `0`.
     */
    async requestOfficial(operation, send) {
        try {
            const resp = await send();
            const envelope = resp.data;
            if (envelope?.code !== 0) {
                throw new Error(`MinerU ${operation} failed: ${envelope?.msg}`);
            }
            return envelope.data;
        }
        catch (err) {
            this.logger.error(`${operation} error`, err instanceof Error ? err.stack : err);
            throw err;
        }
    }

    /** Create a single extraction task on the official API and return its `task_id`. */
    async createOfficialTask(options) {
        const url = this.buildApiUrl('extract', 'task');
        const body = {
            ...this.buildOfficialFileEntry(options),
            ...this.buildOfficialSharedOptions(options),
        };
        const data = await this.requestOfficial('createTask', () => axios.post(url, body, { headers: this.getOfficialHeaders() }));
        return { taskId: data.task_id };
    }

    /**
     * Parse a local file on the self-hosted server and cache the result under a fresh UUID.
     *
     * @param options Reads `filePath` (resolved through the granted file system), `fileName` and `url`.
     * @returns {Promise<{taskId: string}>} The locally generated task id.
     * @throws {Error} When no file-system permission was granted or the parse fails.
     */
    async createSelfHostedTask(options) {
        const fileSystem = this.fileSystem;
        if (!fileSystem) {
            // Without this guard the failure would be an opaque TypeError on `undefined.fullPath`.
            throw new Error('MinerU self-hosted parsing requires file system permission');
        }
        const filePath = fileSystem.fullPath(options.filePath);
        const taskId = randomUUID();
        const result = await this.invokeSelfHostedParse(filePath, options.fileName, options);
        this.localTasks.set(taskId, { ...result, sourceUrl: options.url });
        return { taskId };
    }

    /**
     * Upload a file to the self-hosted `file_parse` endpoint using form fields for the options.
     * When the server answers like the legacy API (see `isSelfHostedApiV1`) the upload is retried
     * through `invokeSelfHostedParseV1`.
     *
     * @throws {BadRequestException} When the server answers 400.
     * @throws {Error} For any other non-200 status.
     */
    async invokeSelfHostedParse(filePath, fileName, options) {
        const parseUrl = this.buildApiUrl('file_parse');
        const form = new FormData();
        form.append('files', fs.createReadStream(filePath), {
            filename: fileName,
        });
        form.append('parse_method', options.parseMethod ?? 'auto');
        form.append('return_md', 'true');
        form.append('return_model_output', 'false');
        form.append('return_content_list', 'true');
        // NOTE: `lang_list` is currently not sent; `buildLanguageList` is kept for that purpose.
        form.append('return_images', 'true');
        form.append('backend', options.backend ?? options.modelVersion ?? 'pipeline');
        form.append('formula_enable', this.booleanToString(options.enableFormula ?? true));
        form.append('table_enable', this.booleanToString(options.enableTable ?? true));
        form.append('return_middle_json', this.booleanToString(options.returnMiddleJson ?? false));
        if (options.serverUrl) {
            form.append('server_url', options.serverUrl);
        }
        const headers = {
            ...this.getSelfHostedHeaders(),
            ...form.getHeaders(),
        };
        const response = await axios.post(parseUrl, form, {
            headers,
            maxBodyLength: Infinity,
            // Status codes are inspected below, so axios must not throw on them.
            validateStatus: () => true,
        });
        if (this.isSelfHostedApiV1(response)) {
            return this.invokeSelfHostedParseV1(filePath, fileName, options);
        }
        if (response.status === 400) {
            throw new BadRequestException(`MinerU self-hosted parse failed: ${response.status} ${getErrorMessage(response.data)}`);
        }
        if (response.status !== 200) {
            const failure = `MinerU self-hosted parse failed: ${response.status} ${response.statusText}`;
            this.logger.error(failure, typeof response.data === 'string' ? response.data : JSON.stringify(response.data));
            throw new Error(failure);
        }
        return this.normalizeSelfHostedResponse(response.data);
    }

    /**
     * Upload a file to the legacy self-hosted API, which takes its options as query parameters.
     * @throws {Error} For any non-200 status; failures are logged before being rethrown.
     */
    async invokeSelfHostedParseV1(filePath, fileName, options) {
        const parseUrl = this.buildApiUrl('file_parse');
        const form = new FormData();
        form.append('files', fs.createReadStream(filePath), {
            filename: fileName,
        });
        const params = {
            parse_method: options.parseMethod ?? 'auto',
            return_layout: false,
            return_info: false,
            return_content_list: true,
            return_images: true,
        };
        const headers = {
            ...this.getSelfHostedHeaders(),
            ...form.getHeaders(),
        };
        try {
            const response = await axios.post(parseUrl, form, {
                headers,
                params,
                maxBodyLength: Infinity,
                validateStatus: () => true,
            });
            if (response.status !== 200) {
                throw new Error(`MinerU self-hosted legacy parse failed: ${response.status} ${response.statusText}`);
            }
            return this.normalizeSelfHostedResponse(response.data);
        }
        catch (error) {
            this.logger.error('invokeSelfHostedParseV1 error', error instanceof Error ? error.stack : error);
            throw error;
        }
    }

    /**
     * Detect the legacy self-hosted API: a 422 response whose `detail` array reports the
     * body field `file` as missing.
     */
    isSelfHostedApiV1(response) {
        if (response.status !== 422) {
            return false;
        }
        const detail = response.data?.detail;
        if (!Array.isArray(detail)) {
            return false;
        }
        return detail.some((item) => {
            const loc = item?.loc;
            return item?.type === 'missing' && Array.isArray(loc) && loc[0] === 'body' && loc[1] === 'file';
        });
    }

    /**
     * Normalize a self-hosted response. When the payload has a `results` object, its first entry is
     * used (keyed by file name); otherwise the payload itself is treated as the file result.
     * @throws {Error} When the payload is empty.
     */
    normalizeSelfHostedResponse(payload) {
        if (!payload) {
            throw new Error('MinerU self-hosted parse returned empty payload');
        }
        if (payload.results && typeof payload.results === 'object') {
            const [firstKey] = Object.keys(payload.results);
            if (firstKey) {
                return this.normalizeSelfHostedFileResult(payload.results[firstKey], firstKey);
            }
        }
        return this.normalizeSelfHostedFileResult(payload);
    }

    /** Convert one raw file result into `{ mdContent, contentList, images, raw, fileName }`. */
    normalizeSelfHostedFileResult(result, fileName) {
        const mdContent = result?.md_content ?? '';
        const contentList = this.parseJsonSafe(result?.content_list);
        const images = this.normalizeImageMap(result?.images);
        return {
            mdContent,
            contentList,
            images,
            raw: result,
            fileName,
        };
    }

    /** Turn a `{ name: dataUrl }` map into a list of `{ name, dataUrl }`; a missing map yields `[]`. */
    normalizeImageMap(map) {
        if (!map) {
            return [];
        }
        return Object.entries(map).map(([name, dataUrl]) => ({ name, dataUrl }));
    }

    /**
     * Parse a JSON string; non-string values are returned unchanged, and an unparsable string is
     * returned as-is after logging a warning.
     */
    parseJsonSafe(value) {
        if (typeof value !== 'string') {
            return value;
        }
        try {
            return JSON.parse(value);
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            this.logger.warn(`Failed to parse MinerU content_list JSON: ${message}`);
            return value;
        }
    }

    /** Wrap a language code into a list; a missing or `auto` language yields `['zh']`. */
    buildLanguageList(language) {
        if (!language || language === 'auto') {
            return ['zh'];
        }
        return [language];
    }

    /** Serialize a truthy/falsy value as the form-field strings `'true'` / `'false'`. */
    booleanToString(value) {
        return value ? 'true' : 'false';
    }

    /**
     * Download a document into memory.
     *
     * @param url Location of the document.
     * @returns {Promise<{buffer: Buffer, fileName: string, contentType: string}>}
     * @throws Rethrows any download error after logging it.
     */
    async downloadFile(url) {
        try {
            const response = await axios.get(url, { responseType: 'arraybuffer' });
            const buffer = Buffer.from(response.data);
            const contentType = response.headers['content-type'];
            const contentDisposition = response.headers['content-disposition'];
            return {
                buffer,
                fileName: this.extractFileName(url, contentDisposition),
                contentType,
            };
        }
        catch (error) {
            this.logger.error(`Failed to download file for MinerU from ${url}`, error instanceof Error ? error.stack : error);
            throw error;
        }
    }

    /**
     * Derive a file name for a downloaded document.
     *
     * Order of preference: the extended `filename*=UTF-8''…` parameter of Content-Disposition
     * (preferred over the plain form regardless of their order in the header), the plain `filename=`
     * parameter, the last path segment of the source URL, and finally a generated
     * `mineru-<timestamp>.pdf`.
     *
     * @param sourceUrl URL the document was fetched from.
     * @param contentDisposition Raw Content-Disposition header value, if any.
     * @returns {string} The derived file name.
     */
    extractFileName(sourceUrl, contentDisposition) {
        const header = contentDisposition || '';
        const extendedMatch = /filename\*=UTF-8''([^;]+)/i.exec(header);
        const plainMatch = /filename="?([^";]+)"?/i.exec(header);
        const encodedName = (extendedMatch?.[1] || plainMatch?.[1] || '').trim();
        if (encodedName) {
            try {
                return decodeURIComponent(encodedName);
            }
            catch {
                // Not valid percent-encoding (e.g. a literal `%`): use the raw value.
                return encodedName;
            }
        }
        try {
            const pathname = new URL(sourceUrl).pathname;
            const name = basename(pathname);
            if (name) {
                return name;
            }
        }
        catch {
            // Ignore URL parsing errors and fallback to default
        }
        return `mineru-${Date.now()}.pdf`;
    }

    /** Fetch `openapi.json` from the self-hosted server; resolves to the raw axios response. */
    getSelfHostedOpenApiSpec() {
        const url = this.buildApiUrl('openapi.json');
        return axios.get(url, { headers: this.getSelfHostedHeaders() });
    }

    /**
     * Probe the official API with a dummy task id to check base URL and token.
     *
     * The probe is accepted only when the HTTP status is 200 and the envelope code is `-60012`.
     * NOTE(review): `-60012` is presumably the "task not found" code returned once the
     * credentials were accepted — confirm against the MinerU error-code table.
     *
     * @throws {BadRequestException} On any other status or code.
     */
    async validateOfficialApiToken() {
        const url = this.buildApiUrl('/extract/task/xxxxxxx');
        const response = await axios.get(url, {
            headers: this.getOfficialHeaders(),
            // Let non-2xx answers reach the status check below instead of throwing inside axios.
            validateStatus: () => true,
        });
        if (response.status !== 200) {
            throw new BadRequestException(`MinerU official API token validation failed: ${getErrorMessage(response.data)}`);
        }
        if (response.data?.code !== -60012) {
            throw new BadRequestException('MinerU official Base URL or API token is invalid');
        }
    }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { type IIntegration, LanguagesEnum } from '@metad/contracts';
|
|
2
|
+
/**
 * Type declaration of the controller that exposes the MinerU connection test.
 */
export declare class MinerUController {
    /** Integration strategy whose `validateConfig` is called by `connect`. */
    private readonly minerUIntegrationStrategy;
    /**
     * Validate the options of the given integration.
     *
     * @param integration Integration whose `options` are validated.
     * @param languageCode Language used to pick the localized error text.
     * @returns Resolves to `{ success: true }` when validation passes; validation failures surface as an exception.
     */
    connect(integration: IIntegration, languageCode: LanguagesEnum): Promise<{
        success: boolean;
    }>;
    /** Builds the (localized) message carried by the exception thrown from `connect`. */
    private formatErrorMessage;
}
|
|
9
|
+
//# sourceMappingURL=mineru.controller.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.controller.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.controller.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,YAAY,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAA;AAKnE,qBACa,gBAAgB;IAG5B,OAAO,CAAC,QAAQ,CAAC,yBAAyB,CAA2B;IAI/D,OAAO,CAAS,WAAW,EAAE,YAAY,EAAc,YAAY,EAAE,aAAa;;;IAWxF,OAAO,CAAC,kBAAkB;CAW1B"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { __decorate, __metadata, __param } from "tslib";
|
|
2
|
+
import { LanguagesEnum } from '@metad/contracts';
|
|
3
|
+
import { BadRequestException, Body, Controller, Inject, Post } from '@nestjs/common';
|
|
4
|
+
import { I18nLang } from 'nestjs-i18n';
|
|
5
|
+
import { MinerUIntegrationStrategy } from './integration.strategy.js';
|
|
6
|
+
let MinerUController = class MinerUController {
    /**
     * Connection test: runs the integration strategy's config validation against
     * `integration.options`.
     *
     * @param integration Integration payload taken from the request body.
     * @param languageCode Language used for the localized part of the error text.
     * @returns {Promise<{success: boolean}>} `{ success: true }` when validation passes.
     * @throws {BadRequestException} Wrapping any validation failure.
     */
    async connect(integration, languageCode) {
        try {
            await this.minerUIntegrationStrategy.validateConfig(integration.options);
            return { success: true };
        }
        catch (failure) {
            console.error(failure);
            const message = this.formatErrorMessage(languageCode, failure);
            throw new BadRequestException(message);
        }
    }

    /**
     * Compose "<localized headline> <error detail>", leaving out whichever part is empty.
     * Languages without a translation get a generic English headline.
     */
    formatErrorMessage(languageCode, error) {
        const headlines = {
            [LanguagesEnum.English]: 'Failed to connect to MinerU. Please verify your API configuration.',
            [LanguagesEnum.SimplifiedChinese]: '无法连接到 MinerU。请检查您的 API 配置。',
        };
        const headline = headlines[languageCode] ?? 'Failed to connect to MinerU.';
        let reason;
        if (error instanceof Error) {
            reason = error.message;
        }
        else {
            reason = String(error);
        }
        const parts = [];
        for (const part of [headline, reason]) {
            if (part) {
                parts.push(part);
            }
        }
        return parts.join(' ');
    }
};
|
|
26
|
+
__decorate([
|
|
27
|
+
Inject(MinerUIntegrationStrategy),
|
|
28
|
+
__metadata("design:type", MinerUIntegrationStrategy)
|
|
29
|
+
], MinerUController.prototype, "minerUIntegrationStrategy", void 0);
|
|
30
|
+
__decorate([
|
|
31
|
+
Post('test'),
|
|
32
|
+
__param(0, Body()),
|
|
33
|
+
__param(1, I18nLang()),
|
|
34
|
+
__metadata("design:type", Function),
|
|
35
|
+
__metadata("design:paramtypes", [Object, String]),
|
|
36
|
+
__metadata("design:returntype", Promise)
|
|
37
|
+
], MinerUController.prototype, "connect", null);
|
|
38
|
+
MinerUController = __decorate([
|
|
39
|
+
Controller('mineru')
|
|
40
|
+
], MinerUController);
|
|
41
|
+
export { MinerUController };
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { IOnPluginBootstrap, IOnPluginDestroy } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
/**
 * Type declaration of the MinerU plugin entry class with its bootstrap and destroy lifecycle hooks.
 */
export declare class MinerUPlugin implements IOnPluginBootstrap, IOnPluginDestroy {
    /** Flag checked by both lifecycle hooks before they write their console message. */
    private logEnabled;
    /**
     * Called when the plugin is being initialized.
     */
    onPluginBootstrap(): void | Promise<void>;
    /**
     * Called when the plugin is being destroyed.
     */
    onPluginDestroy(): void | Promise<void>;
}
|
|
13
|
+
//# sourceMappingURL=mineru.plugin.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.plugin.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.plugin.ts"],"names":[],"mappings":"AACA,OAAO,EAAqB,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAQ/F,qBAkBa,YAAa,YAAW,kBAAkB,EAAE,gBAAgB;IAExE,OAAO,CAAC,UAAU,CAAQ;IAE1B;;OAEG;IACH,iBAAiB,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzC;;OAEG;IACH,eAAe,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;CAKvC"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
var MinerUPlugin_1;
|
|
2
|
+
import { __decorate } from "tslib";
|
|
3
|
+
import chalk from 'chalk';
|
|
4
|
+
import { XpertServerPlugin } from '@xpert-ai/plugin-sdk';
|
|
5
|
+
import { ConfigModule } from '@nestjs/config';
|
|
6
|
+
import { MinerUTransformerStrategy } from './transformer-mineru.strategy.js';
|
|
7
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
|
+
import { MinerUIntegrationStrategy } from './integration.strategy.js';
|
|
9
|
+
import { MinerUController } from './mineru.controller.js';
|
|
10
|
+
import { MinerUToolsetStrategy } from './mineru-toolset.strategy.js';
|
|
11
|
+
let MinerUPlugin = MinerUPlugin_1 = class MinerUPlugin {
    constructor() {
        // Toggle for the lifecycle console messages. It is switched on here, so both hooks log.
        this.logEnabled = true;
    }

    /**
     * Lifecycle hook invoked while the plugin is being initialized.
     * Prints a green console notice when logging is enabled.
     */
    onPluginBootstrap() {
        if (!this.logEnabled) {
            return;
        }
        const notice = `${MinerUPlugin_1.name} is being bootstrapped...`;
        console.log(chalk.green(notice));
    }

    /**
     * Lifecycle hook invoked while the plugin is being destroyed.
     * Prints a green console notice when logging is enabled.
     */
    onPluginDestroy() {
        if (!this.logEnabled) {
            return;
        }
        const notice = `${MinerUPlugin_1.name} is being destroyed...`;
        console.log(chalk.green(notice));
    }
};
|
|
33
|
+
MinerUPlugin = MinerUPlugin_1 = __decorate([
|
|
34
|
+
XpertServerPlugin({
|
|
35
|
+
/**
|
|
36
|
+
* An array of modules that will be imported and registered with the plugin.
|
|
37
|
+
*/
|
|
38
|
+
imports: [
|
|
39
|
+
ConfigModule,
|
|
40
|
+
],
|
|
41
|
+
providers: [
|
|
42
|
+
MinerUIntegrationStrategy,
|
|
43
|
+
MinerUTransformerStrategy,
|
|
44
|
+
MinerUResultParserService,
|
|
45
|
+
MinerUToolsetStrategy,
|
|
46
|
+
],
|
|
47
|
+
controllers: [
|
|
48
|
+
MinerUController
|
|
49
|
+
]
|
|
50
|
+
})
|
|
51
|
+
], MinerUPlugin);
|
|
52
|
+
export { MinerUPlugin };
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
+
import { MinerUIntegrationOptions } from './types.js';
|
|
6
|
+
/**
 * Default parsing settings for MinerU tool.
 *
 * Every field is optional. The names mirror the tool's input fields `is_ocr`,
 * `enable_formula`, `enable_table`, `language` and `model_version`.
 * NOTE(review): presumably each default is used when the corresponding tool input is omitted —
 * confirm in the tool implementation.
 */
export interface MinerUToolDefaults {
    /** Default for OCR processing. */
    isOcr?: boolean;
    /** Default for formula handling. */
    enableFormula?: boolean;
    /** Default for table handling. */
    enableTable?: boolean;
    /** Default document language code; only `en` and `ch` are accepted. */
    language?: 'en' | 'ch';
    /** Default model/backend selection; only `pipeline` and `vlm` are accepted. */
    modelVersion?: 'pipeline' | 'vlm';
}
|
|
16
|
+
/**
 * Build MinerU PDF parser tool.
 * This tool converts PDF files to markdown format using MinerU service.
 *
 * The returned LangChain `DynamicStructuredTool` accepts an object with:
 *  - `doc_url` (string): the document to parse;
 *  - `token`, `url`, `server_type` (optional, nullable): per-call connection settings, where
 *    `server_type` is `official` or `self-hosted`;
 *  - `enable_formula`, `enable_table`, `is_ocr` (optional, nullable booleans);
 *  - `language` (`en` | `ch`) and `model_version` (`pipeline` | `vlm`), both optional and nullable.
 * Unknown keys are stripped by the schema.
 *
 * The tool output is an array whose items are either strings or objects of the form
 * `{ files, taskId, metadata }`.
 *
 * @param configService Nest configuration service.
 * @param resultParser Service used to parse MinerU results.
 * @param options Optional MinerU integration options.
 * @param fileSystem Optional file-system handle.
 * @param defaults Optional default parsing settings.
 */
export declare function buildMinerUTool(configService: ConfigService, resultParser: MinerUResultParserService, options?: MinerUIntegrationOptions, fileSystem?: XpFileSystem, defaults?: MinerUToolDefaults): import("@langchain/core/tools").DynamicStructuredTool<z.ZodObject<{
    doc_url: z.ZodString;
    token: z.ZodNullable<z.ZodOptional<z.ZodString>>;
    url: z.ZodNullable<z.ZodOptional<z.ZodString>>;
    server_type: z.ZodNullable<z.ZodOptional<z.ZodEnum<["official", "self-hosted"]>>>;
    enable_formula: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    enable_table: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    is_ocr: z.ZodNullable<z.ZodOptional<z.ZodBoolean>>;
    language: z.ZodNullable<z.ZodOptional<z.ZodEnum<["en", "ch"]>>>;
    model_version: z.ZodNullable<z.ZodOptional<z.ZodEnum<["pipeline", "vlm"]>>>;
}, "strip", z.ZodTypeAny, {
    token?: string;
    enable_formula?: boolean;
    enable_table?: boolean;
    language?: "en" | "ch";
    model_version?: "vlm" | "pipeline";
    url?: string;
    is_ocr?: boolean;
    doc_url?: string;
    server_type?: "official" | "self-hosted";
}, {
    token?: string;
    enable_formula?: boolean;
    enable_table?: boolean;
    language?: "en" | "ch";
    model_version?: "vlm" | "pipeline";
    url?: string;
    is_ocr?: boolean;
    doc_url?: string;
    server_type?: "official" | "self-hosted";
}>, {
    token?: string;
    enable_formula?: boolean;
    enable_table?: boolean;
    language?: "en" | "ch";
    model_version?: "vlm" | "pipeline";
    url?: string;
    is_ocr?: boolean;
    doc_url?: string;
    server_type?: "official" | "self-hosted";
}, {
    token?: string;
    enable_formula?: boolean;
    enable_table?: boolean;
    language?: "en" | "ch";
    model_version?: "vlm" | "pipeline";
    url?: string;
    is_ocr?: boolean;
    doc_url?: string;
    server_type?: "official" | "self-hosted";
}, (string | {
    files: any[];
    taskId: string;
    metadata: any;
})[]>;
|
|
75
|
+
//# sourceMappingURL=mineru.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.tool.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.tool.ts"],"names":[],"mappings":"AAEA,OAAO,EAAmB,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACrE,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAqB,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEzE;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAC7B,aAAa,EAAE,aAAa,EAC5B,YAAY,EAAE,yBAAyB,EACvC,OAAO,CAAC,EAAE,wBAAwB,EAClC,UAAU,CAAC,EAAE,YAAY,EACzB,QAAQ,CAAC,EAAE,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MA2K9B"}
|