@helloxiaohu/plugin-mineru 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +39 -0
- package/dist/lib/integration.strategy.d.ts +10 -0
- package/dist/lib/integration.strategy.d.ts.map +1 -0
- package/dist/lib/integration.strategy.js +118 -0
- package/dist/lib/mineru-toolset.strategy.d.ts +69 -0
- package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -0
- package/dist/lib/mineru-toolset.strategy.js +109 -0
- package/dist/lib/mineru.client.d.ts +120 -0
- package/dist/lib/mineru.client.d.ts.map +1 -0
- package/dist/lib/mineru.client.js +456 -0
- package/dist/lib/mineru.controller.d.ts +9 -0
- package/dist/lib/mineru.controller.d.ts.map +1 -0
- package/dist/lib/mineru.controller.js +41 -0
- package/dist/lib/mineru.plugin.d.ts +13 -0
- package/dist/lib/mineru.plugin.d.ts.map +1 -0
- package/dist/lib/mineru.plugin.js +52 -0
- package/dist/lib/mineru.tool.d.ts +61 -0
- package/dist/lib/mineru.tool.d.ts.map +1 -0
- package/dist/lib/mineru.tool.js +132 -0
- package/dist/lib/mineru.toolset.d.ts +40 -0
- package/dist/lib/mineru.toolset.d.ts.map +1 -0
- package/dist/lib/mineru.toolset.js +47 -0
- package/dist/lib/path-meta.d.ts +5 -0
- package/dist/lib/path-meta.d.ts.map +1 -0
- package/dist/lib/path-meta.js +8 -0
- package/dist/lib/result-parser.service.d.ts +18 -0
- package/dist/lib/result-parser.service.d.ts.map +1 -0
- package/dist/lib/result-parser.service.js +142 -0
- package/dist/lib/transformer-mineru.strategy.d.ts +95 -0
- package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -0
- package/dist/lib/transformer-mineru.strategy.js +163 -0
- package/dist/lib/types.d.ts +40 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +27 -0
- package/package.json +60 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import { tool } from '@langchain/core/tools';
|
|
2
|
+
import { getCurrentTaskInput } from '@langchain/langgraph';
|
|
3
|
+
import { getErrorMessage } from '@xpert-ai/plugin-sdk';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import { MinerUClient } from './mineru.client.js';
|
|
6
|
+
/** Name used when neither the URL nor the local path yields a file name. */
const DEFAULT_PDF_NAME = 'document.pdf';

/** Extension assumed for an asset whose name carries no extension at all. */
const DEFAULT_ASSET_EXTENSION = 'md';

/** MIME types keyed by lower-case extension for the assets this tool reports. */
const ASSET_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
    ['md', 'text/markdown'],
    ['json', 'application/json'],
    ['pdf', 'application/pdf'],
]);

/**
 * Derive a file name from the tool input when the caller did not supply one.
 * A URL wins over a local path; an unparsable URL yields the default name.
 */
function resolveInputFileName(fileUrl, filePath) {
    if (fileUrl) {
        try {
            return new URL(fileUrl).pathname.split('/').pop() || DEFAULT_PDF_NAME;
        }
        catch {
            // Not an absolute URL, so there is no pathname to take a name from.
            return DEFAULT_PDF_NAME;
        }
    }
    if (filePath) {
        return filePath.split(/[/\\]/).pop() || DEFAULT_PDF_NAME;
    }
    return DEFAULT_PDF_NAME;
}

/**
 * Convert one parsed asset (`type`, `url`, `filePath`) into a file artifact
 * with a display name, extension and MIME type.
 */
function toFileArtifact(asset) {
    const nameFromPath = asset.filePath?.split(/[/\\]/).pop();
    // A URL-derived name must not carry the query string or fragment.
    const nameFromUrl = asset.url?.split('/').pop()?.replace(/[?#].*$/, '');
    const artifactName = nameFromPath || nameFromUrl || 'file';
    // Only text after the last dot is an extension; a dot-less name has none.
    const dotIndex = artifactName.lastIndexOf('.');
    const extension = (dotIndex >= 0 ? artifactName.slice(dotIndex + 1).toLowerCase() : '') || DEFAULT_ASSET_EXTENSION;
    const knownMimeType = ASSET_MIME_TYPES.get(extension);
    let mimeType;
    if (asset.type === 'image') {
        // Images always report an image type; unknown extensions keep the JPEG fallback.
        mimeType = knownMimeType?.startsWith('image/') ? knownMimeType : 'image/jpeg';
    }
    else {
        mimeType = knownMimeType ?? 'application/octet-stream';
    }
    return {
        fileName: artifactName,
        filePath: asset.filePath,
        fileUrl: asset.url,
        mimeType,
        extension,
    };
}

/**
 * Build the MinerU PDF parser tool.
 *
 * The tool creates a MinerU task for a PDF given by URL or local path, parses
 * the result through `resultParser`, and returns a `[content, artifact]` pair:
 * a text summary with the first 1000 characters of the markdown, and an object
 * holding the file artifacts, the task id and the parser metadata.
 *
 * @param configService Passed to the `MinerUClient` constructor.
 * @param resultParser Provides `parseLocalTask` and `parseFromUrl`.
 * @param integration Passed to the `MinerUClient` constructor.
 * @param fileSystem Passed to the client and to the result parser.
 * @returns The tool created by `tool(...)`, named `mineru_pdf_parser`.
 * @throws {Error} Every failure is rethrown as `MinerU processing failed: …`
 *   with the original error attached as `cause`.
 */
export function buildMinerUTool(configService, resultParser, integration, fileSystem) {
    return tool(async (input) => {
        try {
            const { fileUrl, filePath, fileName, isOcr, enableFormula, enableTable, language, modelVersion } = input;
            if (!fileUrl && !filePath) {
                throw new Error('Either fileUrl or filePath must be provided');
            }
            // Workspace folder comes from the current task state, with a fixed fallback.
            const currentState = getCurrentTaskInput();
            const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
            const mineruClient = new MinerUClient(configService, {
                fileSystem,
                integration,
            });
            const finalFileName = fileName || resolveInputFileName(fileUrl, filePath);
            const { taskId } = await mineruClient.createTask({
                url: fileUrl,
                filePath: filePath,
                fileName: finalFileName,
                isOcr: isOcr ?? true,
                enableFormula: enableFormula ?? true,
                enableTable: enableTable ?? true,
                language: language ?? 'ch',
                modelVersion: modelVersion ?? 'pipeline',
            });
            const documentInfo = {
                fileUrl,
                filePath,
                name: finalFileName,
                folder: workspacePath,
            };
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                // Self-hosted: the task result is read back right after creation.
                const taskResult = mineruClient.getSelfHostedTask(taskId);
                if (!taskResult) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(taskResult, taskId, documentInfo, fileSystem);
            }
            else {
                // Official API: poll every 5 s for up to 5 minutes.
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                if (!result?.full_zip_url) {
                    // Fail here instead of handing an undefined URL to the downloader.
                    throw new Error('MinerU task finished without a result archive URL');
                }
                parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, documentInfo, fileSystem);
            }
            // Only file and image assets are reported as artifacts.
            const fileArtifacts = (parsedResult.metadata?.assets ?? [])
                .filter((asset) => asset.type === 'file' || asset.type === 'image')
                .map(toFileArtifact);
            const markdownContent = parsedResult.chunks
                ?.map((chunk) => chunk.pageContent)
                .join('\n\n') || '';
            return [
                `PDF processed successfully by MinerU.\n\nTask ID: ${taskId}\n\nMarkdown Content:\n${markdownContent.substring(0, 1000)}${markdownContent.length > 1000 ? '...' : ''}`,
                {
                    files: fileArtifacts,
                    taskId,
                    metadata: parsedResult.metadata,
                },
            ];
        }
        catch (error) {
            // Keep the original error reachable for callers that inspect `cause`.
            throw new Error(`MinerU processing failed: ${getErrorMessage(error)}`, { cause: error });
        }
    }, {
        name: 'mineru_pdf_parser',
        description: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files (images, JSON, etc.).',
        schema: z.object({
            fileUrl: z.string().optional().nullable().describe('URL of the PDF file to process'),
            filePath: z.string().optional().nullable().describe('Local file path of the PDF file'),
            fileName: z.string().optional().nullable().describe('Name of the PDF file'),
            isOcr: z.boolean().optional().nullable().describe('Enable OCR for image-based PDFs (default: true)'),
            enableFormula: z.boolean().optional().nullable().describe('Enable formula recognition (default: true)'),
            enableTable: z.boolean().optional().nullable().describe('Enable table recognition (default: true)'),
            language: z.enum(['en', 'ch']).optional().nullable().describe('Document language: "en" for English, "ch" for Chinese (default: "ch")'),
            modelVersion: z.enum(['pipeline', 'vlm']).optional().nullable().describe('Model version: "pipeline" or "vlm" (default: "pipeline")'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
|
|
2
|
+
import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
5
|
+
import { IIntegration } from '@metad/contracts';
|
|
6
|
+
import { MinerUIntegrationOptions } from './types.js';
|
|
7
|
+
/**
 * Dependencies and integration settings accepted by the MinerU toolset.
 * Every member is optional at the type level.
 */
export interface MinerUToolsetConfig {
    /** MinerU integration record; may be partially populated. */
    integration?: Partial<IIntegration<MinerUIntegrationOptions>>;
    /** File system handle from the plugin SDK. */
    fileSystem?: XpFileSystem;
    /** NestJS configuration service. */
    configService?: ConfigService;
    /** Parser service for MinerU results. */
    resultParser?: MinerUResultParserService;
}
|
|
16
|
+
/**
 * Toolset exposing the MinerU PDF-to-markdown conversion tool.
 */
export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
    private readonly config;

    /**
     * Create the toolset from a config carrying credentials and dependencies.
     *
     * @param config Integration settings and services used by the toolset.
     */
    constructor(config: MinerUToolsetConfig);

    /**
     * Check that the credentials needed by the MinerU toolset are present.
     *
     * @param credentials Config object to validate.
     */
    _validateCredentials(credentials: MinerUToolsetConfig): Promise<void>;

    /**
     * Create the tools provided by this toolset.
     *
     * @returns The list of initialized tools.
     */
    initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>;
}
|
|
40
|
+
//# sourceMappingURL=mineru.toolset.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAChD,OAAO,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEtD;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,WAAW,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,wBAAwB,CAAC,CAAC,CAAC;IAC9D,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;CAC1C;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C;;;;;OAKG;gBACS,MAAM,EAAE,mBAAmB;IAKvC;;;OAGG;IACY,oBAAoB,CAAC,WAAW,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAYpF;;;OAGG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CAezF"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
|
|
2
|
+
import { buildMinerUTool } from './mineru.tool.js';
|
|
3
|
+
/**
 * MinerU toolset.
 * Exposes a single PDF-to-markdown conversion tool built by `buildMinerUTool`.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * @param config Object carrying `integration`, `fileSystem`,
     *   `configService` and `resultParser`. It is forwarded to the base class
     *   as the third constructor argument (the second is passed as `undefined`)
     *   and also kept on `this.config`.
     */
    constructor(config) {
        super('mineru', undefined, config);
        this.config = config;
    }

    /**
     * Verify that the integration and both services are present.
     *
     * @param credentials Config object to check; a nullish value is reported
     *   as a missing integration rather than failing on property access.
     * @throws {Error} Naming the first missing member.
     */
    async _validateCredentials(credentials) {
        if (!credentials?.integration) {
            throw new Error('MinerU integration is required');
        }
        if (!credentials.configService) {
            throw new Error('ConfigService is required');
        }
        if (!credentials.resultParser) {
            throw new Error('MinerUResultParserService is required');
        }
    }

    /**
     * Build the PDF parser tool and store it on `this.tools`.
     *
     * @returns The array assigned to `this.tools`.
     * @throws {Error} When `configService` or `resultParser` is missing,
     *   including when the toolset was constructed without a config.
     */
    async initTools() {
        // A missing config must hit the descriptive error below, not a
        // destructuring TypeError.
        const { configService, resultParser, integration, fileSystem } = this.config ?? {};
        if (!configService || !resultParser) {
            throw new Error('ConfigService and MinerUResultParserService are required');
        }
        this.tools = [
            buildMinerUTool(configService, resultParser, integration, fileSystem),
        ];
        return this.tools;
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"path-meta.d.ts","sourceRoot":"","sources":["../../src/lib/path-meta.ts"],"names":[],"mappings":"AAGA,wBAAgB,aAAa,CAAC,IAAI,CAAC,EAAE,UAAU;;;EAK9C"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { dirname } from 'path';
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
/**
 * Return `__filename` / `__dirname` equivalents for the running module system.
 *
 * When the CommonJS globals (`require`, `__filename`, `__dirname`) are all
 * available they are returned as-is; they are the globals of the module that
 * defines this function. Otherwise the values are derived from `meta.url`.
 *
 * @param meta Object with a `url` file URL string, e.g. `import.meta`.
 *   Needed only when the CommonJS globals are unavailable.
 * @returns `{ __filename, __dirname }`.
 * @throws {TypeError} When the CommonJS globals are unavailable and `meta.url`
 *   is not a string.
 */
export function getModuleMeta(meta) {
    // Check all three globals: a `require` shim can exist in an ES module
    // where `__filename` / `__dirname` are still undefined.
    const hasCommonJsGlobals = typeof require !== 'undefined'
        && typeof __filename !== 'undefined'
        && typeof __dirname !== 'undefined';
    if (hasCommonJsGlobals) {
        return { __filename, __dirname };
    }
    if (typeof meta?.url !== 'string') {
        throw new TypeError('getModuleMeta requires import.meta when running as an ES module');
    }
    const filename = fileURLToPath(meta.url);
    return { __filename: filename, __dirname: dirname(filename) };
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { Document } from '@langchain/core/documents';
|
|
2
|
+
import { IKnowledgeDocument } from '@metad/contracts';
|
|
3
|
+
import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
4
|
+
import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
|
|
5
|
+
/**
 * Service that turns MinerU output into document chunks plus metadata.
 */
export declare class MinerUResultParserService {
    private readonly logger;

    /**
     * Parse a MinerU result archive referenced by URL.
     *
     * @param fullZipUrl URL of the result zip archive.
     * @param taskId MinerU task identifier.
     * @param document Document the result belongs to.
     * @param fileSystem File system used to store extracted files.
     * @returns The chunks and metadata, with an optional document id.
     */
    parseFromUrl(
        fullZipUrl: string,
        taskId: string,
        document: Partial<IKnowledgeDocument>,
        fileSystem: XpFileSystem
    ): Promise<{
        id?: string;
        chunks: Document<ChunkMetadata>[];
        metadata: MinerUDocumentMetadata;
    }>;

    /**
     * Parse the result of a self-hosted MinerU task.
     *
     * @param result Task result returned by the self-hosted server.
     * @param taskId MinerU task identifier.
     * @param document Document the result belongs to.
     * @param fileSystem File system used to store extracted files.
     * @returns The chunks and metadata, with an optional document id.
     */
    parseLocalTask(
        result: MineruSelfHostedTaskResult,
        taskId: string,
        document: Partial<IKnowledgeDocument>,
        fileSystem: XpFileSystem
    ): Promise<{
        id?: string;
        chunks: Document<ChunkMetadata>[];
        metadata: MinerUDocumentMetadata;
    }>;
}
|
|
18
|
+
//# sourceMappingURL=result-parser.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAqFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
var MinerUResultParserService_1;
|
|
2
|
+
import { __decorate } from "tslib";
|
|
3
|
+
import { Document } from '@langchain/core/documents';
|
|
4
|
+
import { Injectable, Logger } from '@nestjs/common';
|
|
5
|
+
import axios from 'axios';
|
|
6
|
+
import { join } from 'path';
|
|
7
|
+
import unzipper from 'unzipper';
|
|
8
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
9
|
+
import { MinerU, } from './types.js';
|
|
10
|
+
/**
 * Matches a markdown image whose target is under `images/`.
 * The alt text excludes `]` so several images on one line match separately.
 */
const MINERU_IMAGE_LINK = /!\[([^\]]*)\]\((images\/[^)]+)\)/g;

/** Point every `images/…` link at its stored URL; unknown targets stay untouched. */
function rewriteImageLinks(markdown, urlByRelativePath) {
    return markdown.replace(MINERU_IMAGE_LINK, (match, altText, relativePath) => {
        const storedUrl = urlByRelativePath.get(relativePath);
        return storedUrl ? `![${altText}](${storedUrl})` : match;
    });
}

/** True when a zip entry path has a `..` segment and could leave the target folder. */
function escapesTargetFolder(entryPath) {
    return entryPath.split(/[/\\]/).includes('..');
}

/**
 * Parses MinerU output into a single markdown chunk plus metadata. Extracted
 * files are written through the supplied file system, and image links in the
 * markdown are rewritten to the URLs returned by `fileSystem.writeFile`.
 */
let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResultParserService {
    constructor() {
        this.logger = new Logger(MinerUResultParserService_1.name);
    }

    /**
     * Download a MinerU result archive and parse it.
     *
     * @param fullZipUrl URL of the result zip archive.
     * @param taskId MinerU task id, copied into the metadata.
     * @param document Its `folder` (or '' when unset) is the write target.
     * @param fileSystem Each archive file is written with `writeFile(path, data)`.
     * @returns `{ chunks, metadata }` where `chunks` holds one document with the
     *   rewritten `full.md` content and `metadata.assets` lists the stored
     *   images, `layout.json`, `content_list.json` and `full.md`.
     */
    async parseFromUrl(fullZipUrl, taskId, document, fileSystem) {
        this.logger.debug(`Downloading MinerU result from: ${fullZipUrl}`);
        // 1. Download the zip file to memory
        const response = await axios.get(fullZipUrl, {
            responseType: 'arraybuffer',
        });
        const zipBuffer = Buffer.from(response.data);
        const metadata = {
            parser: MinerU,
            taskId,
        };
        // 2. Unzip the archive and store every file
        const assets = [];
        const pathMap = new Map();
        const directory = await unzipper.Open.buffer(zipBuffer);
        let fullMd = '';
        for (const entry of directory.files) {
            if (entry.type !== 'File') {
                continue;
            }
            const fileName = entry.path;
            if (escapesTargetFolder(fileName)) {
                // Entry names come from a downloaded archive; never follow `..`.
                this.logger.warn(`Skipping MinerU archive entry with unsafe path: ${fileName}`);
                continue;
            }
            const data = await entry.buffer();
            const filePath = join(document.folder || '', fileName);
            const url = await fileSystem.writeFile(filePath, data);
            pathMap.set(fileName, url);
            if (fileName.startsWith('images/')) {
                assets.push({ type: 'image', url, filePath });
            }
            else if (fileName.endsWith('layout.json')) {
                const layoutJson = JSON.parse(data.toString('utf-8'));
                metadata.mineruBackend = layoutJson?._backend;
                metadata.mineruVersion = layoutJson?._version_name;
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('content_list.json')) {
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('full.md')) {
                fullMd = data.toString('utf-8');
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('origin.pdf')) {
                // Record the stored URL, matching parseLocalTask which stores a URL here.
                metadata.originPdfUrl = url;
            }
        }
        metadata.assets = assets;
        // 3. Replace image relative paths in full.md with the stored URLs
        fullMd = rewriteImageLinks(fullMd, pathMap);
        const chunks = [
            new Document({
                pageContent: fullMd,
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }

    /**
     * Parse the result of a self-hosted MinerU task.
     *
     * @param result Reads `images` (entries with `name` and `dataUrl`),
     *   `mdContent`, `sourceUrl` and `fileName`. Missing `images` or
     *   `mdContent` are treated as empty.
     * @param taskId MinerU task id, copied into the metadata.
     * @param document Its `folder` (or '' when unset) is the write target.
     * @param fileSystem Each image is written with `writeFile(path, data)`.
     * @returns `{ chunks, metadata }` with one chunk holding the rewritten markdown.
     */
    async parseLocalTask(result, taskId, document, fileSystem) {
        const metadata = {
            parser: MinerU,
            taskId,
        };
        const assets = [];
        const pathMap = new Map();
        for (const image of result.images ?? []) {
            const filePath = join(document.folder || '', 'images', image.name);
            // The payload follows the first comma of the data URL; a string
            // without a comma is decoded as a whole.
            const base64Payload = image.dataUrl.slice(image.dataUrl.indexOf(',') + 1);
            const url = await fileSystem.writeFile(filePath, Buffer.from(base64Payload, 'base64'));
            pathMap.set(`images/${image.name}`, url);
            assets.push({ type: 'image', url, filePath });
        }
        if (result.sourceUrl) {
            assets.push({
                type: 'file',
                url: result.sourceUrl,
                filePath: join(document.folder || '', result.fileName || 'source.pdf'),
            });
            metadata.originPdfUrl = result.sourceUrl;
        }
        metadata.assets = assets;
        // Replace image relative paths in the markdown with the stored URLs
        const fullMd = rewriteImageLinks(result.mdContent ?? '', pathMap);
        const chunks = [
            new Document({
                pageContent: fullMd,
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }
};
MinerUResultParserService = MinerUResultParserService_1 = __decorate([
    Injectable()
], MinerUResultParserService);
|
|
142
|
+
export { MinerUResultParserService };
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { IconType, IKnowledgeDocument } from '@metad/contracts';
|
|
2
|
+
import { ChunkMetadata, FileSystemPermission, IDocumentTransformerStrategy, IntegrationPermission } from '@xpert-ai/plugin-sdk';
|
|
3
|
+
import { TMinerUTransformerConfig } from './types.js';
|
|
4
|
+
/**
 * Document transformer strategy backed by MinerU.
 */
export declare class MinerUTransformerStrategy implements IDocumentTransformerStrategy<TMinerUTransformerConfig> {
    private readonly resultParser;
    private readonly configService;

    /** Permissions requested by the strategy (integration and file system). */
    readonly permissions: (IntegrationPermission | FileSystemPermission)[];

    /** Descriptive metadata: localized labels, icon, help link and config schema. */
    readonly meta: {
        name: string;
        label: {
            en_US: string;
            zh_Hans: string;
        };
        description: {
            en_US: string;
            zh_Hans: string;
        };
        icon: {
            type: IconType;
            value: string;
            color: string;
        };
        helpUrl: string;
        /** Schema describing the user-configurable options. */
        configSchema: {
            type: string;
            properties: {
                isOcr: {
                    type: string;
                    title: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    description: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    default: boolean;
                };
                enableFormula: {
                    type: string;
                    title: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    description: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    default: boolean;
                };
                enableTable: {
                    type: string;
                    title: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    description: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    default: boolean;
                };
                language: {
                    type: string;
                    title: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    description: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    default: string;
                };
                modelVersion: {
                    type: string;
                    title: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    description: {
                        en_US: string;
                        zh_Hans: string;
                    };
                    enum: string[];
                    default: string;
                };
            };
            required: any[];
        };
    };

    /**
     * Validate a transformer configuration.
     *
     * @param config Configuration to check.
     */
    validateConfig(config: any): Promise<void>;

    /**
     * Transform documents with MinerU.
     *
     * @param documents Documents to convert.
     * @param config Transformer configuration.
     * @returns One parsed result per input document.
     */
    transformDocuments(
        documents: Partial<IKnowledgeDocument>[],
        config: TMinerUTransformerConfig
    ): Promise<Partial<IKnowledgeDocument<ChunkMetadata>>[]>;
}
|
|
95
|
+
//# sourceMappingURL=transformer-mineru.strategy.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAAgB,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAEnE,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAsDzD"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { __decorate, __metadata } from "tslib";
|
|
2
|
+
import { forwardRef, Inject, Injectable } from '@nestjs/common';
|
|
3
|
+
import { ConfigService } from '@nestjs/config';
|
|
4
|
+
import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
|
|
5
|
+
import { isNil, omitBy, pick } from 'lodash-es';
|
|
6
|
+
import { MinerUClient } from './mineru.client.js';
|
|
7
|
+
import { MinerUResultParserService } from './result-parser.service.js';
|
|
8
|
+
import { icon, MinerU } from './types.js';
|
|
9
|
+
/**
 * Document transformer strategy that converts documents through MinerU.
 *
 * `resultParser` and `configService` are populated by the property decorators
 * applied after the class body.
 */
let MinerUTransformerStrategy = class MinerUTransformerStrategy {
    constructor() {
        this.permissions = [
            {
                type: 'integration',
                service: MinerU,
                description: 'Access to MinerU system integrations'
            },
            {
                type: 'filesystem',
                operations: ['read', 'write', 'list'],
                scope: []
            }
        ];
        this.meta = {
            name: MinerU,
            label: {
                en_US: 'MinerU',
                zh_Hans: 'MinerU'
            },
            description: {
                en_US: 'A high-quality tool for convert PDF to Markdown and JSON.',
                zh_Hans: '一站式开源高质量数据提取工具,将PDF转换成Markdown和JSON格式。'
            },
            icon: {
                type: 'svg',
                value: icon,
                color: '#14b8a6'
            },
            helpUrl: 'https://mineru.net/apiManage/docs',
            configSchema: {
                type: 'object',
                properties: {
                    isOcr: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable OCR',
                            zh_Hans: '启用 OCR'
                        },
                        description: {
                            en_US: 'Enable OCR for image-based PDFs.',
                            zh_Hans: '对基于图像的 PDF 启用 OCR。'
                        },
                        default: true
                    },
                    enableFormula: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Formula Recognition',
                            zh_Hans: '启用公式识别'
                        },
                        description: {
                            en_US: 'Enable recognition of mathematical formulas in documents.',
                            zh_Hans: '启用对文档中数学公式的识别。'
                        },
                        default: true
                    },
                    enableTable: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Table Recognition',
                            zh_Hans: '启用表格识别'
                        },
                        description: {
                            en_US: 'Enable recognition of tables in documents.',
                            zh_Hans: '启用对文档中表格的识别。'
                        },
                        default: true
                    },
                    language: {
                        type: 'string',
                        title: {
                            en_US: 'Document Language',
                            zh_Hans: '文档语言'
                        },
                        description: {
                            en_US: 'The primary language of the document (e.g., "en" for English, "ch" for Chinese).',
                            zh_Hans: '文档的主要语言(例如,英文为 "en",中文为 "ch")。'
                        },
                        default: 'ch'
                    },
                    modelVersion: {
                        type: 'string',
                        title: {
                            en_US: 'Model Version',
                            zh_Hans: '模型版本'
                        },
                        description: {
                            en_US: 'The model version to use for extraction (e.g., "vlm" or "pipeline").',
                            zh_Hans: '用于提取的模型版本(例如,“vlm”或“pipeline”)。'
                        },
                        enum: ['pipeline', 'vlm'],
                        default: 'pipeline'
                    }
                },
                required: []
            }
        };
    }

    /**
     * Check a configuration against `meta.configSchema`.
     *
     * Each option that is present (not null/undefined) must have the schema's
     * primitive type and, where the schema lists an `enum`, be one of its
     * values. Unknown keys are ignored and a nullish config is accepted,
     * because the schema marks nothing as required.
     *
     * @param config Candidate configuration.
     * @returns A promise that resolves when the config is acceptable.
     * @throws {Error} (as a rejection) naming the first offending option.
     */
    async validateConfig(config) {
        if (isNil(config)) {
            return;
        }
        for (const [key, schema] of Object.entries(this.meta.configSchema.properties)) {
            const value = config[key];
            if (isNil(value)) {
                continue;
            }
            // Schema types here are 'boolean' / 'string', which match `typeof` results.
            if (typeof value !== schema.type) {
                throw new Error(`Invalid MinerU config: "${key}" must be of type ${schema.type}`);
            }
            if (schema.enum && !schema.enum.includes(value)) {
                throw new Error(`Invalid MinerU config: "${key}" must be one of: ${schema.enum.join(', ')}`);
            }
        }
    }

    /**
     * Convert each document through MinerU and parse the result.
     *
     * For a self-hosted server the task result is read back immediately; for
     * any other server type the task is polled every 5 s for up to 5 minutes
     * and the result archive is parsed. User options from `config` override
     * the built-in defaults on both paths.
     *
     * @param documents Documents providing `fileUrl`, `filePath`, `name`, `id`.
     * @param config Transformer options plus `permissions`, which is passed to
     *   the MinerU client and supplies the `fileSystem` given to the parser.
     * @returns One parsed result per document, each tagged with the document id.
     * @throws {Error} When a task yields no result or no result archive URL.
     */
    async transformDocuments(documents, config) {
        const mineru = new MinerUClient(this.configService, config?.permissions);
        const fileSystem = config?.permissions?.fileSystem;
        // Only explicitly set options override the defaults.
        const overrides = omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion']), isNil);
        const parsedResults = [];
        for (const document of documents ?? []) {
            let parsedResult;
            if (mineru.serverType === 'self-hosted') {
                // No language / modelVersion defaults are forced on this path;
                // they are sent only when the user configured them.
                const { taskId } = await mineru.createTask({
                    url: document.fileUrl,
                    filePath: document.filePath,
                    fileName: document.name,
                    isOcr: true,
                    enableFormula: true,
                    enableTable: true,
                    ...overrides
                });
                const result = mineru.getSelfHostedTask(taskId);
                if (!result) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, fileSystem);
            }
            else {
                // NOTE(review): the fallback modelVersion here is 'vlm' while
                // configSchema advertises 'pipeline' as the default — confirm
                // which one is intended before aligning them.
                const { taskId } = await mineru.createTask({
                    url: document.fileUrl,
                    isOcr: true,
                    enableFormula: true,
                    enableTable: true,
                    language: 'ch',
                    modelVersion: 'vlm',
                    ...overrides
                });
                // Waiting for completion
                const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
                if (!result?.full_zip_url) {
                    throw new Error('MinerU task finished without a result archive URL');
                }
                parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, fileSystem);
            }
            parsedResult.id = document.id;
            parsedResults.push(parsedResult);
        }
        return parsedResults;
    }
};
__decorate([
    Inject(MinerUResultParserService),
    __metadata("design:type", MinerUResultParserService)
], MinerUTransformerStrategy.prototype, "resultParser", void 0);
__decorate([
    Inject(forwardRef(() => ConfigService)),
    __metadata("design:type", ConfigService)
], MinerUTransformerStrategy.prototype, "configService", void 0);
MinerUTransformerStrategy = __decorate([
    Injectable(),
    DocumentTransformerStrategy(MinerU)
], MinerUTransformerStrategy);
|
|
163
|
+
export { MinerUTransformerStrategy };
|