@chenchaolong/plugin-mineru 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +101 -0
  2. package/dist/index.d.ts +6 -0
  3. package/dist/index.d.ts.map +1 -0
  4. package/dist/index.js +39 -0
  5. package/dist/lib/integration.strategy.d.ts +10 -0
  6. package/dist/lib/integration.strategy.d.ts.map +1 -0
  7. package/dist/lib/integration.strategy.js +118 -0
  8. package/dist/lib/mineru-toolset.d.ts +10 -0
  9. package/dist/lib/mineru-toolset.d.ts.map +1 -0
  10. package/dist/lib/mineru-toolset.js +23 -0
  11. package/dist/lib/mineru-toolset.strategy.d.ts +34 -0
  12. package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -0
  13. package/dist/lib/mineru-toolset.strategy.js +58 -0
  14. package/dist/lib/mineru.client.d.ts +120 -0
  15. package/dist/lib/mineru.client.d.ts.map +1 -0
  16. package/dist/lib/mineru.client.js +456 -0
  17. package/dist/lib/mineru.controller.d.ts +9 -0
  18. package/dist/lib/mineru.controller.d.ts.map +1 -0
  19. package/dist/lib/mineru.controller.js +41 -0
  20. package/dist/lib/mineru.plugin.d.ts +13 -0
  21. package/dist/lib/mineru.plugin.d.ts.map +1 -0
  22. package/dist/lib/mineru.plugin.js +52 -0
  23. package/dist/lib/path-meta.d.ts +5 -0
  24. package/dist/lib/path-meta.d.ts.map +1 -0
  25. package/dist/lib/path-meta.js +8 -0
  26. package/dist/lib/pdf-to-markdown.tool.d.ts +90 -0
  27. package/dist/lib/pdf-to-markdown.tool.d.ts.map +1 -0
  28. package/dist/lib/pdf-to-markdown.tool.js +146 -0
  29. package/dist/lib/result-parser.service.d.ts +18 -0
  30. package/dist/lib/result-parser.service.d.ts.map +1 -0
  31. package/dist/lib/result-parser.service.js +142 -0
  32. package/dist/lib/transformer-mineru.strategy.d.ts +95 -0
  33. package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -0
  34. package/dist/lib/transformer-mineru.strategy.js +163 -0
  35. package/dist/lib/types.d.ts +40 -0
  36. package/dist/lib/types.d.ts.map +1 -0
  37. package/dist/lib/types.js +27 -0
  38. package/package.json +50 -0
@@ -0,0 +1,90 @@
1
+ import { z } from 'zod';
2
+ import { ConfigService } from '@nestjs/config';
3
+ import { MinerUResultParserService } from './result-parser.service.js';
4
/**
 * Builds the `pdf_to_markdown` LangChain structured tool.
 *
 * @param configService - NestJS config service; passed to the MinerU client the tool creates on each call.
 * @param resultParser - Parser that turns a MinerU task result into Markdown chunks and stored assets.
 * @returns A `DynamicStructuredTool`. Its input is `{ file, isOcr?, enableFormula?, enableTable?, language?, modelVersion? }`,
 *          where `file` carries one of `content`, `filePath` or `fileUrl`. Its output is an array holding
 *          a status string and a `{ files }` object describing the generated files.
 */
+ export declare function buildPdfToMarkdownTool(configService: ConfigService, resultParser: MinerUResultParserService): import("@langchain/core/tools").DynamicStructuredTool<z.ZodObject<{
5
+ file: z.ZodObject<{
6
+ name: z.ZodOptional<z.ZodString>;
7
+ filename: z.ZodOptional<z.ZodString>;
8
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodType<Buffer<ArrayBufferLike>, z.ZodTypeDef, Buffer<ArrayBufferLike>>, z.ZodType<Uint8Array<ArrayBuffer>, z.ZodTypeDef, Uint8Array<ArrayBuffer>>]>>;
9
+ filePath: z.ZodOptional<z.ZodString>;
10
+ fileUrl: z.ZodOptional<z.ZodString>;
11
+ }, "strip", z.ZodTypeAny, {
12
+ name?: string;
13
+ filename?: string;
14
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
15
+ filePath?: string;
16
+ fileUrl?: string;
17
+ }, {
18
+ name?: string;
19
+ filename?: string;
20
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
21
+ filePath?: string;
22
+ fileUrl?: string;
23
+ }>;
24
+ isOcr: z.ZodOptional<z.ZodBoolean>;
25
+ enableFormula: z.ZodOptional<z.ZodBoolean>;
26
+ enableTable: z.ZodOptional<z.ZodBoolean>;
27
+ language: z.ZodOptional<z.ZodEnum<["en", "ch"]>>;
28
+ modelVersion: z.ZodOptional<z.ZodEnum<["pipeline", "vlm"]>>;
29
+ }, "strip", z.ZodTypeAny, {
30
+ file?: {
31
+ name?: string;
32
+ filename?: string;
33
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
34
+ filePath?: string;
35
+ fileUrl?: string;
36
+ };
37
+ isOcr?: boolean;
38
+ enableFormula?: boolean;
39
+ enableTable?: boolean;
40
+ language?: "en" | "ch";
41
+ modelVersion?: "vlm" | "pipeline";
42
+ }, {
43
+ file?: {
44
+ name?: string;
45
+ filename?: string;
46
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
47
+ filePath?: string;
48
+ fileUrl?: string;
49
+ };
50
+ isOcr?: boolean;
51
+ enableFormula?: boolean;
52
+ enableTable?: boolean;
53
+ language?: "en" | "ch";
54
+ modelVersion?: "vlm" | "pipeline";
55
+ }>, {
56
+ file?: {
57
+ name?: string;
58
+ filename?: string;
59
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
60
+ filePath?: string;
61
+ fileUrl?: string;
62
+ };
63
+ isOcr?: boolean;
64
+ enableFormula?: boolean;
65
+ enableTable?: boolean;
66
+ language?: "en" | "ch";
67
+ modelVersion?: "vlm" | "pipeline";
68
+ }, {
69
+ file?: {
70
+ name?: string;
71
+ filename?: string;
72
+ content?: string | Uint8Array<ArrayBuffer> | Buffer<ArrayBufferLike>;
73
+ filePath?: string;
74
+ fileUrl?: string;
75
+ };
76
+ isOcr?: boolean;
77
+ enableFormula?: boolean;
78
+ enableTable?: boolean;
79
+ language?: "en" | "ch";
80
+ modelVersion?: "vlm" | "pipeline";
81
+ }, (string | {
82
+ files: {
83
+ mimeType: string;
84
+ fileName: string;
85
+ filePath: string;
86
+ fileUrl: string;
87
+ extension: string;
88
+ }[];
89
+ })[]>;
90
+ //# sourceMappingURL=pdf-to-markdown.tool.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-to-markdown.tool.d.ts","sourceRoot":"","sources":["../../src/lib/pdf-to-markdown.tool.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAIvE,wBAAgB,sBAAsB,CACpC,aAAa,EAAE,aAAa,EAC5B,YAAY,EAAE,yBAAyB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAqKxC"}
@@ -0,0 +1,146 @@
1
+ import { tool } from '@langchain/core/tools';
2
+ import { getCurrentTaskInput } from '@langchain/langgraph';
3
+ import { getErrorMessage } from '@xpert-ai/plugin-sdk';
4
+ import { z } from 'zod';
5
+ import { MinerUClient } from './mineru.client.js';
6
/** Name used when neither the caller nor the source location yields a usable file name. */
const FALLBACK_PDF_NAME = 'document.pdf';

/** File extension -> MIME type for the asset kinds the result parser stores. */
const ASSET_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['json', 'application/json'],
    ['md', 'text/markdown'],
    ['pdf', 'application/pdf'],
]);

/**
 * Reduce a caller-supplied name or path to its last path segment so the value
 * cannot escape the `mineru-input/` and `mineru-output/` folders it is joined to.
 */
function toSafeFileName(candidate) {
    const lastSegment = String(candidate ?? '').split(/[\\/]/).pop();
    return lastSegment && lastSegment !== '.' && lastSegment !== '..' ? lastSegment : FALLBACK_PDF_NAME;
}

/** Extract the last path segment of a URL, without query string or fragment. */
function fileNameFromUrl(fileUrl) {
    let path;
    try {
        path = new URL(fileUrl).pathname;
    }
    catch {
        // Not an absolute URL: strip query/fragment by hand.
        path = fileUrl.split(/[?#]/)[0];
    }
    const lastSegment = path.split('/').pop() || '';
    try {
        return decodeURIComponent(lastSegment);
    }
    catch {
        // Malformed percent-encoding: keep the raw segment.
        return lastSegment;
    }
}

/** Derive the Markdown output name: a trailing `.pdf` is dropped, `.md` is always appended. */
function toMarkdownFileName(fileName) {
    return `${fileName.replace(/\.pdf$/i, '')}.md`;
}

/**
 * Describe a parsed asset for the tool artifact. The MIME type and extension come
 * from the stored file's extension; unknown extensions keep the previous labels
 * (`image/png` for images, `application/json` for everything else).
 */
function describeAsset(asset, fileSystem) {
    const isImage = asset.type === 'image';
    const match = /\.([a-z0-9]+)$/i.exec(asset.filePath ?? '');
    const detected = match ? match[1].toLowerCase() : undefined;
    const known = detected !== undefined && ASSET_MIME_TYPES.has(detected);
    return {
        mimeType: known ? ASSET_MIME_TYPES.get(detected) : isImage ? 'image/png' : 'application/json',
        fileName: asset.filePath,
        filePath: fileSystem.fullPath(asset.filePath),
        fileUrl: asset.url,
        extension: known ? detected : isImage ? 'png' : 'json',
    };
}

/**
 * Build the `pdf_to_markdown` tool.
 *
 * The tool accepts a PDF as inline content (base64 string, Buffer or Uint8Array),
 * as a workspace `filePath`, or as a `fileUrl`. It submits the file to MinerU,
 * parses the result with `resultParser`, writes the Markdown to
 * `mineru-output/<name>.md` and returns `[message, { files }]`.
 *
 * @param configService Config service handed to the MinerU client.
 * @param resultParser Parser exposing `parseLocalTask` and `parseFromUrl`.
 * @returns The tool created by `tool(...)` with `responseFormat: 'content_and_artifact'`.
 * @throws Error (from the tool callback) prefixed with "Error converting PDF to Markdown:"
 *         when no file is given, file-system permission is missing, or any step fails.
 */
export function buildPdfToMarkdownTool(configService, resultParser) {
    return tool(async (input) => {
        try {
            const { file, isOcr, enableFormula, enableTable, language, modelVersion } = input;
            if (!file) {
                throw new Error('No file provided');
            }
            // Permissions are injected into the current graph state under `sys`.
            const currentState = getCurrentTaskInput();
            const permissions = currentState?.[`sys`]?.['permissions'];
            if (!permissions?.fileSystem) {
                throw new Error('File system permission is required for MinerU tool');
            }
            const requestedName = file.name || file.filename;
            // Resolve the PDF bytes and a display name from whichever source was given.
            let fileContent;
            let fileName;
            let filePath;
            let fileUrl;
            if (file.content) {
                if (typeof file.content === 'string') {
                    // Strings are treated as base64.
                    fileContent = Buffer.from(file.content, 'base64');
                }
                else if (Buffer.isBuffer(file.content)) {
                    fileContent = file.content;
                }
                else if (file.content instanceof Uint8Array) {
                    fileContent = Buffer.from(file.content);
                }
                else {
                    throw new Error('Invalid file content format');
                }
                fileName = toSafeFileName(requestedName);
            }
            else if (file.filePath) {
                filePath = file.filePath;
                fileContent = await permissions.fileSystem.readFile(filePath);
                fileName = toSafeFileName(requestedName || filePath);
            }
            else if (file.fileUrl) {
                fileUrl = file.fileUrl;
                const response = await fetch(fileUrl);
                if (!response.ok) {
                    throw new Error(`Failed to download file from URL: ${response.statusText}`);
                }
                fileContent = Buffer.from(await response.arrayBuffer());
                fileName = toSafeFileName(requestedName || fileNameFromUrl(fileUrl));
            }
            else {
                throw new Error('File must provide content, filePath, or fileUrl');
            }
            // Inline and downloaded files are stored in the workspace first; the
            // stored copy's URL is what gets submitted to MinerU.
            if (!filePath) {
                filePath = `mineru-input/${fileName}`;
                fileUrl = await permissions.fileSystem.writeFile(filePath, fileContent);
            }
            const mineruClient = new MinerUClient(configService, {
                fileSystem: permissions.fileSystem,
                integration: permissions.integration,
            });
            const { taskId } = await mineruClient.createTask({
                url: fileUrl || file.fileUrl,
                filePath: filePath,
                fileName: fileName,
                isOcr: isOcr ?? true,
                enableFormula: enableFormula ?? true,
                enableTable: enableTable ?? true,
                language: language || 'ch',
                modelVersion: modelVersion || 'pipeline',
            });
            const outputTarget = { folder: 'mineru-output', name: fileName };
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                // Self-hosted results are read back directly after task creation.
                const result = mineruClient.getSelfHostedTask(taskId);
                if (!result) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(result, taskId, outputTarget, permissions.fileSystem);
            }
            else {
                // Hosted API: poll every 5 s, for at most 5 minutes.
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, outputTarget, permissions.fileSystem);
            }
            const markdownContent = parsedResult.chunks[0]?.pageContent || '';
            const outputFileName = toMarkdownFileName(fileName);
            const outputPath = `mineru-output/${outputFileName}`;
            const outputUrl = await permissions.fileSystem.writeFile(outputPath, Buffer.from(markdownContent, 'utf-8'));
            return [
                `Successfully converted PDF to Markdown: ${outputFileName}`,
                {
                    files: [
                        {
                            mimeType: 'text/markdown',
                            fileName: outputPath,
                            filePath: permissions.fileSystem.fullPath(outputPath),
                            fileUrl: outputUrl,
                            extension: 'md',
                        },
                        ...(parsedResult.metadata.assets || []).map((asset) => describeAsset(asset, permissions.fileSystem)),
                    ],
                },
            ];
        }
        catch (error) {
            throw new Error(`Error converting PDF to Markdown: ${getErrorMessage(error)}`);
        }
    }, {
        name: 'pdf_to_markdown',
        description: `Convert PDF file to Markdown format using MinerU. Supports OCR, formula recognition, and table extraction.`,
        schema: z.object({
            file: z.object({
                name: z.string().optional(),
                filename: z.string().optional(),
                content: z.union([z.string(), z.instanceof(Buffer), z.instanceof(Uint8Array)]).optional(),
                filePath: z.string().optional(),
                fileUrl: z.string().optional(),
            }),
            isOcr: z.boolean().optional().describe('Enable OCR for image-based PDFs'),
            enableFormula: z.boolean().optional().describe('Enable recognition of mathematical formulas'),
            enableTable: z.boolean().optional().describe('Enable recognition of tables'),
            language: z.enum(['en', 'ch']).optional().describe('Document language (en for English, ch for Chinese)'),
            modelVersion: z.enum(['pipeline', 'vlm']).optional().describe('MinerU model version'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
@@ -0,0 +1,18 @@
1
+ import { Document } from '@langchain/core/documents';
2
+ import { IKnowledgeDocument } from '@metad/contracts';
3
+ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
4
+ import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
5
/**
 * Converts MinerU task output into a Markdown chunk plus metadata about the
 * files that were stored through the supplied file system.
 */
+ export declare class MinerUResultParserService {
6
+ private readonly logger;
7
/**
 * Downloads the result archive at `fullZipUrl`, writes each file entry through
 * `fileSystem` under `document.folder`, and returns the `full.md` content as a
 * single chunk with `images/...` links replaced by the stored URLs.
 * `id` is not set by the parser itself.
 */
+ parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
8
+ id?: string;
9
+ chunks: Document<ChunkMetadata>[];
10
+ metadata: MinerUDocumentMetadata;
11
+ }>;
12
/**
 * Builds the same result shape from a self-hosted task result: decodes the
 * base64 data URLs in `result.images` into files under `<document.folder>/images`
 * and uses `result.mdContent` (with image links rewritten) as the chunk content.
 */
+ parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem: XpFileSystem): Promise<{
13
+ id?: string;
14
+ chunks: Document<ChunkMetadata>[];
15
+ metadata: MinerUDocumentMetadata;
16
+ }>;
17
+ }
18
+ //# sourceMappingURL=result-parser.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAqFI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CAkDH"}
@@ -0,0 +1,142 @@
1
+ var MinerUResultParserService_1;
2
+ import { __decorate } from "tslib";
3
+ import { Document } from '@langchain/core/documents';
4
+ import { Injectable, Logger } from '@nestjs/common';
5
+ import axios from 'axios';
6
+ import { join } from 'path';
7
+ import unzipper from 'unzipper';
8
+ import { v4 as uuidv4 } from 'uuid';
9
+ import { MinerU, } from './types.js';
10
/**
 * Replace relative `images/...` targets of Markdown image links with the URL
 * each image was stored under.
 *
 * The alt text is matched lazily so that several images on one line are each
 * rewritten; a greedy match would swallow every link but the last one.
 * Links whose path is not in `pathMap` are left untouched.
 *
 * @param markdown Markdown text to rewrite.
 * @param pathMap Map from relative path (e.g. `images/a.png`) to stored URL.
 * @returns The rewritten Markdown.
 */
function rewriteImageLinks(markdown, pathMap) {
    return markdown.replace(/!\[(.*?)\]\((images\/.+?)\)/g, (match, altText, relativePath) => {
        const storedUrl = pathMap.get(relativePath);
        return storedUrl ? `![${altText}](${storedUrl})` : match;
    });
}

/**
 * Converts MinerU task output (a result archive or a self-hosted task result)
 * into a single Markdown chunk plus metadata describing the stored files.
 */
let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResultParserService {
    constructor() {
        this.logger = new Logger(MinerUResultParserService_1.name);
    }
    /**
     * Download and unpack a MinerU result archive.
     *
     * Every file entry is written through `fileSystem` under `document.folder`.
     * Images, `layout.json`, `content_list.json` and `full.md` are recorded as assets.
     *
     * @param fullZipUrl URL of the result zip.
     * @param taskId MinerU task id, copied into the metadata.
     * @param document Target document; only `folder` is read here.
     * @param fileSystem File system whose `writeFile(path, data)` resolves to the stored URL.
     * @returns `{ chunks, metadata }` with one chunk holding the rewritten `full.md`.
     */
    async parseFromUrl(fullZipUrl, taskId, document, fileSystem) {
        this.logger.debug(`Downloading MinerU result from: ${fullZipUrl}`);
        // 1. Download the zip file to memory
        const response = await axios.get(fullZipUrl, {
            responseType: 'arraybuffer',
        });
        const zipBuffer = Buffer.from(response.data);
        const metadata = {
            parser: MinerU,
            taskId,
        };
        // 2. Unzip and store every file entry
        const assets = [];
        const directory = await unzipper.Open.buffer(zipBuffer);
        const pathMap = new Map();
        let fullMd = '';
        for (const entry of directory.files) {
            if (entry.type !== 'File') {
                continue;
            }
            const data = await entry.buffer();
            const fileName = entry.path;
            const filePath = join(document.folder || '', entry.path);
            const url = await fileSystem.writeFile(filePath, data);
            pathMap.set(fileName, url);
            if (fileName.startsWith('images/')) {
                assets.push({ type: 'image', url, filePath });
            }
            else if (fileName.endsWith('layout.json')) {
                const layoutJson = JSON.parse(data.toString('utf-8'));
                metadata.mineruBackend = layoutJson?._backend;
                metadata.mineruVersion = layoutJson?._version_name;
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('content_list.json')) {
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('full.md')) {
                fullMd = data.toString('utf-8');
                assets.push({ type: 'file', url, filePath });
            }
            else if (fileName.endsWith('origin.pdf')) {
                // Store the URL of the written copy, matching parseLocalTask,
                // which also stores a URL in this field.
                metadata.originPdfUrl = url;
            }
        }
        metadata.assets = assets;
        // 3. Replace image relative path in full.md with file url
        const chunks = [
            new Document({
                pageContent: rewriteImageLinks(fullMd, pathMap),
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }
    /**
     * Build the parse result from a self-hosted task result.
     *
     * Each entry of `result.images` is decoded from its base64 data URL and
     * written to `<document.folder>/images/<name>`. `result.sourceUrl`, when
     * present, is recorded as an asset and as `originPdfUrl`.
     *
     * @param result Self-hosted result; reads `images`, `sourceUrl`, `fileName`, `mdContent`.
     * @param taskId MinerU task id, copied into the metadata.
     * @param document Target document; only `folder` is read here.
     * @param fileSystem File system whose `writeFile(path, data)` resolves to the stored URL.
     * @returns `{ chunks, metadata }` with one chunk holding the rewritten Markdown.
     */
    async parseLocalTask(result, taskId, document, fileSystem) {
        const metadata = {
            parser: MinerU,
            taskId,
        };
        const assets = [];
        const pathMap = new Map();
        for (const image of result.images) {
            const filePath = join(document.folder || '', 'images', image.name);
            // A data URL is `<header>,<base64 payload>`; only the payload is stored.
            const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
            pathMap.set(`images/${image.name}`, url);
            assets.push({ type: 'image', url, filePath });
        }
        if (result.sourceUrl) {
            assets.push({
                type: 'file',
                url: result.sourceUrl,
                filePath: join(document.folder || '', result.fileName || 'source.pdf'),
            });
            metadata.originPdfUrl = result.sourceUrl;
        }
        metadata.assets = assets;
        const chunks = [
            new Document({
                pageContent: rewriteImageLinks(result.mdContent, pathMap),
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }
};
139
+ MinerUResultParserService = MinerUResultParserService_1 = __decorate([
140
+ Injectable()
141
+ ], MinerUResultParserService);
142
+ export { MinerUResultParserService };
@@ -0,0 +1,95 @@
1
+ import { IconType, IKnowledgeDocument } from '@metad/contracts';
2
+ import { ChunkMetadata, FileSystemPermission, IDocumentTransformerStrategy, IntegrationPermission } from '@xpert-ai/plugin-sdk';
3
+ import { TMinerUTransformerConfig } from './types.js';
4
/**
 * Document transformer strategy that sends knowledge documents to MinerU and
 * returns the parsed Markdown chunks. `permissions` lists the integration and
 * file-system access the strategy declares; `meta` carries its display labels,
 * icon, help URL and the JSON schema of the user-facing options.
 */
+ export declare class MinerUTransformerStrategy implements IDocumentTransformerStrategy<TMinerUTransformerConfig> {
5
+ private readonly resultParser;
6
+ private readonly configService;
7
+ readonly permissions: (IntegrationPermission | FileSystemPermission)[];
8
+ readonly meta: {
9
+ name: string;
10
+ label: {
11
+ en_US: string;
12
+ zh_Hans: string;
13
+ };
14
+ description: {
15
+ en_US: string;
16
+ zh_Hans: string;
17
+ };
18
+ icon: {
19
+ type: IconType;
20
+ value: string;
21
+ color: string;
22
+ };
23
+ helpUrl: string;
24
+ configSchema: {
25
+ type: string;
26
+ properties: {
27
+ isOcr: {
28
+ type: string;
29
+ title: {
30
+ en_US: string;
31
+ zh_Hans: string;
32
+ };
33
+ description: {
34
+ en_US: string;
35
+ zh_Hans: string;
36
+ };
37
+ default: boolean;
38
+ };
39
+ enableFormula: {
40
+ type: string;
41
+ title: {
42
+ en_US: string;
43
+ zh_Hans: string;
44
+ };
45
+ description: {
46
+ en_US: string;
47
+ zh_Hans: string;
48
+ };
49
+ default: boolean;
50
+ };
51
+ enableTable: {
52
+ type: string;
53
+ title: {
54
+ en_US: string;
55
+ zh_Hans: string;
56
+ };
57
+ description: {
58
+ en_US: string;
59
+ zh_Hans: string;
60
+ };
61
+ default: boolean;
62
+ };
63
+ language: {
64
+ type: string;
65
+ title: {
66
+ en_US: string;
67
+ zh_Hans: string;
68
+ };
69
+ description: {
70
+ en_US: string;
71
+ zh_Hans: string;
72
+ };
73
+ default: string;
74
+ };
75
+ modelVersion: {
76
+ type: string;
77
+ title: {
78
+ en_US: string;
79
+ zh_Hans: string;
80
+ };
81
+ description: {
82
+ en_US: string;
83
+ zh_Hans: string;
84
+ };
85
+ enum: string[];
86
+ default: string;
87
+ };
88
+ };
89
+ required: any[];
90
+ };
91
+ };
92
/** NOTE(review): the compiled implementation throws 'Method not implemented.' synchronously instead of returning a promise. */
+ validateConfig(config: any): Promise<void>;
93
/**
 * Runs each document through MinerU and resolves to one parsed result per
 * document, each carrying the source document's `id`.
 */
+ transformDocuments(documents: Partial<IKnowledgeDocument>[], config: TMinerUTransformerConfig): Promise<Partial<IKnowledgeDocument<ChunkMetadata>>[]>;
94
+ }
95
+ //# sourceMappingURL=transformer-mineru.strategy.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAAgB,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAEnE,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAsDzD"}
@@ -0,0 +1,163 @@
1
+ import { __decorate, __metadata } from "tslib";
2
+ import { forwardRef, Inject, Injectable } from '@nestjs/common';
3
+ import { ConfigService } from '@nestjs/config';
4
+ import { DocumentTransformerStrategy, } from '@xpert-ai/plugin-sdk';
5
+ import { isNil, omitBy, pick } from 'lodash-es';
6
+ import { MinerUClient } from './mineru.client.js';
7
+ import { MinerUResultParserService } from './result-parser.service.js';
8
+ import { icon, MinerU } from './types.js';
9
/**
 * Document transformer strategy that converts knowledge documents to Markdown
 * through MinerU (self-hosted or hosted API, as reported by the client's
 * `serverType`).
 *
 * `resultParser` and `configService` are injected as properties by the
 * decorators applied after the class definition.
 */
let MinerUTransformerStrategy = class MinerUTransformerStrategy {
    constructor() {
        // Access this strategy declares: the MinerU integration and the file system.
        this.permissions = [
            {
                type: 'integration',
                service: MinerU,
                description: 'Access to MinerU system integrations'
            },
            {
                type: 'filesystem',
                operations: ['read', 'write', 'list'],
                scope: []
            }
        ];
        // Display metadata and the JSON schema of the user-configurable options.
        this.meta = {
            name: MinerU,
            label: {
                en_US: 'MinerU',
                zh_Hans: 'MinerU'
            },
            description: {
                en_US: 'A high-quality tool for convert PDF to Markdown and JSON.',
                zh_Hans: '一站式开源高质量数据提取工具,将PDF转换成Markdown和JSON格式。'
            },
            icon: {
                type: 'svg',
                value: icon,
                color: '#14b8a6'
            },
            helpUrl: 'https://mineru.net/apiManage/docs',
            configSchema: {
                type: 'object',
                properties: {
                    isOcr: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable OCR',
                            zh_Hans: '启用 OCR'
                        },
                        description: {
                            en_US: 'Enable OCR for image-based PDFs.',
                            zh_Hans: '对基于图像的 PDF 启用 OCR。'
                        },
                        default: true
                    },
                    enableFormula: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Formula Recognition',
                            zh_Hans: '启用公式识别'
                        },
                        description: {
                            en_US: 'Enable recognition of mathematical formulas in documents.',
                            zh_Hans: '启用对文档中数学公式的识别。'
                        },
                        default: true
                    },
                    enableTable: {
                        type: 'boolean',
                        title: {
                            en_US: 'Enable Table Recognition',
                            zh_Hans: '启用表格识别'
                        },
                        description: {
                            en_US: 'Enable recognition of tables in documents.',
                            zh_Hans: '启用对文档中表格的识别。'
                        },
                        default: true
                    },
                    language: {
                        type: 'string',
                        title: {
                            en_US: 'Document Language',
                            zh_Hans: '文档语言'
                        },
                        description: {
                            en_US: 'The primary language of the document (e.g., "en" for English, "ch" for Chinese).',
                            zh_Hans: '文档的主要语言(例如,英文为 "en",中文为 "ch")。'
                        },
                        default: 'ch'
                    },
                    modelVersion: {
                        type: 'string',
                        title: {
                            en_US: 'Model Version',
                            zh_Hans: '模型版本'
                        },
                        description: {
                            en_US: 'The model version to use for extraction (e.g., "vlm" or "pipeline").',
                            zh_Hans: '用于提取的模型版本(例如,“vlm”或“pipeline”)。'
                        },
                        enum: ['pipeline', 'vlm'],
                        default: 'pipeline'
                    }
                },
                required: []
            }
        };
    }
    /**
     * Not implemented: always throws `Error('Method not implemented.')`.
     */
    validateConfig(config) {
        throw new Error('Method not implemented.');
    }
    /**
     * Convert each document through MinerU and collect the parsed results.
     *
     * Options set in `config` (`isOcr`, `enableFormula`, `enableTable`,
     * `language`, `modelVersion`) override the built-in defaults on both
     * server types; null/undefined options are ignored.
     *
     * @param documents Documents to convert; reads `fileUrl`, `filePath`, `name`, `id`.
     * @param config Transformer options plus `permissions` (passed to the client;
     *        `permissions.fileSystem` is passed to the parser).
     * @returns One parsed result per document, with `id` copied from the document.
     * @throws Error when a self-hosted task yields no result, or when the client
     *         or parser fails.
     */
    async transformDocuments(documents, config) {
        const mineru = new MinerUClient(this.configService, config.permissions);
        // Only options the user actually set; shared by both server types.
        const overrides = omitBy(pick(config, ['isOcr', 'enableFormula', 'enableTable', 'language', 'modelVersion']), isNil);
        const parsedResults = [];
        // `for await` also iterates plain arrays, so existing callers are unaffected.
        for await (const document of documents) {
            let parsedResult;
            if (mineru.serverType === 'self-hosted') {
                const { taskId } = await mineru.createTask({
                    url: document.fileUrl,
                    filePath: document.filePath,
                    fileName: document.name,
                    isOcr: true,
                    enableFormula: true,
                    enableTable: true,
                    // No language/modelVersion defaults for self-hosted; sent only when configured.
                    ...overrides
                });
                const result = mineru.getSelfHostedTask(taskId);
                if (!result) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await this.resultParser.parseLocalTask(result, taskId, document, config.permissions.fileSystem);
            }
            else {
                const { taskId } = await mineru.createTask({
                    url: document.fileUrl,
                    isOcr: true,
                    enableFormula: true,
                    enableTable: true,
                    language: 'ch',
                    modelVersion: 'vlm',
                    ...overrides
                });
                // Poll every 5 s, for at most 5 minutes.
                const result = await mineru.waitForTask(taskId, 5 * 60 * 1000, 5000);
                parsedResult = await this.resultParser.parseFromUrl(result.full_zip_url, taskId, document, config.permissions.fileSystem);
            }
            parsedResult.id = document.id;
            parsedResults.push(parsedResult);
        }
        return parsedResults;
    }
};
151
+ __decorate([
152
+ Inject(MinerUResultParserService),
153
+ __metadata("design:type", MinerUResultParserService)
154
+ ], MinerUTransformerStrategy.prototype, "resultParser", void 0);
155
+ __decorate([
156
+ Inject(forwardRef(() => ConfigService)),
157
+ __metadata("design:type", ConfigService)
158
+ ], MinerUTransformerStrategy.prototype, "configService", void 0);
159
+ MinerUTransformerStrategy = __decorate([
160
+ Injectable(),
161
+ DocumentTransformerStrategy(MinerU)
162
+ ], MinerUTransformerStrategy);
163
+ export { MinerUTransformerStrategy };