@helloxiaohu/plugin-mineru6 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +101 -0
  2. package/dist/index.d.ts +6 -0
  3. package/dist/index.d.ts.map +1 -0
  4. package/dist/index.js +40 -0
  5. package/dist/lib/integration.strategy.d.ts +10 -0
  6. package/dist/lib/integration.strategy.d.ts.map +1 -0
  7. package/dist/lib/integration.strategy.js +118 -0
  8. package/dist/lib/mineru-toolset.strategy.d.ts +221 -0
  9. package/dist/lib/mineru-toolset.strategy.d.ts.map +1 -0
  10. package/dist/lib/mineru-toolset.strategy.js +236 -0
  11. package/dist/lib/mineru.client.d.ts +120 -0
  12. package/dist/lib/mineru.client.d.ts.map +1 -0
  13. package/dist/lib/mineru.client.js +456 -0
  14. package/dist/lib/mineru.controller.d.ts +9 -0
  15. package/dist/lib/mineru.controller.d.ts.map +1 -0
  16. package/dist/lib/mineru.controller.js +41 -0
  17. package/dist/lib/mineru.plugin.d.ts +13 -0
  18. package/dist/lib/mineru.plugin.d.ts.map +1 -0
  19. package/dist/lib/mineru.plugin.js +52 -0
  20. package/dist/lib/mineru.tool.d.ts +75 -0
  21. package/dist/lib/mineru.tool.d.ts.map +1 -0
  22. package/dist/lib/mineru.tool.js +141 -0
  23. package/dist/lib/mineru.toolset.d.ts +51 -0
  24. package/dist/lib/mineru.toolset.d.ts.map +1 -0
  25. package/dist/lib/mineru.toolset.js +52 -0
  26. package/dist/lib/path-meta.d.ts +5 -0
  27. package/dist/lib/path-meta.d.ts.map +1 -0
  28. package/dist/lib/path-meta.js +8 -0
  29. package/dist/lib/result-parser.service.d.ts +18 -0
  30. package/dist/lib/result-parser.service.d.ts.map +1 -0
  31. package/dist/lib/result-parser.service.js +171 -0
  32. package/dist/lib/transformer-mineru.strategy.d.ts +95 -0
  33. package/dist/lib/transformer-mineru.strategy.d.ts.map +1 -0
  34. package/dist/lib/transformer-mineru.strategy.js +163 -0
  35. package/dist/lib/types.d.ts +53 -0
  36. package/dist/lib/types.d.ts.map +1 -0
  37. package/dist/lib/types.js +40 -0
  38. package/package.json +62 -0
@@ -0,0 +1,141 @@
1
+ import { tool } from '@langchain/core/tools';
2
+ import { getCurrentTaskInput } from '@langchain/langgraph';
3
+ import { getErrorMessage } from '@xpert-ai/plugin-sdk';
4
+ import { z } from 'zod';
5
+ import { MinerUClient } from './mineru.client.js';
6
+ import { MinerUIntegration } from './types.js';
7
/** MIME types for image assets, keyed by lower-case file extension. */
const IMAGE_MIME_TYPES = new Map([
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
    ['svg', 'image/svg+xml'],
]);
/** MIME types for non-image assets, keyed by lower-case file extension. */
const FILE_MIME_TYPES = new Map([
    ['md', 'text/markdown'],
    ['json', 'application/json'],
    ['pdf', 'application/pdf'],
]);
/**
 * Derive a file name from the last path segment of a document URL.
 * Falls back to `fallback` when the URL cannot be parsed or has no last segment.
 */
function fileNameFromUrl(docUrl, fallback = 'document.pdf') {
    try {
        return new URL(docUrl).pathname.split('/').pop() || fallback;
    }
    catch {
        // Not a parseable absolute URL: keep the generic name.
        return fallback;
    }
}
/**
 * Convert one parsed-result asset into the file artifact shape returned by the tool.
 */
function toFileArtifact(asset) {
    const fileName = asset.filePath?.split(/[/\\]/).pop() || asset.url?.split('/').pop() || 'file';
    // Only text after the last dot counts as an extension; a name without a dot
    // falls back to 'md' instead of being treated as its own extension.
    const dotIndex = fileName.lastIndexOf('.');
    const extension = (dotIndex >= 0 ? fileName.slice(dotIndex + 1).toLowerCase() : '') || 'md';
    const mimeType = asset.type === 'image'
        ? (IMAGE_MIME_TYPES.get(extension) ?? 'image/jpeg')
        : (FILE_MIME_TYPES.get(extension) ?? 'application/octet-stream');
    return {
        fileName,
        filePath: asset.filePath,
        fileUrl: asset.url,
        mimeType,
        extension,
    };
}
/**
 * Build the MinerU PDF parser tool.
 * The tool submits a PDF URL to MinerU, parses the result through `resultParser`
 * and returns a markdown preview as content plus the extracted files as artifact.
 *
 * @param configService - passed through to `MinerUClient`
 * @param resultParser - exposes `parseLocalTask` / `parseFromUrl`
 * @param options - toolset credentials (`apiUrl`, `apiKey`, `serverType`); per-call input overrides them
 * @param fileSystem - optional file system handed to the client and the parser
 * @param defaults - toolset-level defaults for `isOcr`, `enableFormula`, `enableTable`, `language`, `modelVersion`
 * @returns a LangChain tool named `mineru_pdf_parser` using the `content_and_artifact` response format
 * @throws Error prefixed with "MinerU processing failed:" for any failure; the original error is kept on `cause`
 */
export function buildMinerUTool(configService, resultParser, options, fileSystem, defaults) {
    return tool(async (input) => {
        try {
            const {
            // Coze-like parameters (snake_case)
            doc_url, token, url, server_type, enable_formula, enable_table, is_ocr, language, model_version } = input;
            if (!doc_url) {
                throw new Error('doc_url is required');
            }
            // Workspace folder comes from the current task state, with a fixed fallback.
            const currentState = getCurrentTaskInput();
            const workspacePath = currentState?.['sys']?.['volume'] ?? '/tmp/xpert';
            // Per-call values win over toolset credentials, which win over built-in defaults.
            const effectiveOptions = {
                // Default to the official MinerU v4 API base URL used by MinerUClient
                apiUrl: url ?? options?.apiUrl ?? 'https://mineru.net/api/v4',
                apiKey: token ?? options?.apiKey,
                serverType: (server_type ?? options?.serverType ?? 'official'),
            };
            if (effectiveOptions.serverType === 'official' && !effectiveOptions.apiKey) {
                throw new Error('token (API Token) is required when server_type is "official"');
            }
            const mineruClient = new MinerUClient(configService, {
                fileSystem,
                integration: {
                    provider: MinerUIntegration,
                    options: effectiveOptions,
                },
            });
            const finalFileName = fileNameFromUrl(doc_url);
            const documentInfo = {
                fileUrl: doc_url,
                name: finalFileName,
                folder: workspacePath,
            };
            // Create MinerU task; input values fall back to toolset defaults, then to built-ins.
            const { taskId } = await mineruClient.createTask({
                url: doc_url,
                fileName: finalFileName,
                isOcr: is_ocr ?? defaults?.isOcr ?? true,
                enableFormula: enable_formula ?? defaults?.enableFormula ?? true,
                enableTable: enable_table ?? defaults?.enableTable ?? true,
                language: language ?? defaults?.language ?? 'ch',
                modelVersion: model_version ?? defaults?.modelVersion ?? 'pipeline',
            });
            let parsedResult;
            if (mineruClient.serverType === 'self-hosted') {
                // Self-hosted: the result is already available after createTask
                const taskResult = mineruClient.getSelfHostedTask(taskId);
                if (!taskResult) {
                    throw new Error('Failed to get MinerU task result');
                }
                parsedResult = await resultParser.parseLocalTask(taskResult, taskId, documentInfo, fileSystem);
            }
            else {
                // Official API: poll every 5 s for up to 5 minutes, then fetch the result archive
                const result = await mineruClient.waitForTask(taskId, 5 * 60 * 1000, 5000);
                parsedResult = await resultParser.parseFromUrl(result.full_zip_url, taskId, documentInfo, fileSystem);
            }
            // Only 'file' and 'image' assets are exposed as artifacts.
            const fileArtifacts = (parsedResult.metadata?.assets ?? [])
                .filter((asset) => asset.type === 'file' || asset.type === 'image')
                .map(toFileArtifact);
            // Extract markdown content from chunks
            const markdownContent = parsedResult.chunks
                ?.map((chunk) => chunk.pageContent)
                .join('\n\n') || '';
            return [
                `PDF processed successfully by MinerU.\n\nTask ID: ${taskId}\n\nMarkdown Content:\n${markdownContent.substring(0, 1000)}${markdownContent.length > 1000 ? '...' : ''}`,
                {
                    files: fileArtifacts,
                    taskId,
                    metadata: parsedResult.metadata,
                },
            ];
        }
        catch (error) {
            // Keep the original error reachable for callers and logs.
            throw Object.assign(new Error(`MinerU processing failed: ${getErrorMessage(error)}`), { cause: error });
        }
    }, {
        name: 'mineru_pdf_parser',
        description: 'Convert PDF files to markdown format using MinerU. Supports OCR, formula recognition, and table extraction. Returns markdown content and extracted files (images, JSON, etc.).',
        schema: z.object({
            doc_url: z.string().min(1).describe('PDF URL (required)'),
            token: z.string().optional().nullable().describe('MinerU API token (required when server_type is "official")'),
            url: z.string().optional().nullable().describe('MinerU API base url (optional, default: https://mineru.net/api/v4)'),
            server_type: z.enum(['official', 'self-hosted']).optional().nullable().describe('MinerU service type (default: "official")'),
            enable_formula: z.boolean().optional().nullable().describe('Enable formula recognition (default: true)'),
            enable_table: z.boolean().optional().nullable().describe('Enable table recognition (default: true)'),
            is_ocr: z.boolean().optional().nullable().describe('Enable OCR for image-based PDFs (default: true)'),
            language: z.enum(['en', 'ch']).optional().nullable().describe('Document language: "en" for English, "ch" for Chinese (default: "ch")'),
            model_version: z.enum(['pipeline', 'vlm']).optional().nullable().describe('Model version: "pipeline" or "vlm" (default: "pipeline")'),
        }),
        responseFormat: 'content_and_artifact',
    });
}
@@ -0,0 +1,51 @@
1
+ import { StructuredToolInterface, ToolSchemaBase } from '@langchain/core/tools';
2
+ import { BuiltinToolset, XpFileSystem } from '@xpert-ai/plugin-sdk';
3
+ import { ConfigService } from '@nestjs/config';
4
+ import { MinerUResultParserService } from './result-parser.service.js';
5
+ import { MinerUIntegrationOptions } from './types.js';
6
+ /**
7
+ * Configuration for MinerU Toolset: API credentials, injected runtime dependencies and default parsing options
8
+ */
9
+ export interface MinerUToolsetConfig {
10
+ /**
11
+ * MinerU API options stored in toolset credentials
12
+ */
13
+ apiUrl?: string; // MinerU API base URL; the tool falls back to https://mineru.net/api/v4 when unset
14
+ apiKey?: string; // API token; the tool rejects calls without one when the server type is 'official'
15
+ serverType?: MinerUIntegrationOptions['serverType']; // initTools passes 'official' when this is unset
16
+ fileSystem?: XpFileSystem; // optional; forwarded to the tool as-is
17
+ configService?: ConfigService; // runtime dependency; initTools throws when it is missing
18
+ resultParser?: MinerUResultParserService; // runtime dependency; initTools throws when it is missing
19
+ isOcr?: boolean; // default OCR flag; the tool uses true when neither input nor config sets it
20
+ enableFormula?: boolean; // default formula-recognition flag; tool-level fallback is true
21
+ enableTable?: boolean; // default table-recognition flag; tool-level fallback is true
22
+ language?: 'en' | 'ch'; // default document language; tool-level fallback is 'ch'
23
+ modelVersion?: 'pipeline' | 'vlm'; // default model version; tool-level fallback is 'pipeline'
24
+ }
25
+ /**
26
+ * MinerU Toolset implementation
27
+ * Provides PDF to markdown conversion tool using MinerU service
28
+ */
29
+ export declare class MinerUToolset extends BuiltinToolset<StructuredToolInterface, MinerUToolsetConfig> {
30
+ private readonly config; // the config object handed to the constructor
31
+ /**
32
+ * Constructor for MinerU Toolset
33
+ * Accepts config which contains credentials and dependencies
34
+ * Note: Using 'as any' for params because TBuiltinToolsetParams requires system-provided
35
+ * properties (tenantId, env) that are added at runtime
36
+ */
37
+ constructor(config: MinerUToolsetConfig);
38
+ /**
39
+ * Validate credentials for MinerU toolset
40
+ * Note: During authorization phase, credentials may be incomplete.
41
+ * configService and resultParser are runtime dependencies injected by the strategy.
42
+ * We don't validate anything here to allow authorization to proceed.
43
+ */
44
+ _validateCredentials(credentials: MinerUToolsetConfig): Promise<void>; // currently a no-op that always resolves
45
+ /**
46
+ * Initialize tools for MinerU toolset
47
+ * Creates the PDF parser tool with necessary dependencies
48
+ */
49
+ initTools(): Promise<StructuredToolInterface<ToolSchemaBase, any, any>[]>; // rejects when configService or resultParser is missing from the config
50
+ }
51
+ //# sourceMappingURL=mineru.toolset.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"mineru.toolset.d.ts","sourceRoot":"","sources":["../../src/lib/mineru.toolset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAEvE,OAAO,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAEtD;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,wBAAwB,CAAC,YAAY,CAAC,CAAC;IACpD,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IAEzC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,QAAQ,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACvB,YAAY,CAAC,EAAE,UAAU,GAAG,KAAK,CAAC;CACnC;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,cAAc,CAAC,uBAAuB,EAAE,mBAAmB,CAAC;IAC7F,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C;;;;;OAKG;gBACS,MAAM,EAAE,mBAAmB;IAKvC;;;;;OAKG;IACY,oBAAoB,CAAC,WAAW,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC;IAKpF;;;OAGG;IACY,SAAS,IAAI,OAAO,CAAC,uBAAuB,CAAC,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;CA0BzF"}
@@ -0,0 +1,52 @@
1
+ import { BuiltinToolset } from '@xpert-ai/plugin-sdk';
2
+ import { buildMinerUTool } from './mineru.tool.js';
3
/**
 * Toolset exposing the MinerU PDF-to-markdown tool.
 */
export class MinerUToolset extends BuiltinToolset {
    /**
     * Registers the toolset under the 'mineru' provider name and keeps the config,
     * which carries both credentials and injected dependencies.
     */
    constructor(config) {
        super('mineru', undefined, config);
        this.config = config;
    }
    /**
     * Credential validation hook.
     * Intentionally does nothing: credentials may be incomplete during authorization,
     * and the MinerU server enforces API key validity when the tool is used.
     */
    async _validateCredentials(credentials) {
        // No validation needed during authorization phase
    }
    /**
     * Create the PDF parser tool from the stored config and expose it as the toolset's only tool.
     * Throws when either injected dependency (configService, resultParser) is missing.
     */
    async initTools() {
        const settings = this.config;
        const { configService, resultParser, fileSystem } = settings;
        if (!(configService && resultParser)) {
            throw new Error('ConfigService and MinerUResultParserService are required');
        }
        // Credentials forwarded to the tool; the server type falls back to the official API.
        const credentials = {
            apiUrl: settings.apiUrl,
            apiKey: settings.apiKey,
            serverType: settings.serverType ?? 'official',
        };
        // Toolset-level parsing defaults; individual tool calls may override them.
        const parsingDefaults = {
            isOcr: settings.isOcr,
            enableFormula: settings.enableFormula,
            enableTable: settings.enableTable,
            language: settings.language,
            modelVersion: settings.modelVersion,
        };
        const pdfParserTool = buildMinerUTool(configService, resultParser, credentials, fileSystem, parsingDefaults);
        this.tools = [pdfParserTool];
        return this.tools;
    }
}
@@ -0,0 +1,5 @@
1
+ export declare function getModuleMeta(meta?: ImportMeta): { // pass `import.meta` when calling from an ES module; its `url` is read when `require` is not defined
2
+ __filename: string; // path of the module file
3
+ __dirname: string; // directory containing that file
4
+ };
5
+ //# sourceMappingURL=path-meta.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"path-meta.d.ts","sourceRoot":"","sources":["../../src/lib/path-meta.ts"],"names":[],"mappings":"AAGA,wBAAgB,aAAa,CAAC,IAAI,CAAC,EAAE,UAAU;;;EAK9C"}
@@ -0,0 +1,8 @@
1
+ import { dirname } from 'path';
2
+ import { fileURLToPath } from 'url';
3
/**
 * Resolve `__filename` / `__dirname` for the current module in both module systems.
 *
 * Under CommonJS the values come from the module-scope globals. Otherwise they
 * are derived from `meta.url`, so ES-module callers must pass `import.meta`.
 *
 * @param meta - the caller's `import.meta`; only needed when the CommonJS globals are absent
 * @returns the module's file path and containing directory
 * @throws TypeError when the CommonJS globals are absent and `meta.url` is missing
 */
export function getModuleMeta(meta) {
    // Require all CommonJS globals, not only `require`, before reading them:
    // a shimmed `require` alone would otherwise cause a ReferenceError below.
    const hasCommonJsGlobals = typeof require !== 'undefined'
        && typeof __filename === 'string'
        && typeof __dirname === 'string';
    if (hasCommonJsGlobals) {
        return { __filename: __filename, __dirname: __dirname };
    }
    if (!meta?.url) {
        throw new TypeError('getModuleMeta: pass import.meta when calling from an ES module');
    }
    const filename = fileURLToPath(meta.url);
    return { __filename: filename, __dirname: dirname(filename) };
}
@@ -0,0 +1,18 @@
1
+ import { Document } from '@langchain/core/documents';
2
+ import { IKnowledgeDocument } from '@metad/contracts';
3
+ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
4
+ import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
5
+ export declare class MinerUResultParserService { // turns MinerU output into markdown chunks plus asset metadata
6
+ private readonly logger;
7
+ parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{ // downloads the result zip from fullZipUrl; extracted files are written only when fileSystem is given
8
+ id?: string; // not set by the implementation in this package
9
+ chunks: Document<ChunkMetadata>[]; // a single Document holding the full markdown
10
+ metadata: MinerUDocumentMetadata; // parser name, taskId, assets and, when found in the archive, backend/version details
11
+ }>;
12
+ parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{ // builds the same shape from a self-hosted task result (markdown text plus images as data URLs)
13
+ id?: string; // not set by the implementation in this package
14
+ chunks: Document<ChunkMetadata>[]; // a single Document holding the full markdown
15
+ metadata: MinerUDocumentMetadata; // parser name, taskId, assets and the source URL when the result has one
16
+ }>;
17
+ }
18
+ //# sourceMappingURL=result-parser.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAqGI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CA4DH"}
@@ -0,0 +1,171 @@
1
+ var MinerUResultParserService_1;
2
+ import { __decorate } from "tslib";
3
+ import { Document } from '@langchain/core/documents';
4
+ import { Injectable, Logger } from '@nestjs/common';
5
+ import axios from 'axios';
6
+ import { join } from 'path';
7
+ import unzipper from 'unzipper';
8
+ import { v4 as uuidv4 } from 'uuid';
9
+ import { MinerU, } from './types.js';
10
/**
 * Matches markdown image references that point at a relative `images/...` path.
 * The alt text is matched lazily so that several images on one line are each
 * rewritten; a greedy match would swallow all but the last reference.
 */
const RELATIVE_IMAGE_REFERENCE = /!\[(.*?)\]\((images\/.+?)\)/g;
/**
 * Replace relative `images/...` targets in markdown with the URLs recorded in `pathMap`.
 * References without a mapping are left untouched.
 */
function rewriteImageLinks(markdown, pathMap) {
    return markdown.replace(RELATIVE_IMAGE_REFERENCE, (match, altText, relativePath) => {
        const target = pathMap.get(relativePath);
        return target ? `![${altText}](${target})` : match;
    });
}
/**
 * True when an archive entry path is absolute or contains a `..` segment,
 * i.e. joining it onto the target folder could write outside that folder.
 */
function isUnsafeArchivePath(entryPath) {
    if (/^([\\/]|[a-zA-Z]:)/.test(entryPath)) {
        return true;
    }
    return entryPath.split(/[\\/]+/).includes('..');
}
/**
 * Decode the base64 payload of a data URL. Input without a comma is treated
 * as a bare base64 string instead of raising on an undefined payload.
 */
function decodeDataUrl(dataUrl) {
    const commaIndex = dataUrl.indexOf(',');
    const payload = commaIndex >= 0 ? dataUrl.slice(commaIndex + 1) : dataUrl;
    return Buffer.from(payload, 'base64');
}
/**
 * Converts MinerU output (a result zip or a self-hosted task result) into a
 * single markdown chunk plus metadata describing the extracted assets.
 */
let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResultParserService {
    constructor() {
        this.logger = new Logger(MinerUResultParserService_1.name);
    }
    /**
     * Download the MinerU result archive and extract markdown, metadata and assets.
     *
     * @param fullZipUrl - URL of the result zip
     * @param taskId - MinerU task id, copied into metadata and chunk metadata
     * @param document - `folder` is used as the base directory for written files
     * @param fileSystem - when omitted, files are not persisted and no assets are recorded
     * @returns one markdown chunk and the collected metadata
     */
    async parseFromUrl(fullZipUrl, taskId, document, fileSystem) {
        this.logger.debug(`Downloading MinerU result from: ${fullZipUrl}`);
        // 1. Download the zip file to memory
        const response = await axios.get(fullZipUrl, {
            responseType: 'arraybuffer',
        });
        const zipBuffer = Buffer.from(response.data);
        const metadata = {
            parser: MinerU,
            taskId,
        };
        // 2. Unzip the file
        const assets = [];
        const directory = await unzipper.Open.buffer(zipBuffer);
        const pathMap = new Map();
        let fullMd = '';
        let layoutJson = null;
        for (const entry of directory.files) {
            if (entry.type !== 'File')
                continue;
            const fileName = entry.path;
            // The archive comes from a remote URL: never let an entry escape the target folder.
            if (isUnsafeArchivePath(fileName)) {
                this.logger.warn(`Skipping MinerU archive entry with unsafe path: ${fileName}`);
                continue;
            }
            const data = await entry.buffer();
            const filePath = join(document.folder || '', fileName);
            // If platform didn't provide filesystem permission, still parse markdown but skip persisting files.
            let url;
            if (fileSystem) {
                url = await fileSystem.writeFile(filePath, data);
                pathMap.set(fileName, url);
                if (fileName.startsWith('images/')) {
                    assets.push({ type: 'image', url, filePath });
                    continue;
                }
            }
            if (fileName.endsWith('layout.json')) {
                layoutJson = JSON.parse(data.toString('utf-8'));
                metadata.mineruBackend = layoutJson?._backend;
                metadata.mineruVersion = layoutJson?._version_name;
                if (fileSystem) {
                    assets.push({ type: 'file', url, filePath });
                }
            }
            else if (fileName.endsWith('content_list.json')) {
                if (fileSystem) {
                    assets.push({ type: 'file', url, filePath });
                }
            }
            else if (fileName.endsWith('full.md')) {
                fullMd = data.toString('utf-8');
                if (fileSystem) {
                    assets.push({ type: 'file', url, filePath });
                }
            }
            else if (fileName.endsWith('origin.pdf')) {
                metadata.originPdfUrl = fileName;
            }
        }
        metadata.assets = assets;
        // 3. Replace image relative path in full.md with file url
        fullMd = rewriteImageLinks(fullMd, pathMap);
        const chunks = [
            new Document({
                pageContent: fullMd,
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }
    /**
     * Build the parsed result from a self-hosted MinerU task result.
     *
     * @param result - provides `mdContent`, `images` (name + data URL) and optionally `sourceUrl` / `fileName`
     * @param taskId - MinerU task id, copied into metadata and chunk metadata
     * @param document - `folder` is used as the base directory for written files
     * @param fileSystem - when omitted, images stay embedded as data URLs
     * @returns one markdown chunk and the collected metadata
     */
    async parseLocalTask(result, taskId, document, fileSystem) {
        const metadata = {
            parser: MinerU,
            taskId,
        };
        const assets = [];
        const pathMap = new Map();
        // A result without images is valid; treat it as an empty list.
        for (const image of result.images ?? []) {
            const filePath = join(document.folder || '', 'images', image.name);
            // Fallback: keep images as data URLs so markdown can still render without filesystem permission
            const url = fileSystem
                ? await fileSystem.writeFile(filePath, decodeDataUrl(image.dataUrl))
                : image.dataUrl;
            pathMap.set(`images/${image.name}`, url);
            assets.push({ type: 'image', url, filePath });
        }
        if (result.sourceUrl) {
            assets.push({
                type: 'file',
                url: result.sourceUrl,
                filePath: join(document.folder || '', result.fileName || 'source.pdf'),
            });
            metadata.originPdfUrl = result.sourceUrl;
        }
        metadata.assets = assets;
        // Replace image relative path in the markdown with file url
        const fullMd = rewriteImageLinks(result.mdContent ?? '', pathMap);
        const chunks = [
            new Document({
                pageContent: fullMd,
                metadata: { parser: MinerU, taskId, chunkId: uuidv4() },
            }),
        ];
        return {
            chunks,
            metadata,
        };
    }
};
MinerUResultParserService = MinerUResultParserService_1 = __decorate([
    Injectable()
], MinerUResultParserService);
171
+ export { MinerUResultParserService };
@@ -0,0 +1,95 @@
1
+ import { IconType, IKnowledgeDocument } from '@metad/contracts';
2
+ import { ChunkMetadata, FileSystemPermission, IDocumentTransformerStrategy, IntegrationPermission } from '@xpert-ai/plugin-sdk';
3
+ import { TMinerUTransformerConfig } from './types.js';
4
+ export declare class MinerUTransformerStrategy implements IDocumentTransformerStrategy<TMinerUTransformerConfig> { // document transformer strategy configured by TMinerUTransformerConfig
5
+ private readonly resultParser; // type erased in this declaration; presumably the MinerU result parser — confirm in source
6
+ private readonly configService; // type erased in this declaration — confirm in source
7
+ readonly permissions: (IntegrationPermission | FileSystemPermission)[]; // integration and/or file-system permissions declared by the strategy
8
+ readonly meta: { // descriptor: names, icon, help link and config schema
9
+ name: string;
10
+ label: { // label text keyed by locale
11
+ en_US: string;
12
+ zh_Hans: string;
13
+ };
14
+ description: { // description text keyed by locale
15
+ en_US: string;
16
+ zh_Hans: string;
17
+ };
18
+ icon: {
19
+ type: IconType;
20
+ value: string;
21
+ color: string;
22
+ };
23
+ helpUrl: string;
24
+ configSchema: { // schema describing the options below; each has a localized title and description
25
+ type: string;
26
+ properties: {
27
+ isOcr: { // boolean default
28
+ type: string;
29
+ title: {
30
+ en_US: string;
31
+ zh_Hans: string;
32
+ };
33
+ description: {
34
+ en_US: string;
35
+ zh_Hans: string;
36
+ };
37
+ default: boolean;
38
+ };
39
+ enableFormula: { // boolean default
40
+ type: string;
41
+ title: {
42
+ en_US: string;
43
+ zh_Hans: string;
44
+ };
45
+ description: {
46
+ en_US: string;
47
+ zh_Hans: string;
48
+ };
49
+ default: boolean;
50
+ };
51
+ enableTable: { // boolean default
52
+ type: string;
53
+ title: {
54
+ en_US: string;
55
+ zh_Hans: string;
56
+ };
57
+ description: {
58
+ en_US: string;
59
+ zh_Hans: string;
60
+ };
61
+ default: boolean;
62
+ };
63
+ language: { // string default
64
+ type: string;
65
+ title: {
66
+ en_US: string;
67
+ zh_Hans: string;
68
+ };
69
+ description: {
70
+ en_US: string;
71
+ zh_Hans: string;
72
+ };
73
+ default: string;
74
+ };
75
+ modelVersion: { // string default restricted by an `enum` list
76
+ type: string;
77
+ title: {
78
+ en_US: string;
79
+ zh_Hans: string;
80
+ };
81
+ description: {
82
+ en_US: string;
83
+ zh_Hans: string;
84
+ };
85
+ enum: string[];
86
+ default: string;
87
+ };
88
+ };
89
+ required: any[];
90
+ };
91
+ };
92
+ validateConfig(config: any): Promise<void>; // resolves when the config is accepted; validation rules are not visible in this declaration
93
+ transformDocuments(documents: Partial<IKnowledgeDocument>[], config: TMinerUTransformerConfig): Promise<Partial<IKnowledgeDocument<ChunkMetadata>>[]>; // returns transformed documents with ChunkMetadata-typed chunks
94
+ }
95
+ //# sourceMappingURL=transformer-mineru.strategy.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transformer-mineru.strategy.d.ts","sourceRoot":"","sources":["../../src/lib/transformer-mineru.strategy.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AAG/D,OAAO,EACL,aAAa,EAEb,oBAAoB,EACpB,4BAA4B,EAC5B,qBAAqB,EACtB,MAAM,sBAAsB,CAAA;AAI7B,OAAO,EAA8C,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAEjG,qBAEa,yBAA0B,YAAW,4BAA4B,CAAC,wBAAwB,CAAC;IAEtG,OAAO,CAAC,QAAQ,CAAC,YAAY,CAA2B;IAGxD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAe;IAE7C,QAAQ,CAAC,WAAW,mDAWnB;IAED,QAAQ,CAAC,IAAI;;;;;;;;;;;kBAWM,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwE1B;IAED,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC;IAIpC,kBAAkB,CACtB,SAAS,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAAE,EACxC,MAAM,EAAE,wBAAwB,GAC/B,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;CAsDzD"}