@helloxiaohu/plugin-mineru 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -4,12 +4,12 @@ import { ChunkMetadata, XpFileSystem } from '@xpert-ai/plugin-sdk';
|
|
|
4
4
|
import { MinerUDocumentMetadata, MineruSelfHostedTaskResult } from './types.js';
|
|
5
5
|
export declare class MinerUResultParserService {
|
|
6
6
|
private readonly logger;
|
|
7
|
-
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
7
|
+
parseFromUrl(fullZipUrl: string, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
|
|
8
8
|
id?: string;
|
|
9
9
|
chunks: Document<ChunkMetadata>[];
|
|
10
10
|
metadata: MinerUDocumentMetadata;
|
|
11
11
|
}>;
|
|
12
|
-
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem
|
|
12
|
+
parseLocalTask(result: MineruSelfHostedTaskResult, taskId: string, document: Partial<IKnowledgeDocument>, fileSystem?: XpFileSystem): Promise<{
|
|
13
13
|
id?: string;
|
|
14
14
|
chunks: Document<ChunkMetadata>[];
|
|
15
15
|
metadata: MinerUDocumentMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,EAAE,YAAY,
|
|
1
|
+
{"version":3,"file":"result-parser.service.d.ts","sourceRoot":"","sources":["../../src/lib/result-parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EACL,aAAa,EAEb,YAAY,EACb,MAAM,sBAAsB,CAAC;AAK9B,OAAO,EAEL,sBAAsB,EACtB,0BAA0B,EAC3B,MAAM,YAAY,CAAC;AAEpB,qBACa,yBAAyB;IACpC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA8C;IAE/D,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;IAmGI,cAAc,CAClB,MAAM,EAAE,0BAA0B,EAClC,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACrC,UAAU,CAAC,EAAE,YAAY,GACxB,OAAO,CAAC;QACT,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,MAAM,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAClC,QAAQ,EAAE,sBAAsB,CAAC;KAClC,CAAC;CA2DH"}
|
|
@@ -37,43 +37,59 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
37
37
|
zipEntries.push({ entryName: entry.path, data });
|
|
38
38
|
const fileName = entry.path;
|
|
39
39
|
const filePath = join(document.folder || '', entry.path);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
40
|
+
if (fileSystem) {
|
|
41
|
+
const url = await fileSystem.writeFile(filePath, data);
|
|
42
|
+
pathMap.set(fileName, url);
|
|
43
|
+
// Write images to local file system
|
|
44
|
+
if (fileName.startsWith('images/')) {
|
|
45
|
+
assets.push({
|
|
46
|
+
type: 'image',
|
|
47
|
+
url: url,
|
|
48
|
+
filePath: filePath,
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
else if (fileName.endsWith('layout.json')) {
|
|
52
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
53
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
54
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
55
|
+
assets.push({
|
|
56
|
+
type: 'file',
|
|
57
|
+
url,
|
|
58
|
+
filePath: filePath,
|
|
59
|
+
});
|
|
60
|
+
}
|
|
61
|
+
else if (fileName.endsWith('content_list.json')) {
|
|
62
|
+
assets.push({
|
|
63
|
+
type: 'file',
|
|
64
|
+
url,
|
|
65
|
+
filePath: filePath,
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
else if (fileName.endsWith('full.md')) {
|
|
69
|
+
fullMd = data.toString('utf-8');
|
|
70
|
+
assets.push({
|
|
71
|
+
type: 'file',
|
|
72
|
+
url,
|
|
73
|
+
filePath: filePath,
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
77
|
+
metadata.originPdfUrl = fileName;
|
|
78
|
+
}
|
|
74
79
|
}
|
|
75
|
-
else
|
|
76
|
-
metadata
|
|
80
|
+
else {
|
|
81
|
+
// Still extract key metadata & markdown without writing to filesystem.
|
|
82
|
+
if (fileName.endsWith('layout.json')) {
|
|
83
|
+
layoutJson = JSON.parse(data.toString('utf-8'));
|
|
84
|
+
metadata.mineruBackend = layoutJson?._backend;
|
|
85
|
+
metadata.mineruVersion = layoutJson?._version_name;
|
|
86
|
+
}
|
|
87
|
+
else if (fileName.endsWith('full.md')) {
|
|
88
|
+
fullMd = data.toString('utf-8');
|
|
89
|
+
}
|
|
90
|
+
else if (fileName.endsWith('origin.pdf')) {
|
|
91
|
+
metadata.originPdfUrl = fileName;
|
|
92
|
+
}
|
|
77
93
|
}
|
|
78
94
|
}
|
|
79
95
|
metadata.assets = assets;
|
|
@@ -102,13 +118,23 @@ let MinerUResultParserService = MinerUResultParserService_1 = class MinerUResult
|
|
|
102
118
|
const pathMap = new Map();
|
|
103
119
|
for (const image of result.images) {
|
|
104
120
|
const filePath = join(document.folder || '', 'images', image.name);
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
121
|
+
if (fileSystem) {
|
|
122
|
+
const url = await fileSystem.writeFile(filePath, Buffer.from(image.dataUrl.split(',')[1], 'base64'));
|
|
123
|
+
pathMap.set(`images/${image.name}`, url);
|
|
124
|
+
assets.push({
|
|
125
|
+
type: 'image',
|
|
126
|
+
url: url,
|
|
127
|
+
filePath: filePath,
|
|
128
|
+
});
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
131
|
+
pathMap.set(`images/${image.name}`, image.dataUrl);
|
|
132
|
+
assets.push({
|
|
133
|
+
type: 'image',
|
|
134
|
+
url: image.dataUrl,
|
|
135
|
+
filePath: filePath,
|
|
136
|
+
});
|
|
137
|
+
}
|
|
112
138
|
}
|
|
113
139
|
if (result.sourceUrl) {
|
|
114
140
|
assets.push({
|