@ontos-ai/knowhere-sdk 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +73 -7
- package/dist/index.d.ts +73 -7
- package/dist/index.js +80 -3
- package/dist/index.mjs +80 -3
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -216,7 +216,7 @@ interface ParseParams {
|
|
|
216
216
|
/** Generate table summaries */
|
|
217
217
|
summaryTable?: boolean;
|
|
218
218
|
/** Generate text summaries */
|
|
219
|
-
|
|
219
|
+
summaryTxt?: boolean;
|
|
220
220
|
/** Custom data identifier */
|
|
221
221
|
dataId?: string;
|
|
222
222
|
/** Additional fragment description */
|
|
@@ -282,6 +282,30 @@ interface Statistics {
|
|
|
282
282
|
interface FileIndex {
|
|
283
283
|
[chunkId: string]: string;
|
|
284
284
|
}
|
|
285
|
+
/**
|
|
286
|
+
* Processing cost details emitted by manifest v2
|
|
287
|
+
*/
|
|
288
|
+
interface ProcessingCost {
|
|
289
|
+
microDollars?: number;
|
|
290
|
+
credits?: number;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Processing timing details emitted by manifest v2
|
|
294
|
+
*/
|
|
295
|
+
interface ProcessingTiming {
|
|
296
|
+
startedAt?: Date;
|
|
297
|
+
completedAt?: Date;
|
|
298
|
+
durationMs?: number;
|
|
299
|
+
}
|
|
300
|
+
/**
|
|
301
|
+
* Processing metadata emitted by manifest v2
|
|
302
|
+
*/
|
|
303
|
+
interface ProcessingMetadata {
|
|
304
|
+
pageCount?: number;
|
|
305
|
+
billingStatus?: string;
|
|
306
|
+
cost?: ProcessingCost;
|
|
307
|
+
timing?: ProcessingTiming;
|
|
308
|
+
}
|
|
285
309
|
/**
|
|
286
310
|
* Manifest containing metadata about the parse result
|
|
287
311
|
*/
|
|
@@ -295,11 +319,29 @@ interface Manifest {
|
|
|
295
319
|
/** Original source file name */
|
|
296
320
|
sourceFileName: string;
|
|
297
321
|
/** Processing completion date */
|
|
298
|
-
|
|
322
|
+
/** Processing completion date (optional: only present if emitted by the worker) */
|
|
323
|
+
processingDate?: Date;
|
|
324
|
+
/** Worker-side processing metadata emitted by manifest v2 */
|
|
325
|
+
processing?: ProcessingMetadata;
|
|
299
326
|
/** Statistics */
|
|
300
327
|
statistics: Statistics;
|
|
301
|
-
/**
|
|
302
|
-
files
|
|
328
|
+
/** Legacy file index from earlier ZIP manifests */
|
|
329
|
+
files?: FileIndex;
|
|
330
|
+
}
|
|
331
|
+
/**
|
|
332
|
+
* Chunk relationship entry (metadata.connect_to per schema v2.1)
|
|
333
|
+
*/
|
|
334
|
+
interface ConnectTo {
|
|
335
|
+
/** Target chunk_id */
|
|
336
|
+
target: string;
|
|
337
|
+
/** Relationship type */
|
|
338
|
+
relation: 'embeds' | 'related';
|
|
339
|
+
/** Placeholder ref in content, e.g. '[images/a.png]' (embeds only) */
|
|
340
|
+
ref?: string;
|
|
341
|
+
/** Semantic similarity score (related only) */
|
|
342
|
+
score?: number;
|
|
343
|
+
/** Shared keywords (related only) */
|
|
344
|
+
keywords?: string[];
|
|
303
345
|
}
|
|
304
346
|
/**
|
|
305
347
|
* Base chunk properties
|
|
@@ -313,6 +355,17 @@ interface BaseChunk {
|
|
|
313
355
|
content: string;
|
|
314
356
|
/** Relative path in ZIP */
|
|
315
357
|
path: string;
|
|
358
|
+
/** Page numbers spanned by this chunk when provided by the backend */
|
|
359
|
+
pageNums?: number[];
|
|
360
|
+
}
|
|
361
|
+
/**
|
|
362
|
+
* Minimal chunk representation emitted in chunks_slim.json
|
|
363
|
+
*/
|
|
364
|
+
interface SlimChunk {
|
|
365
|
+
type: 'text' | 'image' | 'table';
|
|
366
|
+
path: string;
|
|
367
|
+
content: string;
|
|
368
|
+
summary?: string;
|
|
316
369
|
}
|
|
317
370
|
/**
|
|
318
371
|
* Text chunk
|
|
@@ -321,13 +374,18 @@ interface TextChunk extends BaseChunk {
|
|
|
321
374
|
type: 'text';
|
|
322
375
|
/** Content length */
|
|
323
376
|
length: number;
|
|
324
|
-
/**
|
|
325
|
-
tokens?:
|
|
377
|
+
/** Extracted tokens from the current backend payload */
|
|
378
|
+
tokens?: string[];
|
|
326
379
|
/** Extracted keywords */
|
|
327
380
|
keywords?: string[];
|
|
328
381
|
/** Generated summary */
|
|
329
382
|
summary?: string;
|
|
330
|
-
/**
|
|
383
|
+
/** Chunk relationships (schema v2.1: metadata.connect_to) */
|
|
384
|
+
connectTo?: ConnectTo[];
|
|
385
|
+
/**
|
|
386
|
+
* @deprecated Use connectTo instead. Retained for backward compatibility.
|
|
387
|
+
* Previously populated from metadata.relationships which is no longer emitted by the API.
|
|
388
|
+
*/
|
|
331
389
|
relationships?: string[];
|
|
332
390
|
}
|
|
333
391
|
/**
|
|
@@ -378,10 +436,18 @@ interface ParseResult {
|
|
|
378
436
|
manifest: Manifest;
|
|
379
437
|
/** All chunks */
|
|
380
438
|
chunks: Chunk[];
|
|
439
|
+
/** Minimal chunk projection from chunks_slim.json (if available) */
|
|
440
|
+
chunksSlim?: SlimChunk[];
|
|
381
441
|
/** Full document as Markdown (if available) */
|
|
382
442
|
fullMarkdown?: string;
|
|
383
443
|
/** Document hierarchy (if available) */
|
|
384
444
|
hierarchy?: unknown;
|
|
445
|
+
/** Table-of-contents hierarchy hints (if available) */
|
|
446
|
+
tocHierarchies?: unknown;
|
|
447
|
+
/** Knowledge-base CSV export (if available) */
|
|
448
|
+
kbCsv?: string;
|
|
449
|
+
/** Pre-rendered hierarchy HTML view (if available) */
|
|
450
|
+
hierarchyViewHtml?: string;
|
|
385
451
|
/** Raw ZIP buffer */
|
|
386
452
|
rawZip: Buffer;
|
|
387
453
|
/** Text chunks only */
|
package/dist/index.d.ts
CHANGED
|
@@ -216,7 +216,7 @@ interface ParseParams {
|
|
|
216
216
|
/** Generate table summaries */
|
|
217
217
|
summaryTable?: boolean;
|
|
218
218
|
/** Generate text summaries */
|
|
219
|
-
|
|
219
|
+
summaryTxt?: boolean;
|
|
220
220
|
/** Custom data identifier */
|
|
221
221
|
dataId?: string;
|
|
222
222
|
/** Additional fragment description */
|
|
@@ -282,6 +282,30 @@ interface Statistics {
|
|
|
282
282
|
interface FileIndex {
|
|
283
283
|
[chunkId: string]: string;
|
|
284
284
|
}
|
|
285
|
+
/**
|
|
286
|
+
* Processing cost details emitted by manifest v2
|
|
287
|
+
*/
|
|
288
|
+
interface ProcessingCost {
|
|
289
|
+
microDollars?: number;
|
|
290
|
+
credits?: number;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Processing timing details emitted by manifest v2
|
|
294
|
+
*/
|
|
295
|
+
interface ProcessingTiming {
|
|
296
|
+
startedAt?: Date;
|
|
297
|
+
completedAt?: Date;
|
|
298
|
+
durationMs?: number;
|
|
299
|
+
}
|
|
300
|
+
/**
|
|
301
|
+
* Processing metadata emitted by manifest v2
|
|
302
|
+
*/
|
|
303
|
+
interface ProcessingMetadata {
|
|
304
|
+
pageCount?: number;
|
|
305
|
+
billingStatus?: string;
|
|
306
|
+
cost?: ProcessingCost;
|
|
307
|
+
timing?: ProcessingTiming;
|
|
308
|
+
}
|
|
285
309
|
/**
|
|
286
310
|
* Manifest containing metadata about the parse result
|
|
287
311
|
*/
|
|
@@ -295,11 +319,29 @@ interface Manifest {
|
|
|
295
319
|
/** Original source file name */
|
|
296
320
|
sourceFileName: string;
|
|
297
321
|
/** Processing completion date */
|
|
298
|
-
|
|
322
|
+
/** Processing completion date (optional: only present if emitted by the worker) */
|
|
323
|
+
processingDate?: Date;
|
|
324
|
+
/** Worker-side processing metadata emitted by manifest v2 */
|
|
325
|
+
processing?: ProcessingMetadata;
|
|
299
326
|
/** Statistics */
|
|
300
327
|
statistics: Statistics;
|
|
301
|
-
/**
|
|
302
|
-
files
|
|
328
|
+
/** Legacy file index from earlier ZIP manifests */
|
|
329
|
+
files?: FileIndex;
|
|
330
|
+
}
|
|
331
|
+
/**
|
|
332
|
+
* Chunk relationship entry (metadata.connect_to per schema v2.1)
|
|
333
|
+
*/
|
|
334
|
+
interface ConnectTo {
|
|
335
|
+
/** Target chunk_id */
|
|
336
|
+
target: string;
|
|
337
|
+
/** Relationship type */
|
|
338
|
+
relation: 'embeds' | 'related';
|
|
339
|
+
/** Placeholder ref in content, e.g. '[images/a.png]' (embeds only) */
|
|
340
|
+
ref?: string;
|
|
341
|
+
/** Semantic similarity score (related only) */
|
|
342
|
+
score?: number;
|
|
343
|
+
/** Shared keywords (related only) */
|
|
344
|
+
keywords?: string[];
|
|
303
345
|
}
|
|
304
346
|
/**
|
|
305
347
|
* Base chunk properties
|
|
@@ -313,6 +355,17 @@ interface BaseChunk {
|
|
|
313
355
|
content: string;
|
|
314
356
|
/** Relative path in ZIP */
|
|
315
357
|
path: string;
|
|
358
|
+
/** Page numbers spanned by this chunk when provided by the backend */
|
|
359
|
+
pageNums?: number[];
|
|
360
|
+
}
|
|
361
|
+
/**
|
|
362
|
+
* Minimal chunk representation emitted in chunks_slim.json
|
|
363
|
+
*/
|
|
364
|
+
interface SlimChunk {
|
|
365
|
+
type: 'text' | 'image' | 'table';
|
|
366
|
+
path: string;
|
|
367
|
+
content: string;
|
|
368
|
+
summary?: string;
|
|
316
369
|
}
|
|
317
370
|
/**
|
|
318
371
|
* Text chunk
|
|
@@ -321,13 +374,18 @@ interface TextChunk extends BaseChunk {
|
|
|
321
374
|
type: 'text';
|
|
322
375
|
/** Content length */
|
|
323
376
|
length: number;
|
|
324
|
-
/**
|
|
325
|
-
tokens?:
|
|
377
|
+
/** Extracted tokens from the current backend payload */
|
|
378
|
+
tokens?: string[];
|
|
326
379
|
/** Extracted keywords */
|
|
327
380
|
keywords?: string[];
|
|
328
381
|
/** Generated summary */
|
|
329
382
|
summary?: string;
|
|
330
|
-
/**
|
|
383
|
+
/** Chunk relationships (schema v2.1: metadata.connect_to) */
|
|
384
|
+
connectTo?: ConnectTo[];
|
|
385
|
+
/**
|
|
386
|
+
* @deprecated Use connectTo instead. Retained for backward compatibility.
|
|
387
|
+
* Previously populated from metadata.relationships which is no longer emitted by the API.
|
|
388
|
+
*/
|
|
331
389
|
relationships?: string[];
|
|
332
390
|
}
|
|
333
391
|
/**
|
|
@@ -378,10 +436,18 @@ interface ParseResult {
|
|
|
378
436
|
manifest: Manifest;
|
|
379
437
|
/** All chunks */
|
|
380
438
|
chunks: Chunk[];
|
|
439
|
+
/** Minimal chunk projection from chunks_slim.json (if available) */
|
|
440
|
+
chunksSlim?: SlimChunk[];
|
|
381
441
|
/** Full document as Markdown (if available) */
|
|
382
442
|
fullMarkdown?: string;
|
|
383
443
|
/** Document hierarchy (if available) */
|
|
384
444
|
hierarchy?: unknown;
|
|
445
|
+
/** Table-of-contents hierarchy hints (if available) */
|
|
446
|
+
tocHierarchies?: unknown;
|
|
447
|
+
/** Knowledge-base CSV export (if available) */
|
|
448
|
+
kbCsv?: string;
|
|
449
|
+
/** Pre-rendered hierarchy HTML view (if available) */
|
|
450
|
+
hierarchyViewHtml?: string;
|
|
385
451
|
/** Raw ZIP buffer */
|
|
386
452
|
rawZip: Buffer;
|
|
387
453
|
/** Text chunks only */
|
package/dist/index.js
CHANGED
|
@@ -839,11 +839,39 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
839
839
|
const hierarchyContent = await hierarchyFile.async("string");
|
|
840
840
|
hierarchy = JSON.parse(hierarchyContent);
|
|
841
841
|
}
|
|
842
|
+
let chunksSlim;
|
|
843
|
+
const chunksSlimFile = zip.file("chunks_slim.json");
|
|
844
|
+
if (chunksSlimFile) {
|
|
845
|
+
const chunksSlimContent = await chunksSlimFile.async("string");
|
|
846
|
+
let chunksSlimData = JSON.parse(chunksSlimContent);
|
|
847
|
+
chunksSlimData = keysToCamel(chunksSlimData);
|
|
848
|
+
chunksSlim = extractSlimChunks(chunksSlimData);
|
|
849
|
+
}
|
|
850
|
+
let tocHierarchies;
|
|
851
|
+
const tocHierarchiesFile = zip.file("toc_hierarchies.json");
|
|
852
|
+
if (tocHierarchiesFile) {
|
|
853
|
+
const tocHierarchiesContent = await tocHierarchiesFile.async("string");
|
|
854
|
+
tocHierarchies = keysToCamel(JSON.parse(tocHierarchiesContent));
|
|
855
|
+
}
|
|
856
|
+
let kbCsv;
|
|
857
|
+
const kbCsvFile = zip.file("kb.csv");
|
|
858
|
+
if (kbCsvFile) {
|
|
859
|
+
kbCsv = await kbCsvFile.async("string");
|
|
860
|
+
}
|
|
861
|
+
let hierarchyViewHtml;
|
|
862
|
+
const hierarchyViewFile = zip.file("hierarchy_view.html");
|
|
863
|
+
if (hierarchyViewFile) {
|
|
864
|
+
hierarchyViewHtml = await hierarchyViewFile.async("string");
|
|
865
|
+
}
|
|
842
866
|
const result = {
|
|
843
867
|
manifest,
|
|
844
868
|
chunks,
|
|
869
|
+
chunksSlim,
|
|
845
870
|
fullMarkdown,
|
|
846
871
|
hierarchy,
|
|
872
|
+
tocHierarchies,
|
|
873
|
+
kbCsv,
|
|
874
|
+
hierarchyViewHtml,
|
|
847
875
|
rawZip: zipBuffer,
|
|
848
876
|
get textChunks() {
|
|
849
877
|
return chunks.filter((c) => c.type === "text");
|
|
@@ -867,12 +895,30 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
867
895
|
await import_fs2.promises.mkdir(directory, { recursive: true });
|
|
868
896
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
|
|
869
897
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
|
|
898
|
+
if (chunksSlim) {
|
|
899
|
+
await import_fs2.promises.writeFile(
|
|
900
|
+
(0, import_path.join)(directory, "chunks_slim.json"),
|
|
901
|
+
JSON.stringify({ chunks: chunksSlim }, null, 2)
|
|
902
|
+
);
|
|
903
|
+
}
|
|
870
904
|
if (fullMarkdown) {
|
|
871
905
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "full.md"), fullMarkdown);
|
|
872
906
|
}
|
|
873
907
|
if (hierarchy) {
|
|
874
908
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "hierarchy.json"), JSON.stringify(hierarchy, null, 2));
|
|
875
909
|
}
|
|
910
|
+
if (tocHierarchies) {
|
|
911
|
+
await import_fs2.promises.writeFile(
|
|
912
|
+
(0, import_path.join)(directory, "toc_hierarchies.json"),
|
|
913
|
+
JSON.stringify(tocHierarchies, null, 2)
|
|
914
|
+
);
|
|
915
|
+
}
|
|
916
|
+
if (kbCsv) {
|
|
917
|
+
await import_fs2.promises.writeFile((0, import_path.join)(directory, "kb.csv"), kbCsv);
|
|
918
|
+
}
|
|
919
|
+
if (hierarchyViewHtml) {
|
|
920
|
+
await import_fs2.promises.writeFile((0, import_path.join)(directory, "hierarchy_view.html"), hierarchyViewHtml);
|
|
921
|
+
}
|
|
876
922
|
for (const imageChunk of this.imageChunks) {
|
|
877
923
|
await imageChunk.save(directory);
|
|
878
924
|
}
|
|
@@ -894,6 +940,15 @@ function extractChunks(payload) {
|
|
|
894
940
|
}
|
|
895
941
|
return [];
|
|
896
942
|
}
|
|
943
|
+
function extractSlimChunks(payload) {
|
|
944
|
+
if (Array.isArray(payload)) {
|
|
945
|
+
return payload;
|
|
946
|
+
}
|
|
947
|
+
if (Array.isArray(payload.chunks)) {
|
|
948
|
+
return payload.chunks;
|
|
949
|
+
}
|
|
950
|
+
return [];
|
|
951
|
+
}
|
|
897
952
|
function getChunkMetadata(chunkData) {
|
|
898
953
|
if (!chunkData.metadata) {
|
|
899
954
|
return {};
|
|
@@ -904,18 +959,38 @@ function getChunkFilePath(chunkData) {
|
|
|
904
959
|
const metadata = getChunkMetadata(chunkData);
|
|
905
960
|
return chunkData.filePath ?? metadata.filePath ?? chunkData.path;
|
|
906
961
|
}
|
|
962
|
+
function normalizePageNums(pageNums) {
|
|
963
|
+
if (!Array.isArray(pageNums)) {
|
|
964
|
+
return void 0;
|
|
965
|
+
}
|
|
966
|
+
const normalized = pageNums.filter((pageNum) => typeof pageNum === "number");
|
|
967
|
+
return normalized.length > 0 ? normalized : void 0;
|
|
968
|
+
}
|
|
969
|
+
function normalizeTokens(tokens) {
|
|
970
|
+
if (!Array.isArray(tokens)) {
|
|
971
|
+
return void 0;
|
|
972
|
+
}
|
|
973
|
+
if (!tokens.every((token) => typeof token === "string")) {
|
|
974
|
+
return void 0;
|
|
975
|
+
}
|
|
976
|
+
return tokens;
|
|
977
|
+
}
|
|
907
978
|
function normalizeTextChunk(chunkData) {
|
|
908
979
|
const metadata = getChunkMetadata(chunkData);
|
|
980
|
+
const connectTo = metadata.connectTo ?? chunkData.connectTo;
|
|
981
|
+
const relationships = metadata.relationships ?? chunkData.relationships;
|
|
909
982
|
return {
|
|
910
983
|
chunkId: chunkData.chunkId ?? "",
|
|
911
984
|
type: "text",
|
|
912
985
|
content: chunkData.content ?? "",
|
|
913
986
|
path: chunkData.path ?? "",
|
|
987
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
914
988
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
915
|
-
tokens: metadata.tokens ?? chunkData.tokens,
|
|
989
|
+
tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
|
|
916
990
|
keywords: metadata.keywords ?? chunkData.keywords,
|
|
917
991
|
summary: metadata.summary ?? chunkData.summary,
|
|
918
|
-
|
|
992
|
+
...connectTo !== void 0 && { connectTo },
|
|
993
|
+
...relationships !== void 0 && { relationships }
|
|
919
994
|
};
|
|
920
995
|
}
|
|
921
996
|
async function processChunk(zip, chunkData) {
|
|
@@ -939,6 +1014,7 @@ async function processChunk(zip, chunkData) {
|
|
|
939
1014
|
type: "image",
|
|
940
1015
|
content: chunkData.content ?? "",
|
|
941
1016
|
path: chunkData.path ?? "",
|
|
1017
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
942
1018
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
943
1019
|
filePath,
|
|
944
1020
|
summary: metadata.summary ?? chunkData.summary,
|
|
@@ -973,6 +1049,7 @@ async function processChunk(zip, chunkData) {
|
|
|
973
1049
|
type: "table",
|
|
974
1050
|
content: chunkData.content ?? "",
|
|
975
1051
|
path: chunkData.path ?? "",
|
|
1052
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
976
1053
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
977
1054
|
filePath,
|
|
978
1055
|
tableType: metadata.tableType ?? chunkData.tableType,
|
|
@@ -1181,7 +1258,7 @@ var Knowhere = class {
|
|
|
1181
1258
|
smartTitleParse: params.smartTitleParse,
|
|
1182
1259
|
summaryImage: params.summaryImage,
|
|
1183
1260
|
summaryTable: params.summaryTable,
|
|
1184
|
-
summaryTxt: params.
|
|
1261
|
+
summaryTxt: params.summaryTxt,
|
|
1185
1262
|
addFragDesc: params.addFragDesc,
|
|
1186
1263
|
kbDir: params.kbDir
|
|
1187
1264
|
};
|
package/dist/index.mjs
CHANGED
|
@@ -781,11 +781,39 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
781
781
|
const hierarchyContent = await hierarchyFile.async("string");
|
|
782
782
|
hierarchy = JSON.parse(hierarchyContent);
|
|
783
783
|
}
|
|
784
|
+
let chunksSlim;
|
|
785
|
+
const chunksSlimFile = zip.file("chunks_slim.json");
|
|
786
|
+
if (chunksSlimFile) {
|
|
787
|
+
const chunksSlimContent = await chunksSlimFile.async("string");
|
|
788
|
+
let chunksSlimData = JSON.parse(chunksSlimContent);
|
|
789
|
+
chunksSlimData = keysToCamel(chunksSlimData);
|
|
790
|
+
chunksSlim = extractSlimChunks(chunksSlimData);
|
|
791
|
+
}
|
|
792
|
+
let tocHierarchies;
|
|
793
|
+
const tocHierarchiesFile = zip.file("toc_hierarchies.json");
|
|
794
|
+
if (tocHierarchiesFile) {
|
|
795
|
+
const tocHierarchiesContent = await tocHierarchiesFile.async("string");
|
|
796
|
+
tocHierarchies = keysToCamel(JSON.parse(tocHierarchiesContent));
|
|
797
|
+
}
|
|
798
|
+
let kbCsv;
|
|
799
|
+
const kbCsvFile = zip.file("kb.csv");
|
|
800
|
+
if (kbCsvFile) {
|
|
801
|
+
kbCsv = await kbCsvFile.async("string");
|
|
802
|
+
}
|
|
803
|
+
let hierarchyViewHtml;
|
|
804
|
+
const hierarchyViewFile = zip.file("hierarchy_view.html");
|
|
805
|
+
if (hierarchyViewFile) {
|
|
806
|
+
hierarchyViewHtml = await hierarchyViewFile.async("string");
|
|
807
|
+
}
|
|
784
808
|
const result = {
|
|
785
809
|
manifest,
|
|
786
810
|
chunks,
|
|
811
|
+
chunksSlim,
|
|
787
812
|
fullMarkdown,
|
|
788
813
|
hierarchy,
|
|
814
|
+
tocHierarchies,
|
|
815
|
+
kbCsv,
|
|
816
|
+
hierarchyViewHtml,
|
|
789
817
|
rawZip: zipBuffer,
|
|
790
818
|
get textChunks() {
|
|
791
819
|
return chunks.filter((c) => c.type === "text");
|
|
@@ -809,12 +837,30 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
809
837
|
await fs2.mkdir(directory, { recursive: true });
|
|
810
838
|
await fs2.writeFile(join(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
|
|
811
839
|
await fs2.writeFile(join(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
|
|
840
|
+
if (chunksSlim) {
|
|
841
|
+
await fs2.writeFile(
|
|
842
|
+
join(directory, "chunks_slim.json"),
|
|
843
|
+
JSON.stringify({ chunks: chunksSlim }, null, 2)
|
|
844
|
+
);
|
|
845
|
+
}
|
|
812
846
|
if (fullMarkdown) {
|
|
813
847
|
await fs2.writeFile(join(directory, "full.md"), fullMarkdown);
|
|
814
848
|
}
|
|
815
849
|
if (hierarchy) {
|
|
816
850
|
await fs2.writeFile(join(directory, "hierarchy.json"), JSON.stringify(hierarchy, null, 2));
|
|
817
851
|
}
|
|
852
|
+
if (tocHierarchies) {
|
|
853
|
+
await fs2.writeFile(
|
|
854
|
+
join(directory, "toc_hierarchies.json"),
|
|
855
|
+
JSON.stringify(tocHierarchies, null, 2)
|
|
856
|
+
);
|
|
857
|
+
}
|
|
858
|
+
if (kbCsv) {
|
|
859
|
+
await fs2.writeFile(join(directory, "kb.csv"), kbCsv);
|
|
860
|
+
}
|
|
861
|
+
if (hierarchyViewHtml) {
|
|
862
|
+
await fs2.writeFile(join(directory, "hierarchy_view.html"), hierarchyViewHtml);
|
|
863
|
+
}
|
|
818
864
|
for (const imageChunk of this.imageChunks) {
|
|
819
865
|
await imageChunk.save(directory);
|
|
820
866
|
}
|
|
@@ -836,6 +882,15 @@ function extractChunks(payload) {
|
|
|
836
882
|
}
|
|
837
883
|
return [];
|
|
838
884
|
}
|
|
885
|
+
function extractSlimChunks(payload) {
|
|
886
|
+
if (Array.isArray(payload)) {
|
|
887
|
+
return payload;
|
|
888
|
+
}
|
|
889
|
+
if (Array.isArray(payload.chunks)) {
|
|
890
|
+
return payload.chunks;
|
|
891
|
+
}
|
|
892
|
+
return [];
|
|
893
|
+
}
|
|
839
894
|
function getChunkMetadata(chunkData) {
|
|
840
895
|
if (!chunkData.metadata) {
|
|
841
896
|
return {};
|
|
@@ -846,18 +901,38 @@ function getChunkFilePath(chunkData) {
|
|
|
846
901
|
const metadata = getChunkMetadata(chunkData);
|
|
847
902
|
return chunkData.filePath ?? metadata.filePath ?? chunkData.path;
|
|
848
903
|
}
|
|
904
|
+
function normalizePageNums(pageNums) {
|
|
905
|
+
if (!Array.isArray(pageNums)) {
|
|
906
|
+
return void 0;
|
|
907
|
+
}
|
|
908
|
+
const normalized = pageNums.filter((pageNum) => typeof pageNum === "number");
|
|
909
|
+
return normalized.length > 0 ? normalized : void 0;
|
|
910
|
+
}
|
|
911
|
+
function normalizeTokens(tokens) {
|
|
912
|
+
if (!Array.isArray(tokens)) {
|
|
913
|
+
return void 0;
|
|
914
|
+
}
|
|
915
|
+
if (!tokens.every((token) => typeof token === "string")) {
|
|
916
|
+
return void 0;
|
|
917
|
+
}
|
|
918
|
+
return tokens;
|
|
919
|
+
}
|
|
849
920
|
function normalizeTextChunk(chunkData) {
|
|
850
921
|
const metadata = getChunkMetadata(chunkData);
|
|
922
|
+
const connectTo = metadata.connectTo ?? chunkData.connectTo;
|
|
923
|
+
const relationships = metadata.relationships ?? chunkData.relationships;
|
|
851
924
|
return {
|
|
852
925
|
chunkId: chunkData.chunkId ?? "",
|
|
853
926
|
type: "text",
|
|
854
927
|
content: chunkData.content ?? "",
|
|
855
928
|
path: chunkData.path ?? "",
|
|
929
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
856
930
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
857
|
-
tokens: metadata.tokens ?? chunkData.tokens,
|
|
931
|
+
tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
|
|
858
932
|
keywords: metadata.keywords ?? chunkData.keywords,
|
|
859
933
|
summary: metadata.summary ?? chunkData.summary,
|
|
860
|
-
|
|
934
|
+
...connectTo !== void 0 && { connectTo },
|
|
935
|
+
...relationships !== void 0 && { relationships }
|
|
861
936
|
};
|
|
862
937
|
}
|
|
863
938
|
async function processChunk(zip, chunkData) {
|
|
@@ -881,6 +956,7 @@ async function processChunk(zip, chunkData) {
|
|
|
881
956
|
type: "image",
|
|
882
957
|
content: chunkData.content ?? "",
|
|
883
958
|
path: chunkData.path ?? "",
|
|
959
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
884
960
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
885
961
|
filePath,
|
|
886
962
|
summary: metadata.summary ?? chunkData.summary,
|
|
@@ -915,6 +991,7 @@ async function processChunk(zip, chunkData) {
|
|
|
915
991
|
type: "table",
|
|
916
992
|
content: chunkData.content ?? "",
|
|
917
993
|
path: chunkData.path ?? "",
|
|
994
|
+
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
918
995
|
length: metadata.length ?? chunkData.length ?? 0,
|
|
919
996
|
filePath,
|
|
920
997
|
tableType: metadata.tableType ?? chunkData.tableType,
|
|
@@ -1123,7 +1200,7 @@ var Knowhere = class {
|
|
|
1123
1200
|
smartTitleParse: params.smartTitleParse,
|
|
1124
1201
|
summaryImage: params.summaryImage,
|
|
1125
1202
|
summaryTable: params.summaryTable,
|
|
1126
|
-
summaryTxt: params.
|
|
1203
|
+
summaryTxt: params.summaryTxt,
|
|
1127
1204
|
addFragDesc: params.addFragDesc,
|
|
1128
1205
|
kbDir: params.kbDir
|
|
1129
1206
|
};
|