@ontos-ai/knowhere-sdk 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/dist/index.d.mts +84 -38
- package/dist/index.d.ts +84 -38
- package/dist/index.js +22 -48
- package/dist/index.mjs +22 -48
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -47,11 +47,11 @@ console.log(`Found ${result.textChunks.length} text chunks`);
|
|
|
47
47
|
console.log(`Found ${result.imageChunks.length} images`);
|
|
48
48
|
console.log(`Found ${result.tableChunks.length} tables`);
|
|
49
49
|
|
|
50
|
-
// Work with chunks
|
|
50
|
+
// Work with chunks — worker metadata is in chunk.metadata
|
|
51
51
|
result.textChunks.forEach((chunk) => {
|
|
52
52
|
console.log(chunk.content);
|
|
53
|
-
console.log(chunk.keywords);
|
|
54
|
-
console.log(chunk.summary);
|
|
53
|
+
console.log(chunk.metadata.keywords);
|
|
54
|
+
console.log(chunk.metadata.summary);
|
|
55
55
|
});
|
|
56
56
|
|
|
57
57
|
// Save results to disk
|
|
@@ -188,12 +188,17 @@ if (!documentId) {
|
|
|
188
188
|
|
|
189
189
|
console.log(documentId);
|
|
190
190
|
|
|
191
|
+
// Agentic mode (LLM navigation + answer synthesis)
|
|
191
192
|
const response = await client.retrieval.query({
|
|
192
193
|
namespace: 'support-center',
|
|
193
194
|
query: 'How do I reset Bluetooth pairing?',
|
|
194
195
|
topK: 5,
|
|
196
|
+
useAgentic: true,
|
|
195
197
|
});
|
|
196
198
|
|
|
199
|
+
console.log(response.answerText); // LLM-generated answer
|
|
200
|
+
console.log(response.referencedChunks); // cited evidence chunks
|
|
201
|
+
|
|
197
202
|
for (const result of response.results) {
|
|
198
203
|
console.log(result.content);
|
|
199
204
|
console.log(result.score);
|
package/dist/index.d.mts
CHANGED
|
@@ -332,7 +332,6 @@ interface Manifest {
|
|
|
332
332
|
dataId?: string;
|
|
333
333
|
/** Original source file name */
|
|
334
334
|
sourceFileName: string;
|
|
335
|
-
/** Processing completion date */
|
|
336
335
|
/** Processing completion date (optional: only present if emitted by the worker) */
|
|
337
336
|
processingDate?: Date;
|
|
338
337
|
/** Worker-side processing metadata emitted by manifest v2 */
|
|
@@ -341,6 +340,13 @@ interface Manifest {
|
|
|
341
340
|
statistics: Statistics;
|
|
342
341
|
/** Legacy file index from earlier ZIP manifests */
|
|
343
342
|
files?: FileIndex;
|
|
343
|
+
/**
|
|
344
|
+
* Document hierarchy emitted by the current worker.
|
|
345
|
+
*
|
|
346
|
+
* The key remains all-caps at runtime because ``keysToCamel()`` only
|
|
347
|
+
* transforms snake_case keys.
|
|
348
|
+
*/
|
|
349
|
+
HIERARCHY?: Record<string, unknown>;
|
|
344
350
|
}
|
|
345
351
|
/**
|
|
346
352
|
* Chunk relationship entry (metadata.connect_to per schema v2.1)
|
|
@@ -357,6 +363,58 @@ interface ConnectTo {
|
|
|
357
363
|
/** Shared keywords (related only) */
|
|
358
364
|
keywords?: string[];
|
|
359
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* A single image or table resource entry in ``doc_nav.json``.
|
|
368
|
+
*/
|
|
369
|
+
interface DocNavResourceItem {
|
|
370
|
+
path: string;
|
|
371
|
+
summary?: string;
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Image and table resource summaries from ``doc_nav.json``.
|
|
375
|
+
*/
|
|
376
|
+
interface DocNavResources {
|
|
377
|
+
images: DocNavResourceItem[];
|
|
378
|
+
tables: DocNavResourceItem[];
|
|
379
|
+
}
|
|
380
|
+
/**
|
|
381
|
+
* A document section in the ``doc_nav.json`` navigation tree.
|
|
382
|
+
*/
|
|
383
|
+
interface DocNavSection {
|
|
384
|
+
title: string;
|
|
385
|
+
path: string;
|
|
386
|
+
level: number;
|
|
387
|
+
summary?: string;
|
|
388
|
+
chunkCount: number;
|
|
389
|
+
children: DocNavSection[];
|
|
390
|
+
}
|
|
391
|
+
/**
|
|
392
|
+
* Top-level document navigation structure from ``doc_nav.json``.
|
|
393
|
+
*/
|
|
394
|
+
interface DocNav {
|
|
395
|
+
sections: DocNavSection[];
|
|
396
|
+
resources?: DocNavResources;
|
|
397
|
+
}
|
|
398
|
+
/**
|
|
399
|
+
* Known worker metadata fields for a chunk.
|
|
400
|
+
*
|
|
401
|
+
* All fields are optional. Unknown fields added by future worker
|
|
402
|
+
* versions are accessible through the index signature.
|
|
403
|
+
*/
|
|
404
|
+
interface ChunkMetadata {
|
|
405
|
+
length?: number;
|
|
406
|
+
pageNums?: number[];
|
|
407
|
+
tokens?: string[];
|
|
408
|
+
keywords?: string[];
|
|
409
|
+
summary?: string;
|
|
410
|
+
connectTo?: ConnectTo[];
|
|
411
|
+
filePath?: string;
|
|
412
|
+
originalName?: string;
|
|
413
|
+
tableType?: string;
|
|
414
|
+
documentTopSummary?: string;
|
|
415
|
+
/** Allow forward-compatible access to unknown fields. */
|
|
416
|
+
[key: string]: unknown;
|
|
417
|
+
}
|
|
360
418
|
/**
|
|
361
419
|
* Base chunk properties
|
|
362
420
|
*/
|
|
@@ -369,50 +427,30 @@ interface BaseChunk {
|
|
|
369
427
|
content: string;
|
|
370
428
|
/** Relative path in ZIP */
|
|
371
429
|
path: string;
|
|
372
|
-
/**
|
|
373
|
-
|
|
430
|
+
/** Worker metadata for this chunk */
|
|
431
|
+
metadata: ChunkMetadata;
|
|
374
432
|
}
|
|
375
433
|
/**
|
|
376
|
-
* Minimal chunk representation emitted in chunks_slim.json
|
|
434
|
+
* Minimal chunk representation emitted in chunks_slim.json (legacy).
|
|
377
435
|
*/
|
|
378
436
|
interface SlimChunk {
|
|
379
437
|
type: 'text' | 'image' | 'table';
|
|
380
438
|
path: string;
|
|
381
439
|
content: string;
|
|
382
|
-
summary?: string;
|
|
383
440
|
}
|
|
384
441
|
/**
|
|
385
442
|
* Text chunk
|
|
386
443
|
*/
|
|
387
444
|
interface TextChunk extends BaseChunk {
|
|
388
445
|
type: 'text';
|
|
389
|
-
/** Content length */
|
|
390
|
-
length: number;
|
|
391
|
-
/** Extracted tokens from the current backend payload */
|
|
392
|
-
tokens?: string[];
|
|
393
|
-
/** Extracted keywords */
|
|
394
|
-
keywords?: string[];
|
|
395
|
-
/** Generated summary */
|
|
396
|
-
summary?: string;
|
|
397
|
-
/** Chunk relationships (schema v2.1: metadata.connect_to) */
|
|
398
|
-
connectTo?: ConnectTo[];
|
|
399
|
-
/**
|
|
400
|
-
* @deprecated Use connectTo instead. Retained for backward compatibility.
|
|
401
|
-
* Previously populated from metadata.relationships which is no longer emitted by the API.
|
|
402
|
-
*/
|
|
403
|
-
relationships?: string[];
|
|
404
446
|
}
|
|
405
447
|
/**
|
|
406
448
|
* Image chunk
|
|
407
449
|
*/
|
|
408
450
|
interface ImageChunk extends BaseChunk {
|
|
409
451
|
type: 'image';
|
|
410
|
-
/** Content length */
|
|
411
|
-
length: number;
|
|
412
452
|
/** Relative file path in ZIP */
|
|
413
453
|
filePath: string;
|
|
414
|
-
/** Generated summary */
|
|
415
|
-
summary?: string;
|
|
416
454
|
/** Image data buffer */
|
|
417
455
|
data: Buffer;
|
|
418
456
|
/** Image format (derived from file extension) */
|
|
@@ -425,14 +463,8 @@ interface ImageChunk extends BaseChunk {
|
|
|
425
463
|
*/
|
|
426
464
|
interface TableChunk extends BaseChunk {
|
|
427
465
|
type: 'table';
|
|
428
|
-
/** Content length */
|
|
429
|
-
length: number;
|
|
430
466
|
/** Relative file path in ZIP */
|
|
431
467
|
filePath: string;
|
|
432
|
-
/** Table type */
|
|
433
|
-
tableType?: string;
|
|
434
|
-
/** Generated summary */
|
|
435
|
-
summary?: string;
|
|
436
468
|
/** HTML representation */
|
|
437
469
|
html: string;
|
|
438
470
|
/** Save table HTML to disk */
|
|
@@ -450,20 +482,22 @@ interface ParseResult {
|
|
|
450
482
|
manifest: Manifest;
|
|
451
483
|
/** All chunks */
|
|
452
484
|
chunks: Chunk[];
|
|
453
|
-
/**
|
|
454
|
-
|
|
485
|
+
/** Document navigation tree from doc_nav.json (current worker output) */
|
|
486
|
+
docNav?: DocNav;
|
|
455
487
|
/** Full document as Markdown (if available) */
|
|
456
488
|
fullMarkdown?: string;
|
|
457
|
-
/**
|
|
489
|
+
/** Raw ZIP buffer */
|
|
490
|
+
rawZip: Buffer;
|
|
491
|
+
/** @deprecated Current worker no longer emits chunks_slim.json */
|
|
492
|
+
chunksSlim?: SlimChunk[];
|
|
493
|
+
/** @deprecated Current worker no longer emits hierarchy.json */
|
|
458
494
|
hierarchy?: unknown;
|
|
459
|
-
/** Table-of-contents hierarchy hints (if available) */
|
|
495
|
+
/** @deprecated Table-of-contents hierarchy hints (if available) */
|
|
460
496
|
tocHierarchies?: unknown;
|
|
461
|
-
/** Knowledge-base CSV export (if available) */
|
|
497
|
+
/** @deprecated Knowledge-base CSV export (if available) */
|
|
462
498
|
kbCsv?: string;
|
|
463
|
-
/** Pre-rendered hierarchy HTML view (if available) */
|
|
499
|
+
/** @deprecated Pre-rendered hierarchy HTML view (if available) */
|
|
464
500
|
hierarchyViewHtml?: string;
|
|
465
|
-
/** Raw ZIP buffer */
|
|
466
|
-
rawZip: Buffer;
|
|
467
501
|
/** Text chunks only */
|
|
468
502
|
readonly textChunks: TextChunk[];
|
|
469
503
|
/** Image chunks only */
|
|
@@ -611,6 +645,14 @@ interface RetrievalQueryParams {
|
|
|
611
645
|
namespace?: string;
|
|
612
646
|
/** Maximum number of results to return */
|
|
613
647
|
topK?: number;
|
|
648
|
+
/**
|
|
649
|
+
* Force retrieval mode.
|
|
650
|
+
*
|
|
651
|
+
* - ``true`` — agentic (LLM navigation + answer synthesis)
|
|
652
|
+
* - ``false`` — legacy 3-channel RRF only
|
|
653
|
+
* - ``undefined`` / omitted — server default
|
|
654
|
+
*/
|
|
655
|
+
useAgentic?: boolean;
|
|
614
656
|
/** Chunk type filter: 1=all, 2=text, 3=image, 4=table, 5=text+image, 6=text+table */
|
|
615
657
|
dataType?: 1 | 2 | 3 | 4 | 5 | 6;
|
|
616
658
|
/** Path keywords for include/exclude filtering */
|
|
@@ -668,6 +710,10 @@ interface RetrievalQueryResponse {
|
|
|
668
710
|
query: string;
|
|
669
711
|
/** Retrieval router path used by the API for this query */
|
|
670
712
|
routerUsed?: string;
|
|
713
|
+
/** LLM-generated natural-language answer (agentic mode only) */
|
|
714
|
+
answerText?: string | null;
|
|
715
|
+
/** Cited evidence chunks with asset URLs (agentic mode only) */
|
|
716
|
+
referencedChunks?: Array<Record<string, unknown>> | null;
|
|
671
717
|
/** Ranked retrieval results */
|
|
672
718
|
results: RetrievalResult[];
|
|
673
719
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -332,7 +332,6 @@ interface Manifest {
|
|
|
332
332
|
dataId?: string;
|
|
333
333
|
/** Original source file name */
|
|
334
334
|
sourceFileName: string;
|
|
335
|
-
/** Processing completion date */
|
|
336
335
|
/** Processing completion date (optional: only present if emitted by the worker) */
|
|
337
336
|
processingDate?: Date;
|
|
338
337
|
/** Worker-side processing metadata emitted by manifest v2 */
|
|
@@ -341,6 +340,13 @@ interface Manifest {
|
|
|
341
340
|
statistics: Statistics;
|
|
342
341
|
/** Legacy file index from earlier ZIP manifests */
|
|
343
342
|
files?: FileIndex;
|
|
343
|
+
/**
|
|
344
|
+
* Document hierarchy emitted by the current worker.
|
|
345
|
+
*
|
|
346
|
+
* The key remains all-caps at runtime because ``keysToCamel()`` only
|
|
347
|
+
* transforms snake_case keys.
|
|
348
|
+
*/
|
|
349
|
+
HIERARCHY?: Record<string, unknown>;
|
|
344
350
|
}
|
|
345
351
|
/**
|
|
346
352
|
* Chunk relationship entry (metadata.connect_to per schema v2.1)
|
|
@@ -357,6 +363,58 @@ interface ConnectTo {
|
|
|
357
363
|
/** Shared keywords (related only) */
|
|
358
364
|
keywords?: string[];
|
|
359
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* A single image or table resource entry in ``doc_nav.json``.
|
|
368
|
+
*/
|
|
369
|
+
interface DocNavResourceItem {
|
|
370
|
+
path: string;
|
|
371
|
+
summary?: string;
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Image and table resource summaries from ``doc_nav.json``.
|
|
375
|
+
*/
|
|
376
|
+
interface DocNavResources {
|
|
377
|
+
images: DocNavResourceItem[];
|
|
378
|
+
tables: DocNavResourceItem[];
|
|
379
|
+
}
|
|
380
|
+
/**
|
|
381
|
+
* A document section in the ``doc_nav.json`` navigation tree.
|
|
382
|
+
*/
|
|
383
|
+
interface DocNavSection {
|
|
384
|
+
title: string;
|
|
385
|
+
path: string;
|
|
386
|
+
level: number;
|
|
387
|
+
summary?: string;
|
|
388
|
+
chunkCount: number;
|
|
389
|
+
children: DocNavSection[];
|
|
390
|
+
}
|
|
391
|
+
/**
|
|
392
|
+
* Top-level document navigation structure from ``doc_nav.json``.
|
|
393
|
+
*/
|
|
394
|
+
interface DocNav {
|
|
395
|
+
sections: DocNavSection[];
|
|
396
|
+
resources?: DocNavResources;
|
|
397
|
+
}
|
|
398
|
+
/**
|
|
399
|
+
* Known worker metadata fields for a chunk.
|
|
400
|
+
*
|
|
401
|
+
* All fields are optional. Unknown fields added by future worker
|
|
402
|
+
* versions are accessible through the index signature.
|
|
403
|
+
*/
|
|
404
|
+
interface ChunkMetadata {
|
|
405
|
+
length?: number;
|
|
406
|
+
pageNums?: number[];
|
|
407
|
+
tokens?: string[];
|
|
408
|
+
keywords?: string[];
|
|
409
|
+
summary?: string;
|
|
410
|
+
connectTo?: ConnectTo[];
|
|
411
|
+
filePath?: string;
|
|
412
|
+
originalName?: string;
|
|
413
|
+
tableType?: string;
|
|
414
|
+
documentTopSummary?: string;
|
|
415
|
+
/** Allow forward-compatible access to unknown fields. */
|
|
416
|
+
[key: string]: unknown;
|
|
417
|
+
}
|
|
360
418
|
/**
|
|
361
419
|
* Base chunk properties
|
|
362
420
|
*/
|
|
@@ -369,50 +427,30 @@ interface BaseChunk {
|
|
|
369
427
|
content: string;
|
|
370
428
|
/** Relative path in ZIP */
|
|
371
429
|
path: string;
|
|
372
|
-
/**
|
|
373
|
-
|
|
430
|
+
/** Worker metadata for this chunk */
|
|
431
|
+
metadata: ChunkMetadata;
|
|
374
432
|
}
|
|
375
433
|
/**
|
|
376
|
-
* Minimal chunk representation emitted in chunks_slim.json
|
|
434
|
+
* Minimal chunk representation emitted in chunks_slim.json (legacy).
|
|
377
435
|
*/
|
|
378
436
|
interface SlimChunk {
|
|
379
437
|
type: 'text' | 'image' | 'table';
|
|
380
438
|
path: string;
|
|
381
439
|
content: string;
|
|
382
|
-
summary?: string;
|
|
383
440
|
}
|
|
384
441
|
/**
|
|
385
442
|
* Text chunk
|
|
386
443
|
*/
|
|
387
444
|
interface TextChunk extends BaseChunk {
|
|
388
445
|
type: 'text';
|
|
389
|
-
/** Content length */
|
|
390
|
-
length: number;
|
|
391
|
-
/** Extracted tokens from the current backend payload */
|
|
392
|
-
tokens?: string[];
|
|
393
|
-
/** Extracted keywords */
|
|
394
|
-
keywords?: string[];
|
|
395
|
-
/** Generated summary */
|
|
396
|
-
summary?: string;
|
|
397
|
-
/** Chunk relationships (schema v2.1: metadata.connect_to) */
|
|
398
|
-
connectTo?: ConnectTo[];
|
|
399
|
-
/**
|
|
400
|
-
* @deprecated Use connectTo instead. Retained for backward compatibility.
|
|
401
|
-
* Previously populated from metadata.relationships which is no longer emitted by the API.
|
|
402
|
-
*/
|
|
403
|
-
relationships?: string[];
|
|
404
446
|
}
|
|
405
447
|
/**
|
|
406
448
|
* Image chunk
|
|
407
449
|
*/
|
|
408
450
|
interface ImageChunk extends BaseChunk {
|
|
409
451
|
type: 'image';
|
|
410
|
-
/** Content length */
|
|
411
|
-
length: number;
|
|
412
452
|
/** Relative file path in ZIP */
|
|
413
453
|
filePath: string;
|
|
414
|
-
/** Generated summary */
|
|
415
|
-
summary?: string;
|
|
416
454
|
/** Image data buffer */
|
|
417
455
|
data: Buffer;
|
|
418
456
|
/** Image format (derived from file extension) */
|
|
@@ -425,14 +463,8 @@ interface ImageChunk extends BaseChunk {
|
|
|
425
463
|
*/
|
|
426
464
|
interface TableChunk extends BaseChunk {
|
|
427
465
|
type: 'table';
|
|
428
|
-
/** Content length */
|
|
429
|
-
length: number;
|
|
430
466
|
/** Relative file path in ZIP */
|
|
431
467
|
filePath: string;
|
|
432
|
-
/** Table type */
|
|
433
|
-
tableType?: string;
|
|
434
|
-
/** Generated summary */
|
|
435
|
-
summary?: string;
|
|
436
468
|
/** HTML representation */
|
|
437
469
|
html: string;
|
|
438
470
|
/** Save table HTML to disk */
|
|
@@ -450,20 +482,22 @@ interface ParseResult {
|
|
|
450
482
|
manifest: Manifest;
|
|
451
483
|
/** All chunks */
|
|
452
484
|
chunks: Chunk[];
|
|
453
|
-
/**
|
|
454
|
-
|
|
485
|
+
/** Document navigation tree from doc_nav.json (current worker output) */
|
|
486
|
+
docNav?: DocNav;
|
|
455
487
|
/** Full document as Markdown (if available) */
|
|
456
488
|
fullMarkdown?: string;
|
|
457
|
-
/**
|
|
489
|
+
/** Raw ZIP buffer */
|
|
490
|
+
rawZip: Buffer;
|
|
491
|
+
/** @deprecated Current worker no longer emits chunks_slim.json */
|
|
492
|
+
chunksSlim?: SlimChunk[];
|
|
493
|
+
/** @deprecated Current worker no longer emits hierarchy.json */
|
|
458
494
|
hierarchy?: unknown;
|
|
459
|
-
/** Table-of-contents hierarchy hints (if available) */
|
|
495
|
+
/** @deprecated Table-of-contents hierarchy hints (if available) */
|
|
460
496
|
tocHierarchies?: unknown;
|
|
461
|
-
/** Knowledge-base CSV export (if available) */
|
|
497
|
+
/** @deprecated Knowledge-base CSV export (if available) */
|
|
462
498
|
kbCsv?: string;
|
|
463
|
-
/** Pre-rendered hierarchy HTML view (if available) */
|
|
499
|
+
/** @deprecated Pre-rendered hierarchy HTML view (if available) */
|
|
464
500
|
hierarchyViewHtml?: string;
|
|
465
|
-
/** Raw ZIP buffer */
|
|
466
|
-
rawZip: Buffer;
|
|
467
501
|
/** Text chunks only */
|
|
468
502
|
readonly textChunks: TextChunk[];
|
|
469
503
|
/** Image chunks only */
|
|
@@ -611,6 +645,14 @@ interface RetrievalQueryParams {
|
|
|
611
645
|
namespace?: string;
|
|
612
646
|
/** Maximum number of results to return */
|
|
613
647
|
topK?: number;
|
|
648
|
+
/**
|
|
649
|
+
* Force retrieval mode.
|
|
650
|
+
*
|
|
651
|
+
* - ``true`` — agentic (LLM navigation + answer synthesis)
|
|
652
|
+
* - ``false`` — legacy 3-channel RRF only
|
|
653
|
+
* - ``undefined`` / omitted — server default
|
|
654
|
+
*/
|
|
655
|
+
useAgentic?: boolean;
|
|
614
656
|
/** Chunk type filter: 1=all, 2=text, 3=image, 4=table, 5=text+image, 6=text+table */
|
|
615
657
|
dataType?: 1 | 2 | 3 | 4 | 5 | 6;
|
|
616
658
|
/** Path keywords for include/exclude filtering */
|
|
@@ -668,6 +710,10 @@ interface RetrievalQueryResponse {
|
|
|
668
710
|
query: string;
|
|
669
711
|
/** Retrieval router path used by the API for this query */
|
|
670
712
|
routerUsed?: string;
|
|
713
|
+
/** LLM-generated natural-language answer (agentic mode only) */
|
|
714
|
+
answerText?: string | null;
|
|
715
|
+
/** Cited evidence chunks with asset URLs (agentic mode only) */
|
|
716
|
+
referencedChunks?: Array<Record<string, unknown>> | null;
|
|
671
717
|
/** Ranked retrieval results */
|
|
672
718
|
results: RetrievalResult[];
|
|
673
719
|
}
|
package/dist/index.js
CHANGED
|
@@ -844,6 +844,13 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
844
844
|
if (fullMdFile) {
|
|
845
845
|
fullMarkdown = await fullMdFile.async("string");
|
|
846
846
|
}
|
|
847
|
+
let docNav;
|
|
848
|
+
const docNavFile = zip.file("doc_nav.json");
|
|
849
|
+
if (docNavFile) {
|
|
850
|
+
const docNavContent = await docNavFile.async("string");
|
|
851
|
+
const rawDocNav = JSON.parse(docNavContent);
|
|
852
|
+
docNav = keysToCamel(rawDocNav);
|
|
853
|
+
}
|
|
847
854
|
let hierarchy;
|
|
848
855
|
const hierarchyFile = zip.file("hierarchy.json");
|
|
849
856
|
if (hierarchyFile) {
|
|
@@ -877,13 +884,15 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
877
884
|
const result = {
|
|
878
885
|
manifest,
|
|
879
886
|
chunks,
|
|
880
|
-
chunksSlim,
|
887
|
+
docNav,
|
|
881
888
|
fullMarkdown,
|
|
889
|
+
rawZip: zipBuffer,
|
|
890
|
+
// Legacy
|
|
891
|
+
chunksSlim,
|
|
882
892
|
hierarchy,
|
|
883
893
|
tocHierarchies,
|
|
884
894
|
kbCsv,
|
|
885
895
|
hierarchyViewHtml,
|
|
886
|
-
rawZip: zipBuffer,
|
|
887
896
|
get textChunks() {
|
|
888
897
|
return chunks.filter((c) => c.type === "text");
|
|
889
898
|
},
|
|
@@ -905,6 +914,9 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
905
914
|
async save(directory) {
|
|
906
915
|
await import_fs2.promises.mkdir(directory, { recursive: true });
|
|
907
916
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
|
|
917
|
+
if (docNav) {
|
|
918
|
+
await import_fs2.promises.writeFile((0, import_path.join)(directory, "doc_nav.json"), JSON.stringify(docNav, null, 2));
|
|
919
|
+
}
|
|
908
920
|
await import_fs2.promises.writeFile((0, import_path.join)(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
|
|
909
921
|
if (chunksSlim) {
|
|
910
922
|
await import_fs2.promises.writeFile(
|
|
@@ -960,56 +972,24 @@ function extractSlimChunks(payload) {
|
|
|
960
972
|
}
|
|
961
973
|
return [];
|
|
962
974
|
}
|
|
963
|
-
function getChunkMetadata(chunkData) {
|
|
964
|
-
if (!chunkData.metadata) {
|
|
965
|
-
return {};
|
|
966
|
-
}
|
|
967
|
-
return chunkData.metadata;
|
|
968
|
-
}
|
|
969
975
|
function getChunkFilePath(chunkData) {
|
|
970
|
-
const metadata = getChunkMetadata(chunkData);
|
|
971
|
-
return chunkData.filePath ?? metadata
|
|
972
|
-
}
|
|
973
|
-
function normalizePageNums(pageNums) {
|
|
974
|
-
if (!Array.isArray(pageNums)) {
|
|
975
|
-
return void 0;
|
|
976
|
-
}
|
|
977
|
-
const normalized = pageNums.filter((pageNum) => typeof pageNum === "number");
|
|
978
|
-
return normalized.length > 0 ? normalized : void 0;
|
|
979
|
-
}
|
|
980
|
-
function normalizeTokens(tokens) {
|
|
981
|
-
if (!Array.isArray(tokens)) {
|
|
982
|
-
return void 0;
|
|
983
|
-
}
|
|
984
|
-
if (!tokens.every((token) => typeof token === "string")) {
|
|
985
|
-
return void 0;
|
|
986
|
-
}
|
|
987
|
-
return tokens;
|
|
976
|
+
const metadata = chunkData.metadata;
|
|
977
|
+
return chunkData.filePath ?? metadata?.filePath ?? chunkData.path;
|
|
988
978
|
}
|
|
989
|
-
function
|
|
990
|
-
const metadata = getChunkMetadata(chunkData);
|
|
991
|
-
const connectTo = metadata.connectTo ?? chunkData.connectTo;
|
|
992
|
-
const relationships = metadata.relationships ?? chunkData.relationships;
|
|
979
|
+
function buildTextChunk(chunkData) {
|
|
993
980
|
return {
|
|
994
981
|
chunkId: chunkData.chunkId ?? "",
|
|
995
982
|
type: "text",
|
|
996
983
|
content: chunkData.content ?? "",
|
|
997
984
|
path: chunkData.path ?? "",
|
|
998
|
-
|
|
999
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
1000
|
-
tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
|
|
1001
|
-
keywords: metadata.keywords ?? chunkData.keywords,
|
|
1002
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
1003
|
-
...connectTo !== void 0 && { connectTo },
|
|
1004
|
-
...relationships !== void 0 && { relationships }
|
|
985
|
+
metadata: chunkData.metadata ?? {}
|
|
1005
986
|
};
|
|
1006
987
|
}
|
|
1007
988
|
async function processChunk(zip, chunkData) {
|
|
1008
989
|
if (chunkData.type === "text") {
|
|
1009
|
-
return
|
|
990
|
+
return buildTextChunk(chunkData);
|
|
1010
991
|
}
|
|
1011
992
|
if (chunkData.type === "image") {
|
|
1012
|
-
const metadata = getChunkMetadata(chunkData);
|
|
1013
993
|
const filePath = getChunkFilePath(chunkData);
|
|
1014
994
|
if (!filePath) {
|
|
1015
995
|
throw new KnowhereError(`Image chunk missing file path: ${chunkData.chunkId ?? "unknown"}`);
|
|
@@ -1025,11 +1005,9 @@ async function processChunk(zip, chunkData) {
|
|
|
1025
1005
|
type: "image",
|
|
1026
1006
|
content: chunkData.content ?? "",
|
|
1027
1007
|
path: chunkData.path ?? "",
|
|
1028
|
-
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
1029
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
1030
1008
|
filePath,
|
|
1031
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
1032
1009
|
data: imageBuffer,
|
|
1010
|
+
metadata: chunkData.metadata ?? {},
|
|
1033
1011
|
get format() {
|
|
1034
1012
|
return getFileExtension(this.filePath);
|
|
1035
1013
|
},
|
|
@@ -1044,7 +1022,6 @@ async function processChunk(zip, chunkData) {
|
|
|
1044
1022
|
return enrichedChunk;
|
|
1045
1023
|
}
|
|
1046
1024
|
if (chunkData.type === "table") {
|
|
1047
|
-
const metadata = getChunkMetadata(chunkData);
|
|
1048
1025
|
const filePath = getChunkFilePath(chunkData);
|
|
1049
1026
|
if (!filePath) {
|
|
1050
1027
|
throw new KnowhereError(`Table chunk missing file path: ${chunkData.chunkId ?? "unknown"}`);
|
|
@@ -1060,12 +1037,9 @@ async function processChunk(zip, chunkData) {
|
|
|
1060
1037
|
type: "table",
|
|
1061
1038
|
content: chunkData.content ?? "",
|
|
1062
1039
|
path: chunkData.path ?? "",
|
|
1063
|
-
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
1064
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
1065
1040
|
filePath,
|
|
1066
|
-
tableType: metadata.tableType ?? chunkData.tableType,
|
|
1067
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
1068
1041
|
html,
|
|
1042
|
+
metadata: chunkData.metadata ?? {},
|
|
1069
1043
|
async save(directory) {
|
|
1070
1044
|
const outputPath = (0, import_path.join)(directory, sanitizePath(this.filePath));
|
|
1071
1045
|
const outputDir = (0, import_path.dirname)(outputPath);
|
|
@@ -1076,7 +1050,7 @@ async function processChunk(zip, chunkData) {
|
|
|
1076
1050
|
};
|
|
1077
1051
|
return enrichedChunk;
|
|
1078
1052
|
}
|
|
1079
|
-
return
|
|
1053
|
+
return buildTextChunk(chunkData);
|
|
1080
1054
|
}
|
|
1081
1055
|
|
|
1082
1056
|
// src/resources/jobs.ts
|
package/dist/index.mjs
CHANGED
|
@@ -784,6 +784,13 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
784
784
|
if (fullMdFile) {
|
|
785
785
|
fullMarkdown = await fullMdFile.async("string");
|
|
786
786
|
}
|
|
787
|
+
let docNav;
|
|
788
|
+
const docNavFile = zip.file("doc_nav.json");
|
|
789
|
+
if (docNavFile) {
|
|
790
|
+
const docNavContent = await docNavFile.async("string");
|
|
791
|
+
const rawDocNav = JSON.parse(docNavContent);
|
|
792
|
+
docNav = keysToCamel(rawDocNav);
|
|
793
|
+
}
|
|
787
794
|
let hierarchy;
|
|
788
795
|
const hierarchyFile = zip.file("hierarchy.json");
|
|
789
796
|
if (hierarchyFile) {
|
|
@@ -817,13 +824,15 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
817
824
|
const result = {
|
|
818
825
|
manifest,
|
|
819
826
|
chunks,
|
|
820
|
-
chunksSlim,
|
827
|
+
docNav,
|
|
821
828
|
fullMarkdown,
|
|
829
|
+
rawZip: zipBuffer,
|
|
830
|
+
// Legacy
|
|
831
|
+
chunksSlim,
|
|
822
832
|
hierarchy,
|
|
823
833
|
tocHierarchies,
|
|
824
834
|
kbCsv,
|
|
825
835
|
hierarchyViewHtml,
|
|
826
|
-
rawZip: zipBuffer,
|
|
827
836
|
get textChunks() {
|
|
828
837
|
return chunks.filter((c) => c.type === "text");
|
|
829
838
|
},
|
|
@@ -845,6 +854,9 @@ async function parseResult(httpClient, resultUrl, options) {
|
|
|
845
854
|
async save(directory) {
|
|
846
855
|
await fs2.mkdir(directory, { recursive: true });
|
|
847
856
|
await fs2.writeFile(join(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
|
|
857
|
+
if (docNav) {
|
|
858
|
+
await fs2.writeFile(join(directory, "doc_nav.json"), JSON.stringify(docNav, null, 2));
|
|
859
|
+
}
|
|
848
860
|
await fs2.writeFile(join(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
|
|
849
861
|
if (chunksSlim) {
|
|
850
862
|
await fs2.writeFile(
|
|
@@ -900,56 +912,24 @@ function extractSlimChunks(payload) {
|
|
|
900
912
|
}
|
|
901
913
|
return [];
|
|
902
914
|
}
|
|
903
|
-
function getChunkMetadata(chunkData) {
|
|
904
|
-
if (!chunkData.metadata) {
|
|
905
|
-
return {};
|
|
906
|
-
}
|
|
907
|
-
return chunkData.metadata;
|
|
908
|
-
}
|
|
909
915
|
function getChunkFilePath(chunkData) {
|
|
910
|
-
const metadata = getChunkMetadata(chunkData);
|
|
911
|
-
return chunkData.filePath ?? metadata
|
|
912
|
-
}
|
|
913
|
-
function normalizePageNums(pageNums) {
|
|
914
|
-
if (!Array.isArray(pageNums)) {
|
|
915
|
-
return void 0;
|
|
916
|
-
}
|
|
917
|
-
const normalized = pageNums.filter((pageNum) => typeof pageNum === "number");
|
|
918
|
-
return normalized.length > 0 ? normalized : void 0;
|
|
919
|
-
}
|
|
920
|
-
function normalizeTokens(tokens) {
|
|
921
|
-
if (!Array.isArray(tokens)) {
|
|
922
|
-
return void 0;
|
|
923
|
-
}
|
|
924
|
-
if (!tokens.every((token) => typeof token === "string")) {
|
|
925
|
-
return void 0;
|
|
926
|
-
}
|
|
927
|
-
return tokens;
|
|
916
|
+
const metadata = chunkData.metadata;
|
|
917
|
+
return chunkData.filePath ?? metadata?.filePath ?? chunkData.path;
|
|
928
918
|
}
|
|
929
|
-
function
|
|
930
|
-
const metadata = getChunkMetadata(chunkData);
|
|
931
|
-
const connectTo = metadata.connectTo ?? chunkData.connectTo;
|
|
932
|
-
const relationships = metadata.relationships ?? chunkData.relationships;
|
|
919
|
+
function buildTextChunk(chunkData) {
|
|
933
920
|
return {
|
|
934
921
|
chunkId: chunkData.chunkId ?? "",
|
|
935
922
|
type: "text",
|
|
936
923
|
content: chunkData.content ?? "",
|
|
937
924
|
path: chunkData.path ?? "",
|
|
938
|
-
|
|
939
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
940
|
-
tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
|
|
941
|
-
keywords: metadata.keywords ?? chunkData.keywords,
|
|
942
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
943
|
-
...connectTo !== void 0 && { connectTo },
|
|
944
|
-
...relationships !== void 0 && { relationships }
|
|
925
|
+
metadata: chunkData.metadata ?? {}
|
|
945
926
|
};
|
|
946
927
|
}
|
|
947
928
|
async function processChunk(zip, chunkData) {
|
|
948
929
|
if (chunkData.type === "text") {
|
|
949
|
-
return
|
|
930
|
+
return buildTextChunk(chunkData);
|
|
950
931
|
}
|
|
951
932
|
if (chunkData.type === "image") {
|
|
952
|
-
const metadata = getChunkMetadata(chunkData);
|
|
953
933
|
const filePath = getChunkFilePath(chunkData);
|
|
954
934
|
if (!filePath) {
|
|
955
935
|
throw new KnowhereError(`Image chunk missing file path: ${chunkData.chunkId ?? "unknown"}`);
|
|
@@ -965,11 +945,9 @@ async function processChunk(zip, chunkData) {
|
|
|
965
945
|
type: "image",
|
|
966
946
|
content: chunkData.content ?? "",
|
|
967
947
|
path: chunkData.path ?? "",
|
|
968
|
-
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
969
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
970
948
|
filePath,
|
|
971
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
972
949
|
data: imageBuffer,
|
|
950
|
+
metadata: chunkData.metadata ?? {},
|
|
973
951
|
get format() {
|
|
974
952
|
return getFileExtension(this.filePath);
|
|
975
953
|
},
|
|
@@ -984,7 +962,6 @@ async function processChunk(zip, chunkData) {
|
|
|
984
962
|
return enrichedChunk;
|
|
985
963
|
}
|
|
986
964
|
if (chunkData.type === "table") {
|
|
987
|
-
const metadata = getChunkMetadata(chunkData);
|
|
988
965
|
const filePath = getChunkFilePath(chunkData);
|
|
989
966
|
if (!filePath) {
|
|
990
967
|
throw new KnowhereError(`Table chunk missing file path: ${chunkData.chunkId ?? "unknown"}`);
|
|
@@ -1000,12 +977,9 @@ async function processChunk(zip, chunkData) {
|
|
|
1000
977
|
type: "table",
|
|
1001
978
|
content: chunkData.content ?? "",
|
|
1002
979
|
path: chunkData.path ?? "",
|
|
1003
|
-
pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
|
|
1004
|
-
length: metadata.length ?? chunkData.length ?? 0,
|
|
1005
980
|
filePath,
|
|
1006
|
-
tableType: metadata.tableType ?? chunkData.tableType,
|
|
1007
|
-
summary: metadata.summary ?? chunkData.summary,
|
|
1008
981
|
html,
|
|
982
|
+
metadata: chunkData.metadata ?? {},
|
|
1009
983
|
async save(directory) {
|
|
1010
984
|
const outputPath = join(directory, sanitizePath(this.filePath));
|
|
1011
985
|
const outputDir = dirname(outputPath);
|
|
@@ -1016,7 +990,7 @@ async function processChunk(zip, chunkData) {
|
|
|
1016
990
|
};
|
|
1017
991
|
return enrichedChunk;
|
|
1018
992
|
}
|
|
1019
|
-
return
|
|
993
|
+
return buildTextChunk(chunkData);
|
|
1020
994
|
}
|
|
1021
995
|
|
|
1022
996
|
// src/resources/jobs.ts
|