@ontos-ai/knowhere-sdk 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -163,6 +163,83 @@ const jobResult = await client.jobs.wait(job.jobId, {
163
163
  const result = await client.jobs.load(jobResult);
164
164
  ```
165
165
 
166
+ ### Retrieval and Document Lifecycle
167
+
168
+ Published documents are queryable through the retrieval API after a job
169
+ finishes. `client.jobs.create(...)` does not return a usable `documentId`;
170
+ persist `jobResult.documentId` (from `client.jobs.wait(...)`) after publication if you need to update or
171
+ archive the same document later.
172
+
173
+ ```typescript
174
+ const job = await client.jobs.create({
175
+ sourceType: 'url',
176
+ sourceUrl: 'https://example.com/manual.pdf',
177
+ namespace: 'support-center',
178
+ });
179
+
180
+ const jobResult = await client.jobs.wait(job.jobId);
181
+ const documentId = jobResult.documentId;
182
+
183
+ if (!documentId) {
184
+ throw new Error('Expected documentId after successful publication.');
185
+ }
186
+
187
+ console.log(documentId);
188
+
189
+ const response = await client.retrieval.query({
190
+ namespace: 'support-center',
191
+ query: 'How do I reset Bluetooth pairing?',
192
+ topK: 5,
193
+ });
194
+
195
+ for (const result of response.results) {
196
+ console.log(result.content);
197
+ console.log(result.score);
198
+ console.log(result.source.sourceFileName, result.source.sectionPath);
199
+ }
200
+ ```
201
+
202
+ Each retrieval result exposes its content, type, score, and optional asset URL at the top level, plus one canonical `source` object:
203
+
204
+ ```typescript
205
+ result.content;
206
+ result.chunkType;
207
+ result.score;
208
+ result.assetUrl;
209
+ result.source.documentId;
210
+ result.source.sourceFileName;
211
+ result.source.sectionPath;
212
+ ```
213
+
214
+ Use the persisted `documentId` to update, inspect, or archive a document:
215
+
216
+ ```typescript
217
+ const updateJob = await client.jobs.create({
218
+ sourceType: 'url',
219
+ sourceUrl: 'https://example.com/manual-v2.pdf',
220
+ documentId,
221
+ });
222
+
223
+ const documents = await client.documents.list({ namespace: 'support-center' });
224
+ const document = await client.documents.get(documentId);
225
+ const archived = await client.documents.archive(documentId);
226
+
227
+ console.log(documents.documents.length);
228
+ console.log(document.status);
229
+ console.log(archived.status);
230
+ ```
231
+
232
+ Follow-up queries can exclude specific documents or sections; the exclusions apply to that request only:
233
+
234
+ ```typescript
235
+ const followUp = await client.retrieval.query({
236
+ namespace: 'support-center',
237
+ query: 'battery charging',
238
+ excludeDocumentIds: ['doc_old'],
239
+ excludeSections: [{ documentId: 'doc_123', sectionPath: 'Appendix / Legal' }],
240
+ });
241
+ ```
242
+
166
243
  ### Error Handling
167
244
 
168
245
  ```typescript
package/dist/index.d.mts CHANGED
@@ -41,6 +41,8 @@ interface Job {
41
41
  sourceType: string;
42
42
  /** Optional custom data identifier */
43
43
  dataId?: string;
44
+ /** Retrieval namespace for the canonical document */
45
+ namespace?: string;
44
46
  /** Job creation timestamp */
45
47
  createdAt: Date;
46
48
  /** Presigned URL for file upload (if sourceType is 'file') */
@@ -75,6 +77,10 @@ interface JobResult {
75
77
  sourceType: string;
76
78
  /** Optional custom data identifier */
77
79
  dataId?: string;
80
+ /** Retrieval namespace for the canonical document */
81
+ namespace?: string;
82
+ /** Stable document identifier for retrieval/document lifecycle APIs */
83
+ documentId?: string;
78
84
  /** Job creation timestamp */
79
85
  createdAt: Date;
80
86
  /** Processing progress information */
@@ -157,6 +163,10 @@ interface CreateJobParams {
157
163
  fileName?: string;
158
164
  /** Optional custom data identifier */
159
165
  dataId?: string;
166
+ /** Retrieval namespace for the canonical document */
167
+ namespace?: string;
168
+ /** Existing document identifier when updating a published document */
169
+ documentId?: string;
160
170
  /** Parsing configuration */
161
171
  parsingParams?: ParsingParams;
162
172
  /** Webhook configuration */
@@ -216,9 +226,13 @@ interface ParseParams {
216
226
  /** Generate table summaries */
217
227
  summaryTable?: boolean;
218
228
  /** Generate text summaries */
219
- summaryText?: boolean;
229
+ summaryTxt?: boolean;
220
230
  /** Custom data identifier */
221
231
  dataId?: string;
232
+ /** Retrieval namespace for the canonical document */
233
+ namespace?: string;
234
+ /** Existing document identifier when updating a published document */
235
+ documentId?: string;
222
236
  /** Additional fragment description */
223
237
  addFragDesc?: string;
224
238
  /** Knowledge base directory */
@@ -282,6 +296,30 @@ interface Statistics {
282
296
  interface FileIndex {
283
297
  [chunkId: string]: string;
284
298
  }
299
+ /**
300
+ * Processing cost details emitted by manifest v2
301
+ */
302
+ interface ProcessingCost {
303
+ microDollars?: number;
304
+ credits?: number;
305
+ }
306
+ /**
307
+ * Processing timing details emitted by manifest v2
308
+ */
309
+ interface ProcessingTiming {
310
+ startedAt?: Date;
311
+ completedAt?: Date;
312
+ durationMs?: number;
313
+ }
314
+ /**
315
+ * Processing metadata emitted by manifest v2
316
+ */
317
+ interface ProcessingMetadata {
318
+ pageCount?: number;
319
+ billingStatus?: string;
320
+ cost?: ProcessingCost;
321
+ timing?: ProcessingTiming;
322
+ }
285
323
  /**
286
324
  * Manifest containing metadata about the parse result
287
325
  */
@@ -295,11 +333,29 @@ interface Manifest {
295
333
  /** Original source file name */
296
334
  sourceFileName: string;
297
335
  /** Processing completion date */
298
- processingDate: Date;
336
+ /** Processing completion date (optional: only present if emitted by the worker) */
337
+ processingDate?: Date;
338
+ /** Worker-side processing metadata emitted by manifest v2 */
339
+ processing?: ProcessingMetadata;
299
340
  /** Statistics */
300
341
  statistics: Statistics;
301
- /** File index */
302
- files: FileIndex;
342
+ /** Legacy file index from earlier ZIP manifests */
343
+ files?: FileIndex;
344
+ }
345
+ /**
346
+ * Chunk relationship entry (metadata.connect_to per schema v2.1)
347
+ */
348
+ interface ConnectTo {
349
+ /** Target chunk_id */
350
+ target: string;
351
+ /** Relationship type */
352
+ relation: 'embeds' | 'related';
353
+ /** Placeholder ref in content, e.g. '[images/a.png]' (embeds only) */
354
+ ref?: string;
355
+ /** Semantic similarity score (related only) */
356
+ score?: number;
357
+ /** Shared keywords (related only) */
358
+ keywords?: string[];
303
359
  }
304
360
  /**
305
361
  * Base chunk properties
@@ -313,6 +369,17 @@ interface BaseChunk {
313
369
  content: string;
314
370
  /** Relative path in ZIP */
315
371
  path: string;
372
+ /** Page numbers spanned by this chunk when provided by the backend */
373
+ pageNums?: number[];
374
+ }
375
+ /**
376
+ * Minimal chunk representation emitted in chunks_slim.json
377
+ */
378
+ interface SlimChunk {
379
+ type: 'text' | 'image' | 'table';
380
+ path: string;
381
+ content: string;
382
+ summary?: string;
316
383
  }
317
384
  /**
318
385
  * Text chunk
@@ -321,13 +388,18 @@ interface TextChunk extends BaseChunk {
321
388
  type: 'text';
322
389
  /** Content length */
323
390
  length: number;
324
- /** Tokens or token count, depending on backend payload */
325
- tokens?: number | string[];
391
+ /** Extracted tokens from the current backend payload */
392
+ tokens?: string[];
326
393
  /** Extracted keywords */
327
394
  keywords?: string[];
328
395
  /** Generated summary */
329
396
  summary?: string;
330
- /** Related chunk IDs */
397
+ /** Chunk relationships (schema v2.1: metadata.connect_to) */
398
+ connectTo?: ConnectTo[];
399
+ /**
400
+ * @deprecated Use connectTo instead. Retained for backward compatibility.
401
+ * Previously populated from metadata.relationships which is no longer emitted by the API.
402
+ */
331
403
  relationships?: string[];
332
404
  }
333
405
  /**
@@ -378,10 +450,18 @@ interface ParseResult {
378
450
  manifest: Manifest;
379
451
  /** All chunks */
380
452
  chunks: Chunk[];
453
+ /** Minimal chunk projection from chunks_slim.json (if available) */
454
+ chunksSlim?: SlimChunk[];
381
455
  /** Full document as Markdown (if available) */
382
456
  fullMarkdown?: string;
383
457
  /** Document hierarchy (if available) */
384
458
  hierarchy?: unknown;
459
+ /** Table-of-contents hierarchy hints (if available) */
460
+ tocHierarchies?: unknown;
461
+ /** Knowledge-base CSV export (if available) */
462
+ kbCsv?: string;
463
+ /** Pre-rendered hierarchy HTML view (if available) */
464
+ hierarchyViewHtml?: string;
385
465
  /** Raw ZIP buffer */
386
466
  rawZip: Buffer;
387
467
  /** Text chunks only */
@@ -392,6 +472,10 @@ interface ParseResult {
392
472
  readonly tableChunks: TableChunk[];
393
473
  /** Job ID */
394
474
  readonly jobId: string;
475
+ /** Effective retrieval namespace when loaded from a job result */
476
+ namespace?: string;
477
+ /** Canonical document identifier when loaded from a job result */
478
+ documentId?: string;
395
479
  /** Statistics */
396
480
  readonly statistics: Statistics;
397
481
  /** Find a specific chunk by ID */
@@ -500,12 +584,165 @@ declare class Jobs extends BaseResource {
500
584
  private resolveLoadJobResult;
501
585
  }
502
586
 
587
+ /**
588
+ * Section exclusion for follow-up retrieval queries.
589
+ */
590
+ interface RetrievalSectionExclusion {
591
+ /** Document containing the section to exclude */
592
+ documentId: string;
593
+ /** Human-readable section path to exclude */
594
+ sectionPath: string;
595
+ }
596
+ /**
597
+ * Supported retrieval channel names.
598
+ */
599
+ type RetrievalChannel = 'path' | 'content' | 'term';
600
+ /**
601
+ * Path filtering mode for retrieval queries.
602
+ */
603
+ type RetrievalFilterMode = 'delete' | 'keep';
604
+ /**
605
+ * Retrieval query parameters.
606
+ */
607
+ interface RetrievalQueryParams {
608
+ /** Search query text */
609
+ query: string;
610
+ /** Retrieval namespace. Defaults to the server's default namespace when omitted. */
611
+ namespace?: string;
612
+ /** Maximum number of results to return */
613
+ topK?: number;
614
+ /** Chunk type filter: 1=all, 2=text, 3=image, 4=table, 5=text+image, 6=text+table */
615
+ dataType?: 1 | 2 | 3 | 4 | 5 | 6;
616
+ /** Path keywords for include/exclude filtering */
617
+ signalPaths?: string[];
618
+ /** Signal path filter mode */
619
+ filterMode?: RetrievalFilterMode;
620
+ /** Retrieval channels to run. Defaults to all channels when omitted. */
621
+ channels?: RetrievalChannel[];
622
+ /** Per-channel weight overrides for reciprocal-rank fusion */
623
+ channelWeights?: Partial<Record<RetrievalChannel, number>>;
624
+ /** Enable LLM reranking after channel fusion */
625
+ rerank?: boolean;
626
+ /** Minimum retrieval score threshold after fusion */
627
+ threshold?: number;
628
+ /** Override the internal per-channel recall count */
629
+ internalRecallK?: number;
630
+ /** Documents to exclude for this request only */
631
+ excludeDocumentIds?: string[];
632
+ /** Document sections to exclude for this request only */
633
+ excludeSections?: RetrievalSectionExclusion[];
634
+ }
635
+ /**
636
+ * Caller-facing source reference attached to a retrieval result.
637
+ */
638
+ interface RetrievalSource {
639
+ /** Stable document identifier */
640
+ documentId?: string;
641
+ /** Original source file name */
642
+ sourceFileName?: string;
643
+ /** Human-readable section path */
644
+ sectionPath?: string;
645
+ }
646
+ /**
647
+ * Canonical chunk result returned by retrieval query.
648
+ */
649
+ interface RetrievalResult {
650
+ /** Knowledge content to use directly in the caller's answer */
651
+ content: string;
652
+ /** Chunk type, for example text, image, or table */
653
+ chunkType: string;
654
+ /** Retrieval score returned by the API */
655
+ score: number;
656
+ /** Presigned asset URL for media chunks when available */
657
+ assetUrl?: string;
658
+ /** Source reference for this result */
659
+ source: RetrievalSource;
660
+ }
661
+ /**
662
+ * Response from POST /v1/retrieval/query.
663
+ */
664
+ interface RetrievalQueryResponse {
665
+ /** Namespace searched by the API */
666
+ namespace: string;
667
+ /** Echoed query text */
668
+ query: string;
669
+ /** Retrieval router path used by the API for this query */
670
+ routerUsed?: string;
671
+ /** Ranked retrieval results */
672
+ results: RetrievalResult[];
673
+ }
674
+
675
+ /**
676
+ * Resource for querying published retrieval documents.
677
+ */
678
+ declare class Retrieval extends BaseResource {
679
+ /**
680
+ * Query published documents.
681
+ */
682
+ query(params: RetrievalQueryParams): Promise<RetrievalQueryResponse>;
683
+ }
684
+
685
+ /**
686
+ * Canonical document state returned by document lifecycle endpoints.
687
+ */
688
+ interface Document {
689
+ /** Stable document identifier */
690
+ documentId: string;
691
+ /** Retrieval namespace */
692
+ namespace: string;
693
+ /** Current lifecycle status */
694
+ status: string;
695
+ /** Current published job result identifier */
696
+ currentJobResultId?: string;
697
+ /** Original source file name */
698
+ sourceFileName?: string;
699
+ /** Document creation timestamp */
700
+ createdAt?: Date;
701
+ /** Last update timestamp */
702
+ updatedAt?: Date;
703
+ /** Archive timestamp, when archived */
704
+ archivedAt?: Date;
705
+ }
706
+ /**
707
+ * Response from GET /v1/documents.
708
+ */
709
+ interface DocumentListResponse {
710
+ /** Namespace listed by the API */
711
+ namespace: string;
712
+ /** Documents visible in the namespace */
713
+ documents: Document[];
714
+ }
715
+
716
+ /**
717
+ * Resource for canonical document lifecycle operations.
718
+ */
719
+ declare class Documents extends BaseResource {
720
+ /**
721
+ * List canonical documents in a namespace.
722
+ */
723
+ list(params?: {
724
+ namespace?: string;
725
+ }): Promise<DocumentListResponse>;
726
+ /**
727
+ * Get one canonical document by ID.
728
+ */
729
+ get(documentId: string): Promise<Document>;
730
+ /**
731
+ * Archive one canonical document by ID.
732
+ */
733
+ archive(documentId: string): Promise<Document>;
734
+ }
735
+
503
736
  /**
504
737
  * Main Knowhere SDK client
505
738
  */
506
739
  declare class Knowhere {
507
740
  /** Jobs resource for low-level API */
508
741
  readonly jobs: Jobs;
742
+ /** Retrieval resource for querying published documents */
743
+ readonly retrieval: Retrieval;
744
+ /** Documents resource for canonical document lifecycle operations */
745
+ readonly documents: Documents;
509
746
  private httpClient;
510
747
  /**
511
748
  * Create a new Knowhere client
@@ -665,4 +902,4 @@ declare class JobFailedError extends KnowhereError {
665
902
  constructor(message: string, code: string, jobResult: JobResult);
666
903
  }
667
904
 
668
- export { APIError, AuthenticationError, BadRequestError, type BaseChunk, ChecksumError, type Chunk, ConflictError, type CreateJobParams, type DocType, type FileIndex, GatewayTimeoutError, type ImageChunk, InternalServerError, InvalidStateError, type Job, type JobError, JobFailedError, type JobResult, type JobStatus, Jobs, Knowhere, KnowhereError, type KnowhereOptions, type LoadOptions, type Manifest, NetworkError, NotFoundError, type ParseParams, type ParseResult, type ParsingModel, type ParsingParams, PaymentRequiredError, PermissionDeniedError, type PollProgress, PollingTimeoutError, RateLimitError, ServiceUnavailableError, type Statistics, type TableChunk, type TextChunk, TimeoutError, type UploadParams, type UploadProgress, VERSION, ValidationError, type WaitOptions, type WebhookConfig, Knowhere as default };
905
+ export { APIError, AuthenticationError, BadRequestError, type BaseChunk, ChecksumError, type Chunk, ConflictError, type CreateJobParams, type DocType, type Document, type DocumentListResponse, Documents, type FileIndex, GatewayTimeoutError, type ImageChunk, InternalServerError, InvalidStateError, type Job, type JobError, JobFailedError, type JobResult, type JobStatus, Jobs, Knowhere, KnowhereError, type KnowhereOptions, type LoadOptions, type Manifest, NetworkError, NotFoundError, type ParseParams, type ParseResult, type ParsingModel, type ParsingParams, PaymentRequiredError, PermissionDeniedError, type PollProgress, PollingTimeoutError, RateLimitError, Retrieval, type RetrievalChannel, type RetrievalFilterMode, type RetrievalQueryParams, type RetrievalQueryResponse, type RetrievalResult, type RetrievalSectionExclusion, type RetrievalSource, ServiceUnavailableError, type Statistics, type TableChunk, type TextChunk, TimeoutError, type UploadParams, type UploadProgress, VERSION, ValidationError, type WaitOptions, type WebhookConfig, Knowhere as default };
package/dist/index.d.ts CHANGED
@@ -41,6 +41,8 @@ interface Job {
41
41
  sourceType: string;
42
42
  /** Optional custom data identifier */
43
43
  dataId?: string;
44
+ /** Retrieval namespace for the canonical document */
45
+ namespace?: string;
44
46
  /** Job creation timestamp */
45
47
  createdAt: Date;
46
48
  /** Presigned URL for file upload (if sourceType is 'file') */
@@ -75,6 +77,10 @@ interface JobResult {
75
77
  sourceType: string;
76
78
  /** Optional custom data identifier */
77
79
  dataId?: string;
80
+ /** Retrieval namespace for the canonical document */
81
+ namespace?: string;
82
+ /** Stable document identifier for retrieval/document lifecycle APIs */
83
+ documentId?: string;
78
84
  /** Job creation timestamp */
79
85
  createdAt: Date;
80
86
  /** Processing progress information */
@@ -157,6 +163,10 @@ interface CreateJobParams {
157
163
  fileName?: string;
158
164
  /** Optional custom data identifier */
159
165
  dataId?: string;
166
+ /** Retrieval namespace for the canonical document */
167
+ namespace?: string;
168
+ /** Existing document identifier when updating a published document */
169
+ documentId?: string;
160
170
  /** Parsing configuration */
161
171
  parsingParams?: ParsingParams;
162
172
  /** Webhook configuration */
@@ -216,9 +226,13 @@ interface ParseParams {
216
226
  /** Generate table summaries */
217
227
  summaryTable?: boolean;
218
228
  /** Generate text summaries */
219
- summaryText?: boolean;
229
+ summaryTxt?: boolean;
220
230
  /** Custom data identifier */
221
231
  dataId?: string;
232
+ /** Retrieval namespace for the canonical document */
233
+ namespace?: string;
234
+ /** Existing document identifier when updating a published document */
235
+ documentId?: string;
222
236
  /** Additional fragment description */
223
237
  addFragDesc?: string;
224
238
  /** Knowledge base directory */
@@ -282,6 +296,30 @@ interface Statistics {
282
296
  interface FileIndex {
283
297
  [chunkId: string]: string;
284
298
  }
299
+ /**
300
+ * Processing cost details emitted by manifest v2
301
+ */
302
+ interface ProcessingCost {
303
+ microDollars?: number;
304
+ credits?: number;
305
+ }
306
+ /**
307
+ * Processing timing details emitted by manifest v2
308
+ */
309
+ interface ProcessingTiming {
310
+ startedAt?: Date;
311
+ completedAt?: Date;
312
+ durationMs?: number;
313
+ }
314
+ /**
315
+ * Processing metadata emitted by manifest v2
316
+ */
317
+ interface ProcessingMetadata {
318
+ pageCount?: number;
319
+ billingStatus?: string;
320
+ cost?: ProcessingCost;
321
+ timing?: ProcessingTiming;
322
+ }
285
323
  /**
286
324
  * Manifest containing metadata about the parse result
287
325
  */
@@ -295,11 +333,29 @@ interface Manifest {
295
333
  /** Original source file name */
296
334
  sourceFileName: string;
297
335
  /** Processing completion date */
298
- processingDate: Date;
336
+ /** Processing completion date (optional: only present if emitted by the worker) */
337
+ processingDate?: Date;
338
+ /** Worker-side processing metadata emitted by manifest v2 */
339
+ processing?: ProcessingMetadata;
299
340
  /** Statistics */
300
341
  statistics: Statistics;
301
- /** File index */
302
- files: FileIndex;
342
+ /** Legacy file index from earlier ZIP manifests */
343
+ files?: FileIndex;
344
+ }
345
+ /**
346
+ * Chunk relationship entry (metadata.connect_to per schema v2.1)
347
+ */
348
+ interface ConnectTo {
349
+ /** Target chunk_id */
350
+ target: string;
351
+ /** Relationship type */
352
+ relation: 'embeds' | 'related';
353
+ /** Placeholder ref in content, e.g. '[images/a.png]' (embeds only) */
354
+ ref?: string;
355
+ /** Semantic similarity score (related only) */
356
+ score?: number;
357
+ /** Shared keywords (related only) */
358
+ keywords?: string[];
303
359
  }
304
360
  /**
305
361
  * Base chunk properties
@@ -313,6 +369,17 @@ interface BaseChunk {
313
369
  content: string;
314
370
  /** Relative path in ZIP */
315
371
  path: string;
372
+ /** Page numbers spanned by this chunk when provided by the backend */
373
+ pageNums?: number[];
374
+ }
375
+ /**
376
+ * Minimal chunk representation emitted in chunks_slim.json
377
+ */
378
+ interface SlimChunk {
379
+ type: 'text' | 'image' | 'table';
380
+ path: string;
381
+ content: string;
382
+ summary?: string;
316
383
  }
317
384
  /**
318
385
  * Text chunk
@@ -321,13 +388,18 @@ interface TextChunk extends BaseChunk {
321
388
  type: 'text';
322
389
  /** Content length */
323
390
  length: number;
324
- /** Tokens or token count, depending on backend payload */
325
- tokens?: number | string[];
391
+ /** Extracted tokens from the current backend payload */
392
+ tokens?: string[];
326
393
  /** Extracted keywords */
327
394
  keywords?: string[];
328
395
  /** Generated summary */
329
396
  summary?: string;
330
- /** Related chunk IDs */
397
+ /** Chunk relationships (schema v2.1: metadata.connect_to) */
398
+ connectTo?: ConnectTo[];
399
+ /**
400
+ * @deprecated Use connectTo instead. Retained for backward compatibility.
401
+ * Previously populated from metadata.relationships which is no longer emitted by the API.
402
+ */
331
403
  relationships?: string[];
332
404
  }
333
405
  /**
@@ -378,10 +450,18 @@ interface ParseResult {
378
450
  manifest: Manifest;
379
451
  /** All chunks */
380
452
  chunks: Chunk[];
453
+ /** Minimal chunk projection from chunks_slim.json (if available) */
454
+ chunksSlim?: SlimChunk[];
381
455
  /** Full document as Markdown (if available) */
382
456
  fullMarkdown?: string;
383
457
  /** Document hierarchy (if available) */
384
458
  hierarchy?: unknown;
459
+ /** Table-of-contents hierarchy hints (if available) */
460
+ tocHierarchies?: unknown;
461
+ /** Knowledge-base CSV export (if available) */
462
+ kbCsv?: string;
463
+ /** Pre-rendered hierarchy HTML view (if available) */
464
+ hierarchyViewHtml?: string;
385
465
  /** Raw ZIP buffer */
386
466
  rawZip: Buffer;
387
467
  /** Text chunks only */
@@ -392,6 +472,10 @@ interface ParseResult {
392
472
  readonly tableChunks: TableChunk[];
393
473
  /** Job ID */
394
474
  readonly jobId: string;
475
+ /** Effective retrieval namespace when loaded from a job result */
476
+ namespace?: string;
477
+ /** Canonical document identifier when loaded from a job result */
478
+ documentId?: string;
395
479
  /** Statistics */
396
480
  readonly statistics: Statistics;
397
481
  /** Find a specific chunk by ID */
@@ -500,12 +584,165 @@ declare class Jobs extends BaseResource {
500
584
  private resolveLoadJobResult;
501
585
  }
502
586
 
587
+ /**
588
+ * Section exclusion for follow-up retrieval queries.
589
+ */
590
+ interface RetrievalSectionExclusion {
591
+ /** Document containing the section to exclude */
592
+ documentId: string;
593
+ /** Human-readable section path to exclude */
594
+ sectionPath: string;
595
+ }
596
+ /**
597
+ * Supported retrieval channel names.
598
+ */
599
+ type RetrievalChannel = 'path' | 'content' | 'term';
600
+ /**
601
+ * Path filtering mode for retrieval queries.
602
+ */
603
+ type RetrievalFilterMode = 'delete' | 'keep';
604
+ /**
605
+ * Retrieval query parameters.
606
+ */
607
+ interface RetrievalQueryParams {
608
+ /** Search query text */
609
+ query: string;
610
+ /** Retrieval namespace. Defaults to the server's default namespace when omitted. */
611
+ namespace?: string;
612
+ /** Maximum number of results to return */
613
+ topK?: number;
614
+ /** Chunk type filter: 1=all, 2=text, 3=image, 4=table, 5=text+image, 6=text+table */
615
+ dataType?: 1 | 2 | 3 | 4 | 5 | 6;
616
+ /** Path keywords for include/exclude filtering */
617
+ signalPaths?: string[];
618
+ /** Signal path filter mode */
619
+ filterMode?: RetrievalFilterMode;
620
+ /** Retrieval channels to run. Defaults to all channels when omitted. */
621
+ channels?: RetrievalChannel[];
622
+ /** Per-channel weight overrides for reciprocal-rank fusion */
623
+ channelWeights?: Partial<Record<RetrievalChannel, number>>;
624
+ /** Enable LLM reranking after channel fusion */
625
+ rerank?: boolean;
626
+ /** Minimum retrieval score threshold after fusion */
627
+ threshold?: number;
628
+ /** Override the internal per-channel recall count */
629
+ internalRecallK?: number;
630
+ /** Documents to exclude for this request only */
631
+ excludeDocumentIds?: string[];
632
+ /** Document sections to exclude for this request only */
633
+ excludeSections?: RetrievalSectionExclusion[];
634
+ }
635
+ /**
636
+ * Caller-facing source reference attached to a retrieval result.
637
+ */
638
+ interface RetrievalSource {
639
+ /** Stable document identifier */
640
+ documentId?: string;
641
+ /** Original source file name */
642
+ sourceFileName?: string;
643
+ /** Human-readable section path */
644
+ sectionPath?: string;
645
+ }
646
+ /**
647
+ * Canonical chunk result returned by retrieval query.
648
+ */
649
+ interface RetrievalResult {
650
+ /** Knowledge content to use directly in the caller's answer */
651
+ content: string;
652
+ /** Chunk type, for example text, image, or table */
653
+ chunkType: string;
654
+ /** Retrieval score returned by the API */
655
+ score: number;
656
+ /** Presigned asset URL for media chunks when available */
657
+ assetUrl?: string;
658
+ /** Source reference for this result */
659
+ source: RetrievalSource;
660
+ }
661
+ /**
662
+ * Response from POST /v1/retrieval/query.
663
+ */
664
+ interface RetrievalQueryResponse {
665
+ /** Namespace searched by the API */
666
+ namespace: string;
667
+ /** Echoed query text */
668
+ query: string;
669
+ /** Retrieval router path used by the API for this query */
670
+ routerUsed?: string;
671
+ /** Ranked retrieval results */
672
+ results: RetrievalResult[];
673
+ }
674
+
675
+ /**
676
+ * Resource for querying published retrieval documents.
677
+ */
678
+ declare class Retrieval extends BaseResource {
679
+ /**
680
+ * Query published documents.
681
+ */
682
+ query(params: RetrievalQueryParams): Promise<RetrievalQueryResponse>;
683
+ }
684
+
685
+ /**
686
+ * Canonical document state returned by document lifecycle endpoints.
687
+ */
688
+ interface Document {
689
+ /** Stable document identifier */
690
+ documentId: string;
691
+ /** Retrieval namespace */
692
+ namespace: string;
693
+ /** Current lifecycle status */
694
+ status: string;
695
+ /** Current published job result identifier */
696
+ currentJobResultId?: string;
697
+ /** Original source file name */
698
+ sourceFileName?: string;
699
+ /** Document creation timestamp */
700
+ createdAt?: Date;
701
+ /** Last update timestamp */
702
+ updatedAt?: Date;
703
+ /** Archive timestamp, when archived */
704
+ archivedAt?: Date;
705
+ }
706
+ /**
707
+ * Response from GET /v1/documents.
708
+ */
709
+ interface DocumentListResponse {
710
+ /** Namespace listed by the API */
711
+ namespace: string;
712
+ /** Documents visible in the namespace */
713
+ documents: Document[];
714
+ }
715
+
716
+ /**
717
+ * Resource for canonical document lifecycle operations.
718
+ */
719
+ declare class Documents extends BaseResource {
720
+ /**
721
+ * List canonical documents in a namespace.
722
+ */
723
+ list(params?: {
724
+ namespace?: string;
725
+ }): Promise<DocumentListResponse>;
726
+ /**
727
+ * Get one canonical document by ID.
728
+ */
729
+ get(documentId: string): Promise<Document>;
730
+ /**
731
+ * Archive one canonical document by ID.
732
+ */
733
+ archive(documentId: string): Promise<Document>;
734
+ }
735
+
503
736
  /**
504
737
  * Main Knowhere SDK client
505
738
  */
506
739
  declare class Knowhere {
507
740
  /** Jobs resource for low-level API */
508
741
  readonly jobs: Jobs;
742
+ /** Retrieval resource for querying published documents */
743
+ readonly retrieval: Retrieval;
744
+ /** Documents resource for canonical document lifecycle operations */
745
+ readonly documents: Documents;
509
746
  private httpClient;
510
747
  /**
511
748
  * Create a new Knowhere client
@@ -665,4 +902,4 @@ declare class JobFailedError extends KnowhereError {
665
902
  constructor(message: string, code: string, jobResult: JobResult);
666
903
  }
667
904
 
668
- export { APIError, AuthenticationError, BadRequestError, type BaseChunk, ChecksumError, type Chunk, ConflictError, type CreateJobParams, type DocType, type FileIndex, GatewayTimeoutError, type ImageChunk, InternalServerError, InvalidStateError, type Job, type JobError, JobFailedError, type JobResult, type JobStatus, Jobs, Knowhere, KnowhereError, type KnowhereOptions, type LoadOptions, type Manifest, NetworkError, NotFoundError, type ParseParams, type ParseResult, type ParsingModel, type ParsingParams, PaymentRequiredError, PermissionDeniedError, type PollProgress, PollingTimeoutError, RateLimitError, ServiceUnavailableError, type Statistics, type TableChunk, type TextChunk, TimeoutError, type UploadParams, type UploadProgress, VERSION, ValidationError, type WaitOptions, type WebhookConfig, Knowhere as default };
905
+ export { APIError, AuthenticationError, BadRequestError, type BaseChunk, ChecksumError, type Chunk, ConflictError, type CreateJobParams, type DocType, type Document, type DocumentListResponse, Documents, type FileIndex, GatewayTimeoutError, type ImageChunk, InternalServerError, InvalidStateError, type Job, type JobError, JobFailedError, type JobResult, type JobStatus, Jobs, Knowhere, KnowhereError, type KnowhereOptions, type LoadOptions, type Manifest, NetworkError, NotFoundError, type ParseParams, type ParseResult, type ParsingModel, type ParsingParams, PaymentRequiredError, PermissionDeniedError, type PollProgress, PollingTimeoutError, RateLimitError, Retrieval, type RetrievalChannel, type RetrievalFilterMode, type RetrievalQueryParams, type RetrievalQueryResponse, type RetrievalResult, type RetrievalSectionExclusion, type RetrievalSource, ServiceUnavailableError, type Statistics, type TableChunk, type TextChunk, TimeoutError, type UploadParams, type UploadProgress, VERSION, ValidationError, type WaitOptions, type WebhookConfig, Knowhere as default };
package/dist/index.js CHANGED
@@ -35,6 +35,7 @@ __export(index_exports, {
35
35
  BadRequestError: () => BadRequestError,
36
36
  ChecksumError: () => ChecksumError,
37
37
  ConflictError: () => ConflictError,
38
+ Documents: () => Documents,
38
39
  GatewayTimeoutError: () => GatewayTimeoutError,
39
40
  InternalServerError: () => InternalServerError,
40
41
  InvalidStateError: () => InvalidStateError,
@@ -48,6 +49,7 @@ __export(index_exports, {
48
49
  PermissionDeniedError: () => PermissionDeniedError,
49
50
  PollingTimeoutError: () => PollingTimeoutError,
50
51
  RateLimitError: () => RateLimitError,
52
+ Retrieval: () => Retrieval,
51
53
  ServiceUnavailableError: () => ServiceUnavailableError,
52
54
  TimeoutError: () => TimeoutError,
53
55
  VERSION: () => VERSION,
@@ -344,6 +346,15 @@ function enrichJobResult(jobResult) {
344
346
  }
345
347
  return jobResult;
346
348
  }
349
+ function enrichParseResult(parseResult2, scope) {
350
+ if (scope.namespace !== void 0) {
351
+ parseResult2.namespace = scope.namespace;
352
+ }
353
+ if (scope.documentId !== void 0) {
354
+ parseResult2.documentId = scope.documentId;
355
+ }
356
+ return parseResult2;
357
+ }
347
358
  function sanitizePath(path2) {
348
359
  let sanitized = path2.replace(/^\/+/, "");
349
360
  sanitized = sanitized.replace(/\.\.(\/|\\)/g, "");
@@ -839,11 +850,39 @@ async function parseResult(httpClient, resultUrl, options) {
839
850
  const hierarchyContent = await hierarchyFile.async("string");
840
851
  hierarchy = JSON.parse(hierarchyContent);
841
852
  }
853
+ let chunksSlim;
854
+ const chunksSlimFile = zip.file("chunks_slim.json");
855
+ if (chunksSlimFile) {
856
+ const chunksSlimContent = await chunksSlimFile.async("string");
857
+ let chunksSlimData = JSON.parse(chunksSlimContent);
858
+ chunksSlimData = keysToCamel(chunksSlimData);
859
+ chunksSlim = extractSlimChunks(chunksSlimData);
860
+ }
861
+ let tocHierarchies;
862
+ const tocHierarchiesFile = zip.file("toc_hierarchies.json");
863
+ if (tocHierarchiesFile) {
864
+ const tocHierarchiesContent = await tocHierarchiesFile.async("string");
865
+ tocHierarchies = keysToCamel(JSON.parse(tocHierarchiesContent));
866
+ }
867
+ let kbCsv;
868
+ const kbCsvFile = zip.file("kb.csv");
869
+ if (kbCsvFile) {
870
+ kbCsv = await kbCsvFile.async("string");
871
+ }
872
+ let hierarchyViewHtml;
873
+ const hierarchyViewFile = zip.file("hierarchy_view.html");
874
+ if (hierarchyViewFile) {
875
+ hierarchyViewHtml = await hierarchyViewFile.async("string");
876
+ }
842
877
  const result = {
843
878
  manifest,
844
879
  chunks,
880
+ chunksSlim,
845
881
  fullMarkdown,
846
882
  hierarchy,
883
+ tocHierarchies,
884
+ kbCsv,
885
+ hierarchyViewHtml,
847
886
  rawZip: zipBuffer,
848
887
  get textChunks() {
849
888
  return chunks.filter((c) => c.type === "text");
@@ -867,12 +906,30 @@ async function parseResult(httpClient, resultUrl, options) {
867
906
  await import_fs2.promises.mkdir(directory, { recursive: true });
868
907
  await import_fs2.promises.writeFile((0, import_path.join)(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
869
908
  await import_fs2.promises.writeFile((0, import_path.join)(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
909
+ if (chunksSlim) {
910
+ await import_fs2.promises.writeFile(
911
+ (0, import_path.join)(directory, "chunks_slim.json"),
912
+ JSON.stringify({ chunks: chunksSlim }, null, 2)
913
+ );
914
+ }
870
915
  if (fullMarkdown) {
871
916
  await import_fs2.promises.writeFile((0, import_path.join)(directory, "full.md"), fullMarkdown);
872
917
  }
873
918
  if (hierarchy) {
874
919
  await import_fs2.promises.writeFile((0, import_path.join)(directory, "hierarchy.json"), JSON.stringify(hierarchy, null, 2));
875
920
  }
921
+ if (tocHierarchies) {
922
+ await import_fs2.promises.writeFile(
923
+ (0, import_path.join)(directory, "toc_hierarchies.json"),
924
+ JSON.stringify(tocHierarchies, null, 2)
925
+ );
926
+ }
927
+ if (kbCsv) {
928
+ await import_fs2.promises.writeFile((0, import_path.join)(directory, "kb.csv"), kbCsv);
929
+ }
930
+ if (hierarchyViewHtml) {
931
+ await import_fs2.promises.writeFile((0, import_path.join)(directory, "hierarchy_view.html"), hierarchyViewHtml);
932
+ }
876
933
  for (const imageChunk of this.imageChunks) {
877
934
  await imageChunk.save(directory);
878
935
  }
@@ -894,6 +951,15 @@ function extractChunks(payload) {
894
951
  }
895
952
  return [];
896
953
  }
954
/**
 * Pull the slim chunk list out of a parsed `chunks_slim.json` payload.
 *
 * Two shapes are recognised: a bare array, or an object whose `chunks`
 * property is an array. Every other payload yields an empty list.
 *
 * @param payload Decoded JSON value of unknown shape.
 * @returns The chunk array found in the payload, or `[]` when there is none.
 */
function extractSlimChunks(payload) {
  if (Array.isArray(payload)) {
    return payload;
  }
  // `null` is valid JSON, so the property access must tolerate a nullish
  // payload instead of throwing a TypeError.
  const nested = payload?.chunks;
  if (Array.isArray(nested)) {
    return nested;
  }
  return [];
}
897
963
  function getChunkMetadata(chunkData) {
898
964
  if (!chunkData.metadata) {
899
965
  return {};
@@ -904,18 +970,38 @@ function getChunkFilePath(chunkData) {
904
970
  const metadata = getChunkMetadata(chunkData);
905
971
  return chunkData.filePath ?? metadata.filePath ?? chunkData.path;
906
972
  }
973
/**
 * Reduce a raw `pageNums` value to a list of numbers.
 *
 * @param pageNums Value read from chunk data; anything that is not an array
 *   is rejected.
 * @returns A new array holding only the entries whose type is `number`, or
 *   `undefined` when the input is not an array or no numeric entry remains.
 */
function normalizePageNums(pageNums) {
  if (Array.isArray(pageNums)) {
    const numericPages = [];
    for (const candidate of pageNums) {
      if (typeof candidate === "number") {
        numericPages.push(candidate);
      }
    }
    if (numericPages.length > 0) {
      return numericPages;
    }
  }
  return void 0;
}
980
+ function normalizeTokens(tokens) {
981
+ if (!Array.isArray(tokens)) {
982
+ return void 0;
983
+ }
984
+ if (!tokens.every((token) => typeof token === "string")) {
985
+ return void 0;
986
+ }
987
+ return tokens;
988
+ }
907
989
  function normalizeTextChunk(chunkData) {
908
990
  const metadata = getChunkMetadata(chunkData);
991
+ const connectTo = metadata.connectTo ?? chunkData.connectTo;
992
+ const relationships = metadata.relationships ?? chunkData.relationships;
909
993
  return {
910
994
  chunkId: chunkData.chunkId ?? "",
911
995
  type: "text",
912
996
  content: chunkData.content ?? "",
913
997
  path: chunkData.path ?? "",
998
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
914
999
  length: metadata.length ?? chunkData.length ?? 0,
915
- tokens: metadata.tokens ?? chunkData.tokens,
1000
+ tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
916
1001
  keywords: metadata.keywords ?? chunkData.keywords,
917
1002
  summary: metadata.summary ?? chunkData.summary,
918
- relationships: metadata.relationships ?? chunkData.relationships
1003
+ ...connectTo !== void 0 && { connectTo },
1004
+ ...relationships !== void 0 && { relationships }
919
1005
  };
920
1006
  }
921
1007
  async function processChunk(zip, chunkData) {
@@ -939,6 +1025,7 @@ async function processChunk(zip, chunkData) {
939
1025
  type: "image",
940
1026
  content: chunkData.content ?? "",
941
1027
  path: chunkData.path ?? "",
1028
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
942
1029
  length: metadata.length ?? chunkData.length ?? 0,
943
1030
  filePath,
944
1031
  summary: metadata.summary ?? chunkData.summary,
@@ -973,6 +1060,7 @@ async function processChunk(zip, chunkData) {
973
1060
  type: "table",
974
1061
  content: chunkData.content ?? "",
975
1062
  path: chunkData.path ?? "",
1063
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
976
1064
  length: metadata.length ?? chunkData.length ?? 0,
977
1065
  filePath,
978
1066
  tableType: metadata.tableType ?? chunkData.tableType,
@@ -998,7 +1086,11 @@ var Jobs = class extends BaseResource {
998
1086
  * Create a new parsing job
999
1087
  */
1000
1088
  async create(params) {
1001
- const job = await this.httpClient.post("/v1/jobs", params);
1089
+ const job = await this.httpClient.post(
1090
+ "/v1/jobs",
1091
+ params
1092
+ );
1093
+ delete job.documentId;
1002
1094
  if (job.uploadUrl) {
1003
1095
  this.pendingUploadJobs.set(job.jobId, job);
1004
1096
  }
@@ -1046,7 +1138,8 @@ var Jobs = class extends BaseResource {
1046
1138
  if (!jobResult.resultUrl) {
1047
1139
  throw new NotFoundError("Result URL not available");
1048
1140
  }
1049
- return parseResult(this.httpClient, jobResult.resultUrl, options);
1141
+ const result = await parseResult(this.httpClient, jobResult.resultUrl, options);
1142
+ return enrichParseResult(result, jobResult);
1050
1143
  }
1051
1144
  isHttpUrl(value) {
1052
1145
  return /^https?:\/\//i.test(value);
@@ -1097,6 +1190,43 @@ var Jobs = class extends BaseResource {
1097
1190
  }
1098
1191
  };
1099
1192
 
1193
+ // src/resources/retrieval.ts
1194
var Retrieval = class extends BaseResource {
  /**
   * Run a retrieval query against published documents.
   *
   * `params` is forwarded unchanged as the body of
   * `POST /v1/retrieval/query`; the method resolves with whatever the HTTP
   * client returns for that request.
   */
  async query(params) {
    const response = await this.httpClient.post("/v1/retrieval/query", params);
    return response;
  }
};
1202
+
1203
+ // src/resources/documents.ts
1204
var Documents = class extends BaseResource {
  /**
   * List canonical documents, optionally restricted to one namespace.
   *
   * @param params Optional filter. When `params.namespace` is truthy it is
   *   handed to the HTTP client as `params.namespace` (presumably serialized
   *   as a query parameter; confirm against the HTTP client). Otherwise no
   *   request config is passed.
   * @returns The HTTP client's response for `GET /v1/documents`.
   */
  async list(params) {
    const requestConfig = params?.namespace ? {
      params: {
        namespace: params.namespace
      }
    } : void 0;
    return this.httpClient.get("/v1/documents", requestConfig);
  }
  /**
   * Get one canonical document by ID.
   *
   * @param documentId Identifier of the document. It is percent-encoded
   *   before being placed in the URL so that characters such as `/`, `?` or
   *   `#` cannot alter the request path.
   * @returns The HTTP client's response for `GET /v1/documents/{documentId}`.
   */
  async get(documentId) {
    return this.httpClient.get(`/v1/documents/${encodeURIComponent(documentId)}`);
  }
  /**
   * Archive one canonical document by ID.
   *
   * @param documentId Identifier of the document; percent-encoded before
   *   being placed in the URL path.
   * @returns The HTTP client's response for
   *   `POST /v1/documents/{documentId}/archive`.
   */
  async archive(documentId) {
    return this.httpClient.post(`/v1/documents/${encodeURIComponent(documentId)}/archive`);
  }
};
1229
+
1100
1230
  // src/client.ts
1101
1231
  function inferFileName(file, explicitFileName) {
1102
1232
  if (explicitFileName) {
@@ -1116,6 +1246,10 @@ function isReadStream2(file) {
1116
1246
  var Knowhere = class {
1117
1247
  /** Jobs resource for low-level API */
1118
1248
  jobs;
1249
+ /** Retrieval resource for querying published documents */
1250
+ retrieval;
1251
+ /** Documents resource for canonical document lifecycle operations */
1252
+ documents;
1119
1253
  httpClient;
1120
1254
  /**
1121
1255
  * Create a new Knowhere client
@@ -1139,6 +1273,8 @@ var Knowhere = class {
1139
1273
  httpsAgent: options.httpsAgent
1140
1274
  });
1141
1275
  this.jobs = new Jobs(this.httpClient);
1276
+ this.retrieval = new Retrieval(this.httpClient);
1277
+ this.documents = new Documents(this.httpClient);
1142
1278
  }
1143
1279
  /**
1144
1280
  * High-level API: Parse a document and return structured results
@@ -1181,7 +1317,7 @@ var Knowhere = class {
1181
1317
  smartTitleParse: params.smartTitleParse,
1182
1318
  summaryImage: params.summaryImage,
1183
1319
  summaryTable: params.summaryTable,
1184
- summaryTxt: params.summaryText,
1320
+ summaryTxt: params.summaryTxt,
1185
1321
  addFragDesc: params.addFragDesc,
1186
1322
  kbDir: params.kbDir
1187
1323
  };
@@ -1196,6 +1332,8 @@ var Knowhere = class {
1196
1332
  sourceUrl: params.url,
1197
1333
  fileName: resolvedFileName,
1198
1334
  dataId: params.dataId,
1335
+ namespace: params.namespace,
1336
+ documentId: params.documentId,
1199
1337
  parsingParams: Object.keys(parsingParams).length > 0 ? parsingParams : void 0,
1200
1338
  webhook
1201
1339
  });
@@ -1215,7 +1353,7 @@ var Knowhere = class {
1215
1353
  const result = await this.jobs.load(jobResult, {
1216
1354
  verifyChecksum: params.verifyChecksum
1217
1355
  });
1218
- return result;
1356
+ return enrichParseResult(result, jobResult);
1219
1357
  }
1220
1358
  };
1221
1359
  // Annotate the CommonJS export names for ESM import in node:
@@ -1225,6 +1363,7 @@ var Knowhere = class {
1225
1363
  BadRequestError,
1226
1364
  ChecksumError,
1227
1365
  ConflictError,
1366
+ Documents,
1228
1367
  GatewayTimeoutError,
1229
1368
  InternalServerError,
1230
1369
  InvalidStateError,
@@ -1238,6 +1377,7 @@ var Knowhere = class {
1238
1377
  PermissionDeniedError,
1239
1378
  PollingTimeoutError,
1240
1379
  RateLimitError,
1380
+ Retrieval,
1241
1381
  ServiceUnavailableError,
1242
1382
  TimeoutError,
1243
1383
  VERSION,
package/dist/index.mjs CHANGED
@@ -286,6 +286,15 @@ function enrichJobResult(jobResult) {
286
286
  }
287
287
  return jobResult;
288
288
  }
289
+ function enrichParseResult(parseResult2, scope) {
290
+ if (scope.namespace !== void 0) {
291
+ parseResult2.namespace = scope.namespace;
292
+ }
293
+ if (scope.documentId !== void 0) {
294
+ parseResult2.documentId = scope.documentId;
295
+ }
296
+ return parseResult2;
297
+ }
289
298
  function sanitizePath(path2) {
290
299
  let sanitized = path2.replace(/^\/+/, "");
291
300
  sanitized = sanitized.replace(/\.\.(\/|\\)/g, "");
@@ -781,11 +790,39 @@ async function parseResult(httpClient, resultUrl, options) {
781
790
  const hierarchyContent = await hierarchyFile.async("string");
782
791
  hierarchy = JSON.parse(hierarchyContent);
783
792
  }
793
+ let chunksSlim;
794
+ const chunksSlimFile = zip.file("chunks_slim.json");
795
+ if (chunksSlimFile) {
796
+ const chunksSlimContent = await chunksSlimFile.async("string");
797
+ let chunksSlimData = JSON.parse(chunksSlimContent);
798
+ chunksSlimData = keysToCamel(chunksSlimData);
799
+ chunksSlim = extractSlimChunks(chunksSlimData);
800
+ }
801
+ let tocHierarchies;
802
+ const tocHierarchiesFile = zip.file("toc_hierarchies.json");
803
+ if (tocHierarchiesFile) {
804
+ const tocHierarchiesContent = await tocHierarchiesFile.async("string");
805
+ tocHierarchies = keysToCamel(JSON.parse(tocHierarchiesContent));
806
+ }
807
+ let kbCsv;
808
+ const kbCsvFile = zip.file("kb.csv");
809
+ if (kbCsvFile) {
810
+ kbCsv = await kbCsvFile.async("string");
811
+ }
812
+ let hierarchyViewHtml;
813
+ const hierarchyViewFile = zip.file("hierarchy_view.html");
814
+ if (hierarchyViewFile) {
815
+ hierarchyViewHtml = await hierarchyViewFile.async("string");
816
+ }
784
817
  const result = {
785
818
  manifest,
786
819
  chunks,
820
+ chunksSlim,
787
821
  fullMarkdown,
788
822
  hierarchy,
823
+ tocHierarchies,
824
+ kbCsv,
825
+ hierarchyViewHtml,
789
826
  rawZip: zipBuffer,
790
827
  get textChunks() {
791
828
  return chunks.filter((c) => c.type === "text");
@@ -809,12 +846,30 @@ async function parseResult(httpClient, resultUrl, options) {
809
846
  await fs2.mkdir(directory, { recursive: true });
810
847
  await fs2.writeFile(join(directory, "manifest.json"), JSON.stringify(manifest, null, 2));
811
848
  await fs2.writeFile(join(directory, "chunks.json"), JSON.stringify(chunks, null, 2));
849
+ if (chunksSlim) {
850
+ await fs2.writeFile(
851
+ join(directory, "chunks_slim.json"),
852
+ JSON.stringify({ chunks: chunksSlim }, null, 2)
853
+ );
854
+ }
812
855
  if (fullMarkdown) {
813
856
  await fs2.writeFile(join(directory, "full.md"), fullMarkdown);
814
857
  }
815
858
  if (hierarchy) {
816
859
  await fs2.writeFile(join(directory, "hierarchy.json"), JSON.stringify(hierarchy, null, 2));
817
860
  }
861
+ if (tocHierarchies) {
862
+ await fs2.writeFile(
863
+ join(directory, "toc_hierarchies.json"),
864
+ JSON.stringify(tocHierarchies, null, 2)
865
+ );
866
+ }
867
+ if (kbCsv) {
868
+ await fs2.writeFile(join(directory, "kb.csv"), kbCsv);
869
+ }
870
+ if (hierarchyViewHtml) {
871
+ await fs2.writeFile(join(directory, "hierarchy_view.html"), hierarchyViewHtml);
872
+ }
818
873
  for (const imageChunk of this.imageChunks) {
819
874
  await imageChunk.save(directory);
820
875
  }
@@ -836,6 +891,15 @@ function extractChunks(payload) {
836
891
  }
837
892
  return [];
838
893
  }
894
/**
 * Pull the slim chunk list out of a parsed `chunks_slim.json` payload.
 *
 * Two shapes are recognised: a bare array, or an object whose `chunks`
 * property is an array. Every other payload yields an empty list.
 *
 * @param payload Decoded JSON value of unknown shape.
 * @returns The chunk array found in the payload, or `[]` when there is none.
 */
function extractSlimChunks(payload) {
  if (Array.isArray(payload)) {
    return payload;
  }
  // `null` is valid JSON, so the property access must tolerate a nullish
  // payload instead of throwing a TypeError.
  const nested = payload?.chunks;
  if (Array.isArray(nested)) {
    return nested;
  }
  return [];
}
839
903
  function getChunkMetadata(chunkData) {
840
904
  if (!chunkData.metadata) {
841
905
  return {};
@@ -846,18 +910,38 @@ function getChunkFilePath(chunkData) {
846
910
  const metadata = getChunkMetadata(chunkData);
847
911
  return chunkData.filePath ?? metadata.filePath ?? chunkData.path;
848
912
  }
913
/**
 * Reduce a raw `pageNums` value to a list of numbers.
 *
 * @param pageNums Value read from chunk data; anything that is not an array
 *   is rejected.
 * @returns A new array holding only the entries whose type is `number`, or
 *   `undefined` when the input is not an array or no numeric entry remains.
 */
function normalizePageNums(pageNums) {
  if (Array.isArray(pageNums)) {
    const numericPages = [];
    for (const candidate of pageNums) {
      if (typeof candidate === "number") {
        numericPages.push(candidate);
      }
    }
    if (numericPages.length > 0) {
      return numericPages;
    }
  }
  return void 0;
}
920
+ function normalizeTokens(tokens) {
921
+ if (!Array.isArray(tokens)) {
922
+ return void 0;
923
+ }
924
+ if (!tokens.every((token) => typeof token === "string")) {
925
+ return void 0;
926
+ }
927
+ return tokens;
928
+ }
849
929
  function normalizeTextChunk(chunkData) {
850
930
  const metadata = getChunkMetadata(chunkData);
931
+ const connectTo = metadata.connectTo ?? chunkData.connectTo;
932
+ const relationships = metadata.relationships ?? chunkData.relationships;
851
933
  return {
852
934
  chunkId: chunkData.chunkId ?? "",
853
935
  type: "text",
854
936
  content: chunkData.content ?? "",
855
937
  path: chunkData.path ?? "",
938
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
856
939
  length: metadata.length ?? chunkData.length ?? 0,
857
- tokens: metadata.tokens ?? chunkData.tokens,
940
+ tokens: normalizeTokens(metadata.tokens ?? chunkData.tokens),
858
941
  keywords: metadata.keywords ?? chunkData.keywords,
859
942
  summary: metadata.summary ?? chunkData.summary,
860
- relationships: metadata.relationships ?? chunkData.relationships
943
+ ...connectTo !== void 0 && { connectTo },
944
+ ...relationships !== void 0 && { relationships }
861
945
  };
862
946
  }
863
947
  async function processChunk(zip, chunkData) {
@@ -881,6 +965,7 @@ async function processChunk(zip, chunkData) {
881
965
  type: "image",
882
966
  content: chunkData.content ?? "",
883
967
  path: chunkData.path ?? "",
968
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
884
969
  length: metadata.length ?? chunkData.length ?? 0,
885
970
  filePath,
886
971
  summary: metadata.summary ?? chunkData.summary,
@@ -915,6 +1000,7 @@ async function processChunk(zip, chunkData) {
915
1000
  type: "table",
916
1001
  content: chunkData.content ?? "",
917
1002
  path: chunkData.path ?? "",
1003
+ pageNums: normalizePageNums(metadata.pageNums ?? chunkData.pageNums),
918
1004
  length: metadata.length ?? chunkData.length ?? 0,
919
1005
  filePath,
920
1006
  tableType: metadata.tableType ?? chunkData.tableType,
@@ -940,7 +1026,11 @@ var Jobs = class extends BaseResource {
940
1026
  * Create a new parsing job
941
1027
  */
942
1028
  async create(params) {
943
- const job = await this.httpClient.post("/v1/jobs", params);
1029
+ const job = await this.httpClient.post(
1030
+ "/v1/jobs",
1031
+ params
1032
+ );
1033
+ delete job.documentId;
944
1034
  if (job.uploadUrl) {
945
1035
  this.pendingUploadJobs.set(job.jobId, job);
946
1036
  }
@@ -988,7 +1078,8 @@ var Jobs = class extends BaseResource {
988
1078
  if (!jobResult.resultUrl) {
989
1079
  throw new NotFoundError("Result URL not available");
990
1080
  }
991
- return parseResult(this.httpClient, jobResult.resultUrl, options);
1081
+ const result = await parseResult(this.httpClient, jobResult.resultUrl, options);
1082
+ return enrichParseResult(result, jobResult);
992
1083
  }
993
1084
  isHttpUrl(value) {
994
1085
  return /^https?:\/\//i.test(value);
@@ -1039,6 +1130,43 @@ var Jobs = class extends BaseResource {
1039
1130
  }
1040
1131
  };
1041
1132
 
1133
+ // src/resources/retrieval.ts
1134
var Retrieval = class extends BaseResource {
  /**
   * Run a retrieval query against published documents.
   *
   * `params` is forwarded unchanged as the body of
   * `POST /v1/retrieval/query`; the method resolves with whatever the HTTP
   * client returns for that request.
   */
  async query(params) {
    const response = await this.httpClient.post("/v1/retrieval/query", params);
    return response;
  }
};
1142
+
1143
+ // src/resources/documents.ts
1144
var Documents = class extends BaseResource {
  /**
   * List canonical documents, optionally restricted to one namespace.
   *
   * @param params Optional filter. When `params.namespace` is truthy it is
   *   handed to the HTTP client as `params.namespace` (presumably serialized
   *   as a query parameter; confirm against the HTTP client). Otherwise no
   *   request config is passed.
   * @returns The HTTP client's response for `GET /v1/documents`.
   */
  async list(params) {
    const requestConfig = params?.namespace ? {
      params: {
        namespace: params.namespace
      }
    } : void 0;
    return this.httpClient.get("/v1/documents", requestConfig);
  }
  /**
   * Get one canonical document by ID.
   *
   * @param documentId Identifier of the document. It is percent-encoded
   *   before being placed in the URL so that characters such as `/`, `?` or
   *   `#` cannot alter the request path.
   * @returns The HTTP client's response for `GET /v1/documents/{documentId}`.
   */
  async get(documentId) {
    return this.httpClient.get(`/v1/documents/${encodeURIComponent(documentId)}`);
  }
  /**
   * Archive one canonical document by ID.
   *
   * @param documentId Identifier of the document; percent-encoded before
   *   being placed in the URL path.
   * @returns The HTTP client's response for
   *   `POST /v1/documents/{documentId}/archive`.
   */
  async archive(documentId) {
    return this.httpClient.post(`/v1/documents/${encodeURIComponent(documentId)}/archive`);
  }
};
1169
+
1042
1170
  // src/client.ts
1043
1171
  function inferFileName(file, explicitFileName) {
1044
1172
  if (explicitFileName) {
@@ -1058,6 +1186,10 @@ function isReadStream2(file) {
1058
1186
  var Knowhere = class {
1059
1187
  /** Jobs resource for low-level API */
1060
1188
  jobs;
1189
+ /** Retrieval resource for querying published documents */
1190
+ retrieval;
1191
+ /** Documents resource for canonical document lifecycle operations */
1192
+ documents;
1061
1193
  httpClient;
1062
1194
  /**
1063
1195
  * Create a new Knowhere client
@@ -1081,6 +1213,8 @@ var Knowhere = class {
1081
1213
  httpsAgent: options.httpsAgent
1082
1214
  });
1083
1215
  this.jobs = new Jobs(this.httpClient);
1216
+ this.retrieval = new Retrieval(this.httpClient);
1217
+ this.documents = new Documents(this.httpClient);
1084
1218
  }
1085
1219
  /**
1086
1220
  * High-level API: Parse a document and return structured results
@@ -1123,7 +1257,7 @@ var Knowhere = class {
1123
1257
  smartTitleParse: params.smartTitleParse,
1124
1258
  summaryImage: params.summaryImage,
1125
1259
  summaryTable: params.summaryTable,
1126
- summaryTxt: params.summaryText,
1260
+ summaryTxt: params.summaryTxt,
1127
1261
  addFragDesc: params.addFragDesc,
1128
1262
  kbDir: params.kbDir
1129
1263
  };
@@ -1138,6 +1272,8 @@ var Knowhere = class {
1138
1272
  sourceUrl: params.url,
1139
1273
  fileName: resolvedFileName,
1140
1274
  dataId: params.dataId,
1275
+ namespace: params.namespace,
1276
+ documentId: params.documentId,
1141
1277
  parsingParams: Object.keys(parsingParams).length > 0 ? parsingParams : void 0,
1142
1278
  webhook
1143
1279
  });
@@ -1157,7 +1293,7 @@ var Knowhere = class {
1157
1293
  const result = await this.jobs.load(jobResult, {
1158
1294
  verifyChecksum: params.verifyChecksum
1159
1295
  });
1160
- return result;
1296
+ return enrichParseResult(result, jobResult);
1161
1297
  }
1162
1298
  };
1163
1299
  export {
@@ -1166,6 +1302,7 @@ export {
1166
1302
  BadRequestError,
1167
1303
  ChecksumError,
1168
1304
  ConflictError,
1305
+ Documents,
1169
1306
  GatewayTimeoutError,
1170
1307
  InternalServerError,
1171
1308
  InvalidStateError,
@@ -1179,6 +1316,7 @@ export {
1179
1316
  PermissionDeniedError,
1180
1317
  PollingTimeoutError,
1181
1318
  RateLimitError,
1319
+ Retrieval,
1182
1320
  ServiceUnavailableError,
1183
1321
  TimeoutError,
1184
1322
  VERSION,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ontos-ai/knowhere-sdk",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Official Node.js SDK for Knowhere document parsing API",
5
5
  "keywords": [
6
6
  "knowhere",