@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import type { ChunkId, DocumentId, EmbeddingModelIdentity, ExtractionCapabilityAvailability, IndexingJobError, KnowledgeCapsuleId, KnowledgeSourceId, LargeDocumentResourcePolicy, VectorRecord } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { OpenAIEmbeddingAdapter } from "@oscharko-dev/keiko-model-gateway";
|
|
3
|
+
import type { WorkspaceFs } from "@oscharko-dev/keiko-workspace";
|
|
4
|
+
import type { ChunkingOptions } from "../chunking/index.js";
|
|
5
|
+
import type { DiscoveryOptions } from "../discovery/index.js";
|
|
6
|
+
import { KnowledgeStoreError } from "../errors.js";
|
|
7
|
+
import type { ParserRegistry, ProgressiveExtractor } from "../parsers/index.js";
|
|
8
|
+
import type { AuditEventSink } from "../privacy/index.js";
|
|
9
|
+
import type { KnowledgeStore } from "../store.js";
|
|
10
|
+
// Declared default embedding batch size: 64 (chunks per embedding flush, per the comments in types.js).
export declare const DEFAULT_INDEXING_BATCH_SIZE = 64;
|
|
11
|
+
// Declared default for concurrent in-flight embedding batches: 4.
export declare const DEFAULT_INDEXING_CONCURRENCY = 4;
|
|
12
|
+
// Closed string-literal union of the codes an IndexingError can carry in its `code` property.
export type IndexingErrorCode = "INCOMPATIBLE_EMBEDDING_IDENTITY" | "EMBEDDING_ADAPTER_FAILED" | "DISCOVERY_FAILED" | "CHUNKING_FAILED" | "CANCELLED" | "CAPSULE_NOT_FOUND" | "INVALID_OPTIONS" | "PERSISTENCE_FAILED";
|
|
13
|
+
// Type declaration for IndexingError, a KnowledgeStoreError subclass that exposes a
// read-only IndexingErrorCode in `code`. The constructor takes the code, a message and an
// optional options bag whose only declared member is `cause` (typed as unknown).
export declare class IndexingError extends KnowledgeStoreError {
|
|
14
|
+
readonly name: string;
|
|
15
|
+
readonly code: IndexingErrorCode;
|
|
16
|
+
constructor(code: IndexingErrorCode, message: string, options?: {
|
|
17
|
+
cause?: unknown;
|
|
18
|
+
});
|
|
19
|
+
}
|
|
20
|
+
// Options bag for an indexing run. Required members: capsuleId, parserRegistry, workspaceFs,
// embeddingAdapter and store. Every other member is optional.
// NOTE(review): batchSize/concurrency presumably fall back to DEFAULT_INDEXING_BATCH_SIZE and
// DEFAULT_INDEXING_CONCURRENCY when omitted — confirm against the orchestrator.
export interface IndexingOptions {
|
|
21
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
22
|
+
readonly sourceIds?: readonly KnowledgeSourceId[];
|
|
23
|
+
readonly parserRegistry: ParserRegistry;
|
|
24
|
+
readonly workspaceFs: WorkspaceFs;
|
|
25
|
+
readonly embeddingAdapter: OpenAIEmbeddingAdapter;
|
|
26
|
+
readonly store: KnowledgeStore;
|
|
27
|
+
readonly signal?: AbortSignal;
|
|
28
|
+
// Optional callback that receives IndexingEvent values.
readonly progress?: (event: IndexingEvent) => void;
|
|
29
|
+
readonly auditSink?: AuditEventSink;
|
|
30
|
+
readonly force?: boolean;
|
|
31
|
+
readonly batchSize?: number;
|
|
32
|
+
readonly concurrency?: number;
|
|
33
|
+
readonly chunkingOptions?: ChunkingOptions;
|
|
34
|
+
readonly discoveryOptions?: DiscoveryOptions;
|
|
35
|
+
// Optional injectable number source — presumably a clock; verify against caller.
readonly now?: () => number;
|
|
36
|
+
// Optional injectable string id generator.
readonly idSource?: () => string;
|
|
37
|
+
readonly largeDocumentPolicy?: LargeDocumentResourcePolicy;
|
|
38
|
+
readonly progressiveExtractors?: readonly ProgressiveExtractor[];
|
|
39
|
+
readonly extractionCapabilities?: ExtractionCapabilityAvailability;
|
|
40
|
+
readonly resume?: boolean;
|
|
41
|
+
}
|
|
42
|
+
// IndexingEvent variant with kind "job-started": carries the job id, the capsule id, the
// list of source ids and a numeric startedAt.
export interface IndexingJobStartedEvent {
|
|
43
|
+
readonly kind: "job-started";
|
|
44
|
+
readonly jobId: string;
|
|
45
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
46
|
+
readonly sourceIds: readonly KnowledgeSourceId[];
|
|
47
|
+
readonly startedAt: number;
|
|
48
|
+
}
|
|
49
|
+
// IndexingEvent variant with kind "document-discovered": identifies the job, capsule and
// source, plus the document's relativePath and sizeBytes. No documentId is declared here.
export interface IndexingDocumentDiscoveredEvent {
|
|
50
|
+
readonly kind: "document-discovered";
|
|
51
|
+
readonly jobId: string;
|
|
52
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
53
|
+
readonly sourceId: KnowledgeSourceId;
|
|
54
|
+
readonly relativePath: string;
|
|
55
|
+
readonly sizeBytes: number;
|
|
56
|
+
}
|
|
57
|
+
// IndexingEvent variant with kind "document-extracted": identifies the job, capsule, source
// and document, plus the document's relativePath.
export interface IndexingDocumentExtractedEvent {
|
|
58
|
+
readonly kind: "document-extracted";
|
|
59
|
+
readonly jobId: string;
|
|
60
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
61
|
+
readonly sourceId: KnowledgeSourceId;
|
|
62
|
+
readonly documentId: DocumentId;
|
|
63
|
+
readonly relativePath: string;
|
|
64
|
+
}
|
|
65
|
+
// IndexingEvent variant with kind "document-chunked": identifies the job, capsule, source
// and document, plus a numeric chunkCount.
export interface IndexingDocumentChunkedEvent {
|
|
66
|
+
readonly kind: "document-chunked";
|
|
67
|
+
readonly jobId: string;
|
|
68
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
69
|
+
readonly sourceId: KnowledgeSourceId;
|
|
70
|
+
readonly documentId: DocumentId;
|
|
71
|
+
readonly chunkCount: number;
|
|
72
|
+
}
|
|
73
|
+
// IndexingEvent variant with kind "document-embedded": identifies the job, capsule, source
// and document, plus a numeric vectorCount and a resumeToken typed as ChunkId.
export interface IndexingDocumentEmbeddedEvent {
|
|
74
|
+
readonly kind: "document-embedded";
|
|
75
|
+
readonly jobId: string;
|
|
76
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
77
|
+
readonly sourceId: KnowledgeSourceId;
|
|
78
|
+
readonly documentId: DocumentId;
|
|
79
|
+
readonly vectorCount: number;
|
|
80
|
+
readonly resumeToken: ChunkId;
|
|
81
|
+
}
|
|
82
|
+
// IndexingEvent variant with kind "document-skipped": identifies the job, capsule, source
// and document, plus a `reason` limited to "unchanged", "already-embedded" or "unsupported".
export interface IndexingDocumentSkippedEvent {
|
|
83
|
+
readonly kind: "document-skipped";
|
|
84
|
+
readonly jobId: string;
|
|
85
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
86
|
+
readonly sourceId: KnowledgeSourceId;
|
|
87
|
+
readonly documentId: DocumentId;
|
|
88
|
+
readonly reason: "unchanged" | "already-embedded" | "unsupported";
|
|
89
|
+
}
|
|
90
|
+
// IndexingEvent variant with kind "document-failed": identifies the job, capsule and source
// and carries an IndexingJobError. documentId and relativePath are both optional here,
// unlike the other per-document events.
export interface IndexingDocumentFailedEvent {
|
|
91
|
+
readonly kind: "document-failed";
|
|
92
|
+
readonly jobId: string;
|
|
93
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
94
|
+
readonly sourceId: KnowledgeSourceId;
|
|
95
|
+
readonly documentId?: DocumentId;
|
|
96
|
+
readonly relativePath?: string;
|
|
97
|
+
readonly error: IndexingJobError;
|
|
98
|
+
}
|
|
99
|
+
// IndexingEvent variant with kind "job-completed": carries the job id and the IndexingResult.
export interface IndexingJobCompletedEvent {
|
|
100
|
+
readonly kind: "job-completed";
|
|
101
|
+
readonly jobId: string;
|
|
102
|
+
readonly result: IndexingResult;
|
|
103
|
+
}
|
|
104
|
+
// IndexingEvent variant with kind "job-cancelled": carries the job id and the IndexingResult.
export interface IndexingJobCancelledEvent {
|
|
105
|
+
readonly kind: "job-cancelled";
|
|
106
|
+
readonly jobId: string;
|
|
107
|
+
readonly result: IndexingResult;
|
|
108
|
+
}
|
|
109
|
+
// IndexingEvent variant with kind "job-failed": carries the job id, the IndexingJobError
// and the IndexingResult.
export interface IndexingJobFailedEvent {
|
|
110
|
+
readonly kind: "job-failed";
|
|
111
|
+
readonly jobId: string;
|
|
112
|
+
readonly error: IndexingJobError;
|
|
113
|
+
readonly result: IndexingResult;
|
|
114
|
+
}
|
|
115
|
+
// Union of the ten indexing event interfaces. Each member declares a distinct string-literal
// `kind`, so consumers can narrow the union by switching on `kind`.
export type IndexingEvent = IndexingJobStartedEvent | IndexingDocumentDiscoveredEvent | IndexingDocumentExtractedEvent | IndexingDocumentChunkedEvent | IndexingDocumentEmbeddedEvent | IndexingDocumentSkippedEvent | IndexingDocumentFailedEvent | IndexingJobCompletedEvent | IndexingJobCancelledEvent | IndexingJobFailedEvent;
|
|
116
|
+
// Summary of an indexing job: its id, capsule, a status of "succeeded", "failed" or
// "cancelled", four document counters, vectorsPersisted, and numeric startedAt/finishedAt.
// lastError and embeddingIdentity are optional.
// NOTE(review): startedAt/finishedAt are presumably timestamps — the unit is not visible here.
export interface IndexingResult {
|
|
117
|
+
readonly jobId: string;
|
|
118
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
119
|
+
readonly status: "succeeded" | "failed" | "cancelled";
|
|
120
|
+
readonly totalDocuments: number;
|
|
121
|
+
readonly processedDocuments: number;
|
|
122
|
+
readonly failedDocuments: number;
|
|
123
|
+
readonly skippedDocuments: number;
|
|
124
|
+
readonly vectorsPersisted: number;
|
|
125
|
+
readonly startedAt: number;
|
|
126
|
+
readonly finishedAt: number;
|
|
127
|
+
readonly lastError?: IndexingJobError;
|
|
128
|
+
readonly embeddingIdentity?: EmbeddingModelIdentity;
|
|
129
|
+
}
|
|
130
|
+
// A chunk's id together with its capsule, source and document ids and its `text`.
// NOTE(review): presumably the input unit handed to the embedding batcher — confirm.
export interface ChunkToEmbed {
|
|
131
|
+
readonly id: ChunkId;
|
|
132
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
133
|
+
readonly sourceId: KnowledgeSourceId;
|
|
134
|
+
readonly documentId: DocumentId;
|
|
135
|
+
readonly text: string;
|
|
136
|
+
}
|
|
137
|
+
// Retry settings: a required maxRetries and baseDelayMs, plus an optional `sleep` function
// that takes a millisecond count and an optional AbortSignal and returns Promise<void>.
// NOTE(review): the backoff formula built on baseDelayMs is not visible here.
export interface EmbedRetryOptions {
|
|
138
|
+
readonly maxRetries: number;
|
|
139
|
+
readonly baseDelayMs: number;
|
|
140
|
+
readonly sleep?: (ms: number, signal?: AbortSignal) => Promise<void>;
|
|
141
|
+
}
|
|
142
|
+
// Options for embedding a batch. Unlike IndexingOptions, `concurrency`, `now` and `idSource`
// are required here; only `signal` and `retry` are optional.
export interface EmbedBatchOptions {
|
|
143
|
+
readonly adapter: OpenAIEmbeddingAdapter;
|
|
144
|
+
readonly store: KnowledgeStore;
|
|
145
|
+
readonly pinnedIdentity: EmbeddingModelIdentity;
|
|
146
|
+
readonly concurrency: number;
|
|
147
|
+
readonly signal?: AbortSignal;
|
|
148
|
+
readonly now: () => number;
|
|
149
|
+
readonly idSource: () => string;
|
|
150
|
+
readonly retry?: EmbedRetryOptions;
|
|
151
|
+
}
|
|
152
|
+
// Outcome of embedding a batch: read-only arrays of VectorRecord values and of
// IndexingJobError values. Both arrays are always present in the type.
export interface EmbedBatchResult {
|
|
153
|
+
readonly vectors: readonly VectorRecord[];
|
|
154
|
+
readonly errors: readonly IndexingJobError[];
|
|
155
|
+
}
|
|
156
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/indexing/types.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EACV,OAAO,EACP,UAAU,EACV,sBAAsB,EACtB,gCAAgC,EAChC,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,2BAA2B,EAC3B,YAAY,EACb,MAAM,+BAA+B,CAAC;AACvC,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,mCAAmC,CAAC;AAChF,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,+BAA+B,CAAC;AAEjE,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,OAAO,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACnD,OAAO,KAAK,EAAE,cAAc,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAChF,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAQlD,eAAO,MAAM,2BAA2B,KAAK,CAAC;AAC9C,eAAO,MAAM,4BAA4B,IAAI,CAAC;AAK9C,MAAM,MAAM,iBAAiB,GACzB,iCAAiC,GACjC,0BAA0B,GAC1B,kBAAkB,GAClB,iBAAiB,GACjB,WAAW,GACX,mBAAmB,GACnB,iBAAiB,GACjB,oBAAoB,CAAC;AAKzB,qBAAa,aAAc,SAAQ,mBAAmB;IACpD,SAAyB,IAAI,EAAE,MAAM,CAAmB;IACxD,SAAgB,IAAI,EAAE,iBAAiB,CAAC;gBACrB,IAAI,EAAE,iBAAiB,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,OAAO,CAAA;KAAE;CAI3F;AAGD,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IAGvC,QAAQ,CAAC,SAAS,CAAC,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAClD,QAAQ,CAAC,cAAc,EAAE,cAAc,CAAC;IACxC,QAAQ,CAAC,WAAW,EAAE,WAAW,CAAC;IAClC,QAAQ,CAAC,gBAAgB,EAAE,sBAAsB,CAAC;IAClD,QAAQ,CAAC,KAAK,EAAE,cAAc,CAAC;IAC/B,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAI9B,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,KAAK,EAAE,aAAa,KAAK,IAAI,CAAC;IACnD,QAAQ,CAAC,SAAS,CAAC,EAAE,cAAc,CAAC;IAIpC,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IAGzB,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAE9B,QAAQ,CAAC,eAAe,CAAC,EAAE,eAAe,CAAC;IAE3C,QAAQ,CAAC,gBAAgB,CAAC,EAAE,gBAAgB,CAAC;IAE7C,QAAQ,CAAC,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;IAE5B,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,MAAM,CAAC;IAGjC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,2BAA2B,CAAC;IAG3D,QAAQ,CAAC,qBAAqB,CAAC,EAAE,SAAS,oBAAoB,EAAE,CAAC;IACjE,QAAQ,CAAC,sBAAsB,CAAC,EAAE,gCAAgC,CAAC;IAGnE,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC;CAC3B;AAKD,MAAM,WAAW,uBAAuB;IACtC,QAAQ,CAAC,IAAI,EAAE,aAAa
,CAAC;IAC7B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,SAAS,EAAE,SAAS,iBAAiB,EAAE,CAAC;IACjD,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,+BAA+B;IAC9C,QAAQ,CAAC,IAAI,EAAE,qBAAqB,CAAC;IACrC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,8BAA8B;IAC7C,QAAQ,CAAC,IAAI,EAAE,oBAAoB,CAAC;IACpC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;CAC/B;AAED,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,IAAI,EAAE,kBAAkB,CAAC;IAClC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,6BAA6B;IAC5C,QAAQ,CAAC,IAAI,EAAE,mBAAmB,CAAC;IACnC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAG7B,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC;CAC/B;AAED,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,IAAI,EAAE,kBAAkB,CAAC;IAClC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,MAAM,EAAE,WAAW,GAAG,kBAAkB,GAAG,aAAa,CAAC;CACnE;AAED,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,IAAI,EAAE,iBAAiB,CAAC;IACjC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,CAAC,EAAE,UAAU,CAAC;IACjC,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,KAAK,EAAE,gBAAgB,CAAC;CAClC;AAED,MAAM,WAAW,yBAAyB;IACxC,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAC;IAC/B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;CACjC;AAED,MAAM,WAAW,yBAAyB;IACxC,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAC;IAC/B,QAAQ,CAAC,KAAK,EAAE,MAAM
,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;CACjC;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC;IAC5B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,KAAK,EAAE,gBAAgB,CAAC;IACjC,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;CACjC;AAED,MAAM,MAAM,aAAa,GACrB,uBAAuB,GACvB,+BAA+B,GAC/B,8BAA8B,GAC9B,4BAA4B,GAC5B,6BAA6B,GAC7B,4BAA4B,GAC5B,2BAA2B,GAC3B,yBAAyB,GACzB,yBAAyB,GACzB,sBAAsB,CAAC;AAG3B,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,MAAM,EAAE,WAAW,GAAG,QAAQ,GAAG,WAAW,CAAC;IACtD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,kBAAkB,EAAE,MAAM,CAAC;IACpC,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,CAAC,EAAE,gBAAgB,CAAC;IACtC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,sBAAsB,CAAC;CACrD;AAKD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,OAAO,CAAC;IACrB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAMD,MAAM,WAAW,iBAAiB;IAEhC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAE5B,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,WAAW,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CACtE;AAED,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,OAAO,EAAE,sBAAsB,CAAC;IACzC,QAAQ,CAAC,KAAK,EAAE,cAAc,CAAC;IAC/B,QAAQ,CAAC,cAAc,EAAE,sBAAsB,CAAC;IAChD,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAC9B,QAAQ,CAAC,GAAG,EAAE,MAAM,MAAM,CAAC;IAC3B,QAAQ,CAAC,QAAQ,EAAE,MAAM,MAAM,CAAC;IAEhC,QAAQ,CAAC,KAAK,CAAC,EAAE,iBAAiB,CAAC;CACpC;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,OAAO,EAAE,SAAS,YAAY,EAAE,CAAC;IAI1C,QAAQ,CAAC,MAAM,EAAE,SAAS,gBAAgB,EAAE,CAAC;CAC9C"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// Type contracts for the indexing orchestrator (Epic #189, Issue #196). The orchestrator
|
|
2
|
+
// composes #194 discovery, #195 chunking, and #192 embedding capability into a single
|
|
3
|
+
// streaming pipeline that produces `vectors` rows for a capsule. Every state change emits
|
|
4
|
+
// one `IndexingEvent`; consumers (UI surfaces, evidence ledger, CLI) read the event stream
|
|
5
|
+
// rather than polling the database.
|
|
6
|
+
//
|
|
7
|
+
// The progress channel is an event union (not a number) so a consumer can branch on the
|
|
8
|
+
// `kind` discriminant and surface domain-specific UI per state — matching the pattern the
|
|
9
|
+
// discovery layer (#194) and the verification orchestrator (#7) already use elsewhere in
|
|
10
|
+
// the codebase.
|
|
11
|
+
import { KnowledgeStoreError } from "../errors.js";
|
|
12
|
+
// ─── Defaults (declared up-front so callers can reason about behaviour) ──────
|
|
13
|
+
// Hard caps tracked in the orchestrator. The contract scope (#196) requires:
|
|
14
|
+
// * batch size cap = 64 chunks per embedding flush
|
|
15
|
+
// * concurrent in-flight batches ≤ 4
|
|
16
|
+
// Lower defaults are not exposed because they bias toward "more requests, smaller batches"
|
|
17
|
+
// which costs more roundtrips without saving memory in our pipeline.
|
|
18
|
+
// Batch size cap: 64 chunks per embedding flush.
export const DEFAULT_INDEXING_BATCH_SIZE = 64;
|
|
19
|
+
// Cap on concurrent in-flight embedding batches: 4.
export const DEFAULT_INDEXING_CONCURRENCY = 4;
|
|
20
|
+
// Distinct from KnowledgeStoreError so a test asserting "indexing failed" cannot
|
|
21
|
+
// accidentally accept any other store error. Extends KnowledgeStoreError so callers that
|
|
22
|
+
// catch the parent class still see the failure.
|
|
23
|
+
// Error type for indexing failures. `code` is stored on the instance; `message` and
// `options` are forwarded unchanged to the KnowledgeStoreError constructor.
export class IndexingError extends KnowledgeStoreError {
|
|
24
|
+
// Instance field, so every IndexingError reports this name.
name = "IndexingError";
|
|
25
|
+
// Declared as a field and assigned in the constructor after super() returns.
code;
|
|
26
|
+
constructor(code, message, options) {
|
|
27
|
+
super(message, options);
|
|
28
|
+
this.code = code;
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { ChunkId, DocumentId, EmbeddingModelIdentity, KnowledgeCapsuleId, KnowledgeSourceId, VectorId, VectorRecord } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { DatabaseSync } from "node:sqlite";
|
|
3
|
+
import type { StoreContentCipher } from "../store-content-cipher.js";
|
|
4
|
+
// Row shape with snake_case keys; it is the element type returned by selectChunksForDocument.
// Ids are plain strings here rather than the branded id types used elsewhere in the file.
// NOTE(review): keys presumably mirror the column names of the chunks table — confirm.
export interface ChunkRow {
|
|
5
|
+
readonly id: string;
|
|
6
|
+
readonly capsule_id: string;
|
|
7
|
+
readonly source_id: string;
|
|
8
|
+
readonly document_id: string;
|
|
9
|
+
readonly parsed_unit_id: string;
|
|
10
|
+
readonly order_index: number;
|
|
11
|
+
readonly token_count: number;
|
|
12
|
+
readonly safe_excerpt_hash: string;
|
|
13
|
+
}
|
|
14
|
+
// Input accepted by insertVectorRow and composeVectorRecord: the vector id, its capsule,
// source, document and chunk ids, the embedding as a Uint8Array, the embedding model
// identity, a storageReference string and a numeric createdAt.
// NOTE(review): the byte encoding of `embedding` is not visible here — confirm before decoding.
export interface VectorInsertRow {
|
|
15
|
+
readonly id: VectorId;
|
|
16
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
17
|
+
readonly sourceId: KnowledgeSourceId;
|
|
18
|
+
readonly documentId: DocumentId;
|
|
19
|
+
readonly chunkId: ChunkId;
|
|
20
|
+
readonly embedding: Uint8Array;
|
|
21
|
+
readonly identity: EmbeddingModelIdentity;
|
|
22
|
+
readonly storageReference: string;
|
|
23
|
+
readonly createdAt: number;
|
|
24
|
+
}
|
|
25
|
+
// Signature only: takes a node:sqlite DatabaseSync handle, a StoreContentCipher and one VectorInsertRow; returns nothing.
// NOTE(review): how the cipher is applied to the row is not visible in this declaration.
export declare function insertVectorRow(db: DatabaseSync, cipher: StoreContentCipher, row: VectorInsertRow): void;
|
|
26
|
+
// Signature only: takes a database handle, a capsule id and a document id; returns nothing (no deleted-row count).
export declare function deleteVectorsForDocument(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): void;
|
|
27
|
+
export declare function deleteVectorsForCapsule(db: DatabaseSync, capsuleId: KnowledgeCapsuleId): void;
|
|
28
|
+
export declare function countVectorsForDocument(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): number;
|
|
29
|
+
export declare function countVectorsForCapsule(db: DatabaseSync, capsuleId: KnowledgeCapsuleId): number;
|
|
30
|
+
export declare function selectChunksForDocument(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): readonly ChunkRow[];
|
|
31
|
+
export declare function composeVectorRecord(row: VectorInsertRow): VectorRecord;
|
|
32
|
+
//# sourceMappingURL=vector-persist.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vector-persist.d.ts","sourceRoot":"","sources":["../../src/indexing/vector-persist.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EACV,OAAO,EACP,UAAU,EACV,sBAAsB,EACtB,kBAAkB,EAClB,iBAAiB,EACjB,QAAQ,EACR,YAAY,EACb,MAAM,+BAA+B,CAAC;AACvC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAGhD,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAyCrE,MAAM,WAAW,QAAQ;IACvB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;CACpC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,EAAE,QAAQ,CAAC;IACtB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,SAAS,EAAE,UAAU,CAAC;IAC/B,QAAQ,CAAC,QAAQ,EAAE,sBAAsB,CAAC;IAC1C,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAaD,wBAAgB,eAAe,CAC7B,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,GAAG,EAAE,eAAe,GACnB,IAAI,CAmBN;AAED,wBAAgB,wBAAwB,CACtC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,IAAI,CAEN;AAED,wBAAgB,uBAAuB,CAAC,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE,kBAAkB,GAAG,IAAI,CAE7F;AAMD,wBAAgB,uBAAuB,CACrC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,MAAM,CAKR;AAED,wBAAgB,sBAAsB,CAAC,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE,kBAAkB,GAAG,MAAM,CAK9F;AAED,wBAAgB,uBAAuB,CACrC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,SAAS,QAAQ,EAAE,CAGrB;AAID,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,eAAe,GAAG,YAAY,CAYtE"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// Prepared-statement helpers for the `vectors` table (Epic #189, Issue #196). Every helper
|
|
2
|
+
// wraps a single statement so the orchestrator can compose them at the boundaries of its
|
|
3
|
+
// per-document work. The orchestrator is the only module that writes `vectors` rows.
|
|
4
|
+
//
|
|
5
|
+
// Vector ids are deterministic on `chunkId` (`vec:<chunkId>`): the chunks table already
|
|
6
|
+
// owns a UNIQUE (capsule_id, id) constraint, so the same chunk being re-embedded twice in
|
|
7
|
+
// a force run produces a byte-identical row id — the audit ledger's row-equality assertions
|
|
8
|
+
// (#10) hold across re-indexes.
|
|
9
|
+
//
|
|
10
|
+
// The composite FK `vectors(capsule_id, chunk_id) → chunks(capsule_id, id)` is enforced by
|
|
11
|
+
// SQLite — the orchestrator never bypasses it. If an upstream bug projects a chunk to a
|
|
12
|
+
// wrong capsule, the INSERT raises rather than silently splitting tenants.
|
|
13
|
+
import { KnowledgeStoreError } from "../errors.js";
|
|
14
|
+
// --- Read statements -------------------------------------------------------------------
// Chunk rows of one document inside one capsule, in ascending `order_index`.
const SELECT_CHUNKS_FOR_DOCUMENT_SQL = [
    "SELECT id, capsule_id, source_id, document_id, parsed_unit_id, order_index, token_count,",
    " safe_excerpt_hash",
    "FROM chunks",
    "WHERE capsule_id = :c AND document_id = :d",
    "ORDER BY order_index ASC",
].join(" ");
// Row counts; the single result column is aliased `n`.
const COUNT_VECTORS_FOR_CAPSULE_SQL = "SELECT COUNT(*) AS n FROM vectors WHERE capsule_id = :c";
const COUNT_VECTORS_FOR_DOCUMENT_SQL = "SELECT COUNT(*) AS n FROM vectors WHERE capsule_id = :c AND document_id = :d";
// --- Write statements ------------------------------------------------------------------
// Deletes are always scoped by capsule (`:c`), optionally narrowed to one document (`:d`).
const DELETE_VECTORS_FOR_CAPSULE_SQL = "DELETE FROM vectors WHERE capsule_id = :c";
const DELETE_VECTORS_FOR_DOCUMENT_SQL = "DELETE FROM vectors WHERE capsule_id = :c AND document_id = :d";
// Upsert keyed on `id`: on conflict the embedding, the model identity columns, the
// storage reference and `created_at` are overwritten; the capsule/source/document/chunk
// columns of the existing row are left as they are.
const INSERT_VECTOR_SQL = [
    "INSERT INTO vectors (",
    " id, capsule_id, source_id, document_id, chunk_id,",
    " embedding, embedding_model_provider, embedding_model_id, embedding_model_revision,",
    " vector_dimensions, vector_metric, storage_reference, created_at",
    ") VALUES (",
    " :id, :capsule_id, :source_id, :document_id, :chunk_id,",
    " :embedding, :provider, :model_id, :revision,",
    " :dimensions, :metric, :storage_reference, :created_at",
    ")",
    "ON CONFLICT(id) DO UPDATE SET",
    " embedding = excluded.embedding,",
    " embedding_model_provider = excluded.embedding_model_provider,",
    " embedding_model_id = excluded.embedding_model_id,",
    " embedding_model_revision = excluded.embedding_model_revision,",
    " vector_dimensions = excluded.vector_dimensions,",
    " vector_metric = excluded.vector_metric,",
    " storage_reference = excluded.storage_reference,",
    " created_at = excluded.created_at",
].join(" ");
|
|
45
|
+
// Guards the embedding blob size: the byte length must equal
// identity.vectorDimensions * Float32Array.BYTES_PER_ELEMENT.
// Throws KnowledgeStoreError (naming the vector, capsule and chunk) on a mismatch;
// returns nothing when the shape is correct.
function assertEmbeddingShape(row) {
    const { identity, embedding } = row;
    const requiredBytes = Float32Array.BYTES_PER_ELEMENT * identity.vectorDimensions;
    const actualBytes = embedding.byteLength;
    if (actualBytes === requiredBytes) {
        return;
    }
    const message = [
        `vector ${String(row.id)} for capsule=${String(row.capsuleId)} `,
        `chunk=${String(row.chunkId)} has blob length ${String(actualBytes)} `,
        `but identity.vectorDimensions=${String(identity.vectorDimensions)}`,
    ].join("");
    throw new KnowledgeStoreError(message);
}
|
|
53
|
+
// Upserts one row into `vectors`.
//   db:     database handle exposing prepare(sql) -> statement with run(params)
//   cipher: content cipher; its sealVector() output is what gets stored in `embedding`
//   row:    vector row to persist (ids, embedding bytes, model identity, metadata)
// The shape check runs on the plaintext embedding before it is sealed, because the
// dimension check is only meaningful on the unsealed bytes.
export function insertVectorRow(db, cipher, row) {
    assertEmbeddingShape(row);
    const statement = db.prepare(INSERT_VECTOR_SQL);
    const { identity } = row;
    const bindings = {
        id: String(row.id),
        capsule_id: String(row.capsuleId),
        source_id: String(row.sourceId),
        document_id: String(row.documentId),
        chunk_id: String(row.chunkId),
        embedding: cipher.sealVector(row.embedding),
        provider: identity.provider,
        model_id: identity.modelId,
        // A missing model revision is bound as NULL.
        revision: identity.modelRevision ?? null,
        dimensions: identity.vectorDimensions,
        metric: identity.vectorMetric,
        storage_reference: row.storageReference,
        created_at: row.createdAt,
    };
    statement.run(bindings);
}
|
|
73
|
+
// Deletes every `vectors` row belonging to the given document within the given capsule.
export function deleteVectorsForDocument(db, capsuleId, documentId) {
    const statement = db.prepare(DELETE_VECTORS_FOR_DOCUMENT_SQL);
    const scope = { c: capsuleId, d: documentId };
    statement.run(scope);
}
|
|
76
|
+
// Deletes every `vectors` row of the given capsule.
export function deleteVectorsForCapsule(db, capsuleId) {
    const statement = db.prepare(DELETE_VECTORS_FOR_CAPSULE_SQL);
    const scope = { c: capsuleId };
    statement.run(scope);
}
|
|
79
|
+
// Counts the `vectors` rows of one document within one capsule.
// Returns 0 when the query yields no row or when `n` is not a number.
export function countVectorsForDocument(db, capsuleId, documentId) {
    const statement = db.prepare(COUNT_VECTORS_FOR_DOCUMENT_SQL);
    const result = statement.get({ c: capsuleId, d: documentId });
    if (typeof result?.n !== "number") {
        return 0;
    }
    return result.n;
}
|
|
83
|
+
// Counts the `vectors` rows of one capsule.
// Returns 0 when the query yields no row or when `n` is not a number.
export function countVectorsForCapsule(db, capsuleId) {
    const statement = db.prepare(COUNT_VECTORS_FOR_CAPSULE_SQL);
    const result = statement.get({ c: capsuleId });
    if (typeof result?.n !== "number") {
        return 0;
    }
    return result.n;
}
|
|
87
|
+
// Fetches the chunk rows of one document within one capsule and returns them exactly
// as the statement yields them (the SQL orders them by `order_index`).
export function selectChunksForDocument(db, capsuleId, documentId) {
    const statement = db.prepare(SELECT_CHUNKS_FOR_DOCUMENT_SQL);
    return statement.all({ c: capsuleId, d: documentId });
}
|
|
91
|
+
// VectorRecord composition is consolidated here so the batcher and any future replay tool
|
|
92
|
+
// share the exact shape that gets persisted into `vectors`.
|
|
93
|
+
// Projects an insert row onto the record shape: the ids, storage reference and
// timestamp are copied over, `identity` is exposed as `embeddingIdentity`, and
// `vectorDimensions` is lifted from that identity. The embedding bytes are not
// part of the returned record.
export function composeVectorRecord(row) {
    const { id, chunkId, capsuleId, sourceId, documentId, identity, storageReference, createdAt } = row;
    return {
        id,
        chunkId,
        capsuleId,
        sourceId,
        documentId,
        embeddingIdentity: identity,
        vectorDimensions: identity.vectorDimensions,
        storageReference,
        createdAt,
    };
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { DocumentId, ParsedUnit, ParserDiagnostic, ParserIdentity, ParserResult } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { ParserCapability, ParserErrorCode, ParserOptions } from "./types.js";
|
|
3
|
+
export declare function emptyResult(capability: ParserCapability, documentId: DocumentId, options: ParserOptions, diagnostics?: readonly ParserDiagnostic[], units?: readonly ParsedUnit[]): ParserResult;
|
|
4
|
+
export declare function parserIdentity(capability: ParserCapability): ParserIdentity;
|
|
5
|
+
export declare function diagnostic(code: ParserErrorCode, message: string, documentId: DocumentId, severity?: ParserDiagnostic["severity"]): ParserDiagnostic;
|
|
6
|
+
export interface LimitCheck {
|
|
7
|
+
readonly stop: boolean;
|
|
8
|
+
readonly code?: ParserErrorCode;
|
|
9
|
+
readonly message?: string;
|
|
10
|
+
}
|
|
11
|
+
export declare function shouldStop(startedAt: number, options: ParserOptions, emittedUnits: number): LimitCheck;
|
|
12
|
+
export declare function oversizeDiagnostic(documentId: DocumentId, byteLength: number, maxBytes: number): ParserDiagnostic;
|
|
13
|
+
export declare function objectLimitDiagnostic(documentId: DocumentId, maxObjectsPerDocument: number): ParserDiagnostic;
|
|
14
|
+
export interface DecodedText {
|
|
15
|
+
readonly text: string;
|
|
16
|
+
readonly bomBytes: number;
|
|
17
|
+
}
|
|
18
|
+
export declare function decodeXmlEntities(value: string): string;
|
|
19
|
+
export declare function decodeUtf8(bytes: Uint8Array): DecodedText;
|
|
20
|
+
//# sourceMappingURL=_internal.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_internal.d.ts","sourceRoot":"","sources":["../../src/parsers/_internal.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACV,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,YAAY,EACb,MAAM,+BAA+B,CAAC;AAEvC,OAAO,KAAK,EAAE,gBAAgB,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAEnF,wBAAgB,WAAW,CACzB,UAAU,EAAE,gBAAgB,EAC5B,UAAU,EAAE,UAAU,EACtB,OAAO,EAAE,aAAa,EACtB,WAAW,GAAE,SAAS,gBAAgB,EAAO,EAC7C,KAAK,GAAE,SAAS,UAAU,EAAO,GAChC,YAAY,CAUd;AAED,wBAAgB,cAAc,CAAC,UAAU,EAAE,gBAAgB,GAAG,cAAc,CAQ3E;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,eAAe,EACrB,OAAO,EAAE,MAAM,EACf,UAAU,EAAE,UAAU,EACtB,QAAQ,GAAE,gBAAgB,CAAC,UAAU,CAAU,GAC9C,gBAAgB,CAElB;AAID,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,IAAI,CAAC,EAAE,eAAe,CAAC;IAChC,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,wBAAgB,UAAU,CACxB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,aAAa,EACtB,YAAY,EAAE,MAAM,GACnB,UAAU,CAmBZ;AAED,wBAAgB,kBAAkB,CAChC,UAAU,EAAE,UAAU,EACtB,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,MAAM,GACf,gBAAgB,CAOlB;AAED,wBAAgB,qBAAqB,CACnC,UAAU,EAAE,UAAU,EACtB,qBAAqB,EAAE,MAAM,GAC5B,gBAAgB,CAOlB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;CAC3B;AAkDD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAWvD;AAED,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,GAAG,WAAW,CASzD"}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
// Shared helpers for the parser adapters (Epic #189, Issue #266). NOT exported from the
|
|
2
|
+
// package barrel — kept internal so the adapter surface stays minimal.
|
|
3
|
+
// Builds a parser result with no pages and no sections for `documentId`.
// `diagnostics` and `units` default to empty arrays; the parser identity is derived
// from `capability` and the extraction timestamp is read from `options.now()`.
export function emptyResult(capability, documentId, options, diagnostics = [], units = []) {
    const parser = parserIdentity(capability);
    const extractedAt = options.now();
    return {
        documentId,
        parser,
        pages: [],
        sections: [],
        units,
        diagnostics,
        extractedAt,
    };
}
|
|
14
|
+
// Extracts the identity fields of a parser capability. The `dependencyVersions` key
// is only present on the result when the capability actually defines it.
export function parserIdentity(capability) {
    const { parserId, parserVersion, dependencyVersions } = capability;
    if (dependencyVersions === undefined) {
        return { parserId, parserVersion };
    }
    return { parserId, parserVersion, dependencyVersions };
}
|
|
23
|
+
// Assembles a parser diagnostic record. `severity` defaults to "info" when omitted.
export function diagnostic(code, message, documentId, severity = "info") {
    const entry = { severity, code, message, documentId };
    return entry;
}
|
|
26
|
+
// Decides whether a parser loop has to stop. Checks run in this order:
//   1. the caller's abort signal            -> PARSER_CANCELLED
//   2. elapsed time strictly over timeoutMs -> PARSER_TIMEOUT
//   3. emitted units at or over the cap     -> UNIT_LIMIT_REACHED
// Returns { stop: false } when none applies. The clock (`options.now()`) is not
// consulted when the signal is already aborted.
export function shouldStop(startedAt, options, emittedUnits) {
    const halt = (code, message) => ({ stop: true, code, message });
    const cancelled = options.signal?.aborted === true;
    if (cancelled) {
        return halt("PARSER_CANCELLED", "caller aborted parser");
    }
    const elapsedMs = options.now() - startedAt;
    if (elapsedMs > options.timeoutMs) {
        return halt("PARSER_TIMEOUT", `exceeded ${String(options.timeoutMs)}ms deadline`);
    }
    const unitCap = options.maxUnitsPerDocument;
    if (emittedUnits >= unitCap) {
        return halt("UNIT_LIMIT_REACHED", `reached maxUnitsPerDocument=${String(unitCap)}`);
    }
    return { stop: false };
}
|
|
46
|
+
// Diagnostic for an input whose byte length exceeds the configured maximum.
// Emitted with code OVERSIZED_FILE at "info" severity.
export function oversizeDiagnostic(documentId, byteLength, maxBytes) {
    const message = `input size ${String(byteLength)} exceeds maxBytes=${String(maxBytes)}`;
    return diagnostic("OVERSIZED_FILE", message, documentId, "info");
}
|
|
49
|
+
// Diagnostic for a document that hit the per-document object cap.
// Emitted with code OBJECT_LIMIT_REACHED at "error" severity.
export function objectLimitDiagnostic(documentId, maxObjectsPerDocument) {
    const message = `reached maxObjectsPerDocument=${String(maxObjectsPerDocument)}`;
    return diagnostic("OBJECT_LIMIT_REACHED", message, documentId, "error");
}
|
|
52
|
+
// GRD-012: detect UTF-16 LE/BE by BOM (common Windows .txt/.csv/.json exports) and re-decode
|
|
53
|
+
// with the matching codec. Without this they decode as UTF-8 mojibake (every other byte a NUL /
|
|
54
|
+
// replacement char) and are silently chunked/embedded as garbage. UTF-32 LE (FF FE 00 00) is
|
|
55
|
+
// explicitly excluded so it is not mis-read as UTF-16 LE.
|
|
56
|
+
// Maps a leading byte-order mark to a TextDecoder label:
//   FE FF -> "utf-16be"
//   FF FE -> "utf-16le", unless the next two bytes are 00 00 (the UTF-32 LE BOM)
// Returns undefined for anything else, including inputs shorter than two bytes.
function utf16CodecForBom(bytes) {
    if (bytes.byteLength < 2) {
        return undefined;
    }
    // Fold the first two bytes into one 16-bit marker so each BOM is a single comparison.
    const marker = (bytes[0] << 8) | bytes[1];
    if (marker === 0xfeff) {
        return "utf-16be";
    }
    if (marker !== 0xfffe) {
        return undefined;
    }
    const looksLikeUtf32Le = bytes.byteLength >= 4 && bytes[2] === 0x00 && bytes[3] === 0x00;
    if (looksLikeUtf32Le) {
        return undefined;
    }
    return "utf-16le";
}
|
|
69
|
+
// Decodes BOM-prefixed UTF-16 input. Returns undefined when the bytes do not start
// with a UTF-16 BOM; otherwise returns the decoded text together with bomBytes = 2.
function decodeUtf16(bytes) {
    const codec = utf16CodecForBom(bytes);
    if (codec === undefined) {
        return undefined;
    }
    const decoded = new TextDecoder(codec, { fatal: false }).decode(bytes);
    // TextDecoder normally consumes the BOM itself; drop a surviving leading U+FEFF
    // anyway so it can never shift text offsets.
    const text = decoded.startsWith("\uFEFF") ? decoded.slice(1) : decoded;
    return { text, bomBytes: 2 };
}
|
|
79
|
+
// True when `cp` is a Unicode scalar value: an integer in [0, 0x10FFFF] that is not a
// surrogate (0xD800-0xDFFF). String.fromCodePoint throws on out-of-range input, and a
// lone surrogate is not a valid character, so both are rejected up front.
function isValidScalarCodePoint(cp) {
    if (!Number.isInteger(cp) || cp < 0 || cp > 0x10ffff) {
        return false;
    }
    return cp < 0xd800 || cp > 0xdfff;
}
// GRD-027: decodes the body of an XML numeric character reference, i.e. the text
// between the `&#` opener and the closing `;` - for example "8217" (decimal) or
// "xE9" / "x2019" (hex). Returns undefined for malformed or out-of-range references
// (including surrogates) so the caller can leave the literal text intact; a bad
// reference must never crash the parser.
function decodeNumericCharacterReference(body) {
    const isHex = body.startsWith("x") || body.startsWith("X");
    const digits = isHex ? body.slice(1) : body;
    if (digits.length === 0) {
        return undefined;
    }
    const digitPattern = isHex ? /^[0-9a-fA-F]+$/ : /^[0-9]+$/;
    if (!digitPattern.test(digits)) {
        return undefined;
    }
    const codePoint = Number.parseInt(digits, isHex ? 16 : 10);
    return isValidScalarCodePoint(codePoint) ? String.fromCodePoint(codePoint) : undefined;
}
// Replacement text for the five predefined XML entities, keyed by entity name
// (the part between the ampersand and the semicolon).
const PREDEFINED_XML_ENTITIES = Object.freeze({
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    amp: "&",
});
// Shared OOXML/HTML entity decoder for docx/xlsx text runs.
//
// Numeric references (decimal, or hex with a lowercase `x` prefix) and the five
// predefined named entities (lt, gt, quot, apos, amp) are decoded in ONE left-to-right
// pass. Text produced by a replacement is never scanned again, so:
//   - an escaped ampersand followed by "#65;" yields the literal reference text, and
//   - a numeric ampersand reference (decimal 38) followed by "lt;" yields the literal
//     entity text instead of being decoded a second time into "<".
// Malformed or out-of-range numeric references are left untouched.
//
// value:   text that may contain XML entities
// returns: the text with every recognised entity replaced by its character
export function decodeXmlEntities(value) {
    return value.replace(/&(?:#(x?[0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g, (match, numericBody, entityName) => {
        if (entityName !== undefined) {
            return PREDEFINED_XML_ENTITIES[entityName];
        }
        return decodeNumericCharacterReference(numericBody) ?? match;
    });
}
|
|
112
|
+
// Decodes raw document bytes into text and reports how many leading bytes were a
// byte-order mark.
//   - Input starting with a UTF-16 BOM is delegated to decodeUtf16.
//   - Everything else is decoded as UTF-8 in non-fatal mode, so invalid sequences
//     become U+FFFD instead of throwing.
//
// bytes:   Uint8Array holding the raw input
// returns: { text, bomBytes } - bomBytes is 3 when the input starts with the UTF-8 BOM
//          (EF BB BF) and 0 otherwise; for UTF-16 input it is whatever decodeUtf16 reports.
export function decodeUtf8(bytes) {
    const utf16 = decodeUtf16(bytes);
    if (utf16 !== undefined) {
        return utf16;
    }
    // Detect the BOM on the raw bytes: TextDecoder strips a leading UTF-8 BOM by default
    // (ignoreBOM = false), so inspecting the decoded text would miss it and report 0.
    const hasUtf8Bom = bytes.byteLength >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
    const raw = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
    // Defensive: if a U+FEFF still leads the text (e.g. a doubled BOM), drop it so it
    // never survives into the returned text.
    const text = raw.startsWith("\uFEFF") ? raw.slice(1) : raw;
    return { text, bomBytes: hasUtf8Bom ? 3 : 0 };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"csv-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/csv-parser.ts"],"names":[],"mappings":"AAwBA,OAAO,KAAK,EAAE,aAAa,EAAuC,MAAM,YAAY,CAAC;AAiMrF,eAAO,MAAM,SAAS,EAAE,aA8BtB,CAAC"}
|