@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
import { Buffer } from "node:buffer";
|
|
2
|
+
import yauzl from "yauzl";
|
|
3
|
+
import { decodeXmlEntities, diagnostic, emptyResult, objectLimitDiagnostic, oversizeDiagnostic, shouldStop, } from "./_internal.js";
|
|
4
|
+
// Identity values reported through the parser capability descriptor.
const PARSER_ID = "xlsx";
const PARSER_VERSION = "1";
// Third-party packages the parser relies on; frozen because the list is shared between descriptors.
const DEPENDENCY_VERSIONS = Object.freeze([Object.freeze({ packageName: "yauzl", version: "3.4.0" })]);
// Media type accepted as an xlsx workbook.
const XLSX_MEDIA = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
// Absolute ceiling (32 MiB) for the inflated size of a single XML entry.
const MAX_XML_INFLATED_BYTES = 32 * 2 ** 20;
// Largest accepted uncompressed/compressed size ratio for a single XML entry.
const MAX_XML_INFLATE_RATIO = 100;
// Archive path prefix under which worksheet parts are looked up.
const SHEET_ENTRY_PREFIX = "xl/worksheets/";
|
|
13
|
+
/**
 * Decides whether a parser input should be handled as an xlsx workbook.
 *
 * Both the extension and the media type are lower-cased before comparison; a match on
 * either one is sufficient.
 *
 * @param input - object exposing string `extension` and `mediaType` properties.
 * @returns {boolean} true when the extension is "xlsx" or the media type equals XLSX_MEDIA.
 */
function isXlsx(input) {
    const lowerExtension = input.extension.toLowerCase();
    const lowerMediaType = input.mediaType.toLowerCase();
    if (lowerExtension === "xlsx") {
        return true;
    }
    return lowerMediaType === XLSX_MEDIA;
}
|
|
18
|
+
/**
 * Builds the result returned when the caller aborted before parsing started.
 *
 * @param capability - capability descriptor forwarded to `emptyResult`.
 * @param input - parser input; only `documentId` is read.
 * @param options - parser options forwarded to `emptyResult`.
 * @returns the value of `emptyResult` carrying a single info-level PARSER_CANCELLED diagnostic.
 */
function cancelled(capability, input, options) {
    const notice = diagnostic("PARSER_CANCELLED", "caller aborted parser", input.documentId, "info");
    return emptyResult(capability, input.documentId, options, [notice]);
}
|
|
23
|
+
/**
 * Creates the synchronous `parse` entry point for this parser.
 *
 * The returned function never reads the document bytes: it reports that xlsx parsing is
 * only available through the asynchronous path.
 *
 * @param capability - capability descriptor forwarded to `emptyResult`.
 * @returns a `(input, options)` function producing an empty result with an info-level
 *   UNSUPPORTED_FORMAT diagnostic and one "unsupported-media" entry.
 */
function syncFallback(capability) {
    return (input, options) => {
        const notice = diagnostic("UNSUPPORTED_FORMAT", "xlsx parser requires async caller; use parseAsync via discovery", input.documentId, "info");
        const marker = { kind: "unsupported-media", documentId: input.documentId, reason: "xlsx-async-required" };
        return emptyResult(capability, input.documentId, options, [notice], [marker]);
    };
}
|
|
28
|
+
/**
 * Normalizes an arbitrary thrown or callback value into an Error.
 *
 * @param error - value to normalize.
 * @param {string} fallback - message used when `error` is not an Error instance.
 * @returns {Error} `error` itself when it already is an Error, otherwise `new Error(fallback)`.
 */
function toError(error, fallback) {
    if (error instanceof Error) {
        return error;
    }
    return new Error(fallback);
}
|
|
31
|
+
/**
 * Closes a zip handle as best-effort cleanup.
 *
 * @param zip - object exposing a `close()` method.
 * @returns {void} nothing; an exception thrown by `close()` is deliberately discarded.
 */
function closeZipQuietly(zip) {
    try {
        zip.close();
    }
    catch {
        // Close failures are non-fatal during parser cleanup.
        return;
    }
}
|
|
39
|
+
/**
 * Opens an in-memory zip archive through `yauzl.fromBuffer`.
 *
 * The archive is opened with `lazyEntries: true`, so entries are only delivered after
 * `readEntry()` is called on the returned handle.
 *
 * @param bytes - archive content; converted with `Buffer.from` before being handed to yauzl.
 * @returns {Promise} resolves with the zip handle, or rejects with an Error (a non-Error
 *   callback value is replaced by "failed to open xlsx zip").
 */
function openZip(bytes) {
    const zipOptions = { lazyEntries: true, decodeStrings: true };
    return new Promise((resolve, reject) => {
        const onOpened = (error, zip) => {
            if (error === null) {
                resolve(zip);
                return;
            }
            reject(toError(error, "failed to open xlsx zip"));
        };
        yauzl.fromBuffer(Buffer.from(bytes), zipOptions, onOpened);
    });
}
|
|
50
|
+
/**
 * Derives the per-entry inflate cap from the caller's input-size limit.
 *
 * The cap is ten times the (floored, at least 1) input limit, never more than
 * MAX_XML_INFLATED_BYTES.
 *
 * A limit that is not a number (NaN, undefined, a non-numeric string) used to produce a
 * NaN cap. Every `size > cap` comparison against NaN is false, which silently disabled
 * the inflate limit. Such input now falls back to the absolute ceiling.
 *
 * @param {number} maxInputBytes - maximum accepted size of the raw workbook, in bytes.
 * @returns {number} maximum inflated size, in bytes, allowed for one XML entry.
 */
function maxInflatedEntryBytes(maxInputBytes) {
    const requested = Math.floor(maxInputBytes);
    if (Number.isNaN(requested)) {
        // Unknown caller limit: keep the hard ceiling instead of propagating NaN.
        return MAX_XML_INFLATED_BYTES;
    }
    const inputCap = Math.max(1, requested);
    return Math.min(MAX_XML_INFLATED_BYTES, inputCap * 10);
}
|
|
54
|
+
/**
 * Tells whether a zip entry is one of the workbook parts this parser reads.
 *
 * @param {string} name - entry path inside the archive.
 * @returns {boolean} true for the workbook part, its relationships part, the shared
 *   strings part, and any ".xml" entry under SHEET_ENTRY_PREFIX.
 */
function isRelevantEntry(name) {
    switch (name) {
        case "xl/workbook.xml":
        case "xl/_rels/workbook.xml.rels":
        case "xl/sharedStrings.xml":
            return true;
        default:
            return name.startsWith(SHEET_ENTRY_PREFIX) && name.endsWith(".xml");
    }
}
|
|
60
|
+
/**
 * Rejects a zip entry whose declared sizes exceed the parser limits.
 *
 * Only the sizes recorded on the entry are inspected here; nothing is inflated.
 *
 * @param entry - object exposing numeric `uncompressedSize` and `compressedSize`.
 * @param {number} maxInflatedBytes - largest accepted uncompressed size.
 * @throws {Error} when the uncompressed size is above `maxInflatedBytes`, or when the
 *   compressed size is positive and the size ratio is above MAX_XML_INFLATE_RATIO.
 */
function assertEntryWithinLimits(entry, maxInflatedBytes) {
    const inflated = entry.uncompressedSize;
    if (inflated > maxInflatedBytes) {
        throw new Error("xlsx xml inflated size exceeds parser limit");
    }
    const deflated = entry.compressedSize;
    if (!(deflated > 0)) {
        // No usable compressed size: the ratio check is skipped.
        return;
    }
    if (inflated / deflated > MAX_XML_INFLATE_RATIO) {
        throw new Error("xlsx xml compression ratio exceeds parser limit");
    }
}
|
|
69
|
+
/**
 * Destroys a read stream when it offers a `destroy` method.
 *
 * @param readStream - stream-like object; objects without a callable `destroy` are left alone.
 * @param error - value passed as the only argument to `destroy`.
 * @returns {void}
 */
function destroyStream(readStream, error) {
    const { destroy } = readStream;
    if (typeof destroy !== "function") {
        return;
    }
    Reflect.apply(destroy, readStream, [error]);
}
|
|
75
|
+
/**
 * Inflates one zip entry and returns its content decoded as UTF-8.
 *
 * The declared entry sizes are checked first (this check throws synchronously). While
 * streaming, the number of bytes actually received is counted, so an entry that inflates
 * past `maxInflatedBytes` is rejected even if its declared size was within limits.
 *
 * @param zip - zip handle exposing `openReadStream(entry, callback)`.
 * @param entry - entry to read.
 * @param {number} maxInflatedBytes - largest accepted inflated size in bytes.
 * @returns {Promise<string>} the entry text; rejects when the stream cannot be opened,
 *   emits an error, or delivers more than `maxInflatedBytes` bytes.
 */
function readEntryText(zip, entry, maxInflatedBytes) {
    assertEntryWithinLimits(entry, maxInflatedBytes);
    return new Promise((resolve, reject) => {
        zip.openReadStream(entry, (openError, stream) => {
            if (openError !== null) {
                reject(toError(openError, "failed to open xlsx entry stream"));
                return;
            }
            const collected = [];
            let received = 0;
            let done = false;
            // Rejects once, then tears the stream down so no further data is inflated.
            const fail = (cause) => {
                if (done) {
                    return;
                }
                done = true;
                reject(cause);
                destroyStream(stream, cause);
            };
            const onData = (chunk) => {
                if (done) {
                    return;
                }
                received += chunk.byteLength;
                if (received > maxInflatedBytes) {
                    fail(new Error("xlsx xml inflated stream exceeds parser limit"));
                    return;
                }
                collected.push(chunk);
            };
            const onEnd = () => {
                if (done) {
                    return;
                }
                done = true;
                resolve(Buffer.concat(collected).toString("utf8"));
            };
            stream.on("data", onData);
            stream.on("end", onEnd);
            stream.on("error", (streamError) => {
                fail(streamError);
            });
        });
    });
}
|
|
116
|
+
/**
 * Detaches the three listeners installed while walking a zip archive.
 *
 * @param zip - emitter exposing `removeListener(eventName, handler)`.
 * @param listeners - object with `onEntry`, `onEnd` and `onError` handlers, removed from
 *   the "entry", "end" and "error" events respectively.
 * @returns {void}
 */
function removeZipEntryListeners(zip, listeners) {
    const bindings = [
        ["entry", listeners.onEntry],
        ["end", listeners.onEnd],
        ["error", listeners.onError],
    ];
    for (const [eventName, handler] of bindings) {
        zip.removeListener(eventName, handler);
    }
}
|
|
121
|
+
/**
 * Walks a lazily-read zip archive and collects the text of every relevant entry.
 *
 * Entries are requested one at a time with `zip.readEntry()`: the next entry is only
 * requested after the current one has been skipped or fully read. The promise settles
 * once, and the listeners are removed at that moment.
 *
 * @param zip - zip handle emitting "entry", "end" and "error" and exposing `readEntry()`.
 * @param {number} maxInflatedBytes - per-entry inflate cap forwarded to `readEntryText`.
 * @returns {Promise<Array<{name: string, xml: string}>>} collected entries in archive
 *   order; rejects on a zip error or when reading a relevant entry fails.
 */
function readRelevantEntries(zip, maxInflatedBytes) {
    return new Promise((resolve, reject) => {
        const collected = [];
        let finished = false;
        // Marks the promise as settled and detaches the listeners; false when already settled.
        const finish = () => {
            if (finished) {
                return false;
            }
            finished = true;
            removeZipEntryListeners(zip, { onEntry, onEnd, onError });
            return true;
        };
        const fail = (error) => {
            if (finish()) {
                reject(error);
            }
        };
        const processEntry = async (entry) => {
            if (isRelevantEntry(entry.fileName)) {
                try {
                    const xml = await readEntryText(zip, entry, maxInflatedBytes);
                    collected.push({ name: entry.fileName, xml });
                    zip.readEntry();
                }
                catch (error) {
                    fail(toError(error, "failed to read xlsx entry"));
                }
                return;
            }
            // Irrelevant entry: move straight on to the next one.
            zip.readEntry();
        };
        const onEntry = (entry) => {
            void processEntry(entry);
        };
        const onEnd = () => {
            if (finish()) {
                resolve(collected);
            }
        };
        const onError = (error) => {
            fail(toError(error, "failed to read xlsx zip"));
        };
        zip.on("entry", onEntry);
        zip.on("end", onEnd);
        zip.on("error", onError);
        zip.readEntry();
    });
}
|
|
168
|
+
// GRD-027: delegate to the shared decoder so cell text resolves numeric character references
|
|
169
|
+
// (smart quotes, accents) instead of surfacing the undecoded reference, e.g. a literal `&#8217;`.
|
|
170
|
+
/**
 * Decodes XML entities in a text fragment by delegating to the shared `decodeXmlEntities`.
 *
 * @param value - raw text taken from the workbook XML.
 * @returns whatever `decodeXmlEntities` returns for `value`.
 */
function decodeXml(value) {
    const decoded = decodeXmlEntities(value);
    return decoded;
}
|
|
173
|
+
/**
 * Extracts one attribute value from the text of an XML start tag.
 *
 * Improvements over the previous pattern, all permitted by XML 1.0:
 *  - values delimited by single quotes are recognised as well as double-quoted ones;
 *  - whitespace around the `=` sign is tolerated;
 *  - `name` is escaped before it is placed in the pattern, so a name containing a
 *    regular-expression metacharacter is matched literally.
 *
 * The name must start the string or follow whitespace, so asking for `r` does not match
 * an attribute such as `xr`.
 *
 * @param {string} tag - text of a start tag, e.g. `<c r="A1" t="s">`.
 * @param {string} name - attribute name, including any prefix (e.g. "r:id").
 * @returns {string | undefined} the entity-decoded value, or undefined when absent.
 */
function attribute(tag, name) {
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const pattern = new RegExp(`(?:^|\\s)${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "u");
    const match = pattern.exec(tag);
    // Group 1 holds a double-quoted value, group 2 a single-quoted one.
    const raw = match?.[1] ?? match?.[2];
    return raw === undefined ? undefined : decodeXml(raw);
}
|
|
178
|
+
/**
 * Strips markup from an XML fragment and decodes the remaining text.
 *
 * Everything from a `<` up to and including the next `>` is dropped; the characters that
 * are left are concatenated and passed through `decodeXml`.
 *
 * @param {string} value - XML fragment.
 * @returns the decoded text content.
 */
function xmlTextContent(value) {
    const kept = [];
    let insideMarkup = false;
    for (const symbol of value) {
        if (insideMarkup) {
            // Only `>` ends the markup run; a nested `<` keeps us inside it.
            insideMarkup = symbol !== ">";
        }
        else if (symbol === "<") {
            insideMarkup = true;
        }
        else {
            kept.push(symbol);
        }
    }
    return decodeXml(kept.join(""));
}
|
|
195
|
+
/**
 * Reads the shared-string table (`xl/sharedStrings.xml`) into an array indexed the same
 * way cells reference it.
 *
 * Fix: empty-element tags are now matched on their own. Previously a self-closing
 * `<si/>` was merged with the following item, which shifted the index of every later
 * string, and a self-closing `<t/>` inside a rich-text run swallowed markup up to the
 * next `</t>`.
 *
 * Each item contributes the concatenation of its `<t>` elements; an item without any
 * `<t>` falls back to its tag-stripped text.
 *
 * @param {string} xml - content of the shared strings part.
 * @param input - parser input; only `documentId` is read (for diagnostics).
 * @param options - parser options; `now()` and `maxObjectsPerDocument` are read here and
 *   the object is forwarded to `shouldStop`.
 * @returns {{strings: string[], diagnostics: Array}} strings in document order plus any
 *   diagnostic explaining why reading stopped early.
 */
function parseSharedStrings(xml, input, options) {
    const strings = [];
    const diagnostics = [];
    const startedAt = options.now();
    // Either an empty-element tag or a start tag followed by the nearest matching end tag.
    const itemPattern = /<si\b[^>]*?(?:\/>|>[\s\S]*?<\/si>)/gi;
    const textPattern = /<t\b[^>]*?(?:\/>|>([\s\S]*?)<\/t>)/gi;
    for (const match of xml.matchAll(itemPattern)) {
        const limit = shouldStop(startedAt, options, strings.length);
        if (limit.stop && limit.code !== undefined && limit.message !== undefined) {
            diagnostics.push(diagnostic(limit.code, limit.message, input.documentId, "info"));
            break;
        }
        if (strings.length >= options.maxObjectsPerDocument) {
            diagnostics.push(objectLimitDiagnostic(input.documentId, options.maxObjectsPerDocument));
            break;
        }
        const itemXml = match[0];
        // A self-closing <t/> has no capture group and contributes the empty string.
        const parts = [...itemXml.matchAll(textPattern)].map((part) => decodeXml(part[1] ?? ""));
        strings.push(parts.length > 0 ? parts.join("") : xmlTextContent(itemXml));
    }
    return { strings, diagnostics };
}
|
|
217
|
+
/**
 * Builds a lookup from relationship id to archive entry name out of the workbook
 * relationships part.
 *
 * A target starting with "/" is taken as archive-absolute (the slash is dropped); any
 * other target is placed under "xl/" after removing a leading "./". Relationships
 * lacking an `Id` or a `Target` are ignored.
 *
 * @param {string} xml - content of `xl/_rels/workbook.xml.rels`.
 * @returns {Map<string, string>} relationship id to entry name.
 */
function parseRelationships(xml) {
    const targetsById = new Map();
    const tags = xml.match(/<Relationship\b[^>]*>/gi) ?? [];
    for (const tag of tags) {
        const id = attribute(tag, "Id");
        const target = attribute(tag, "Target");
        if (id !== undefined && target !== undefined) {
            const entryName = target.startsWith("/")
                ? target.slice(1)
                : `xl/${target.replace(/^\.\//u, "")}`;
            targetsById.set(id, entryName);
        }
    }
    return targetsById;
}
|
|
229
|
+
/**
 * Resolves the list of sheets (display name plus archive entry) to read.
 *
 * Sheets declared in the workbook part are used when at least one of them resolves to an
 * entry through the relationships part. Otherwise the worksheet entries found in the
 * archive are used directly with generated names.
 *
 * Fix (fallback path): generated names were assigned in archive order and the list was
 * sorted afterwards, so "Sheet1" could be attached to `sheet2.xml` and the names did not
 * follow the returned order. Entries are now sorted first, with numeric-aware comparison
 * so `sheet2.xml` precedes `sheet10.xml`, and numbered afterwards.
 *
 * @param {string | undefined} workbookXml - content of `xl/workbook.xml`, if present.
 * @param {string | undefined} relsXml - content of `xl/_rels/workbook.xml.rels`, if present.
 * @param {Array<{name: string}>} worksheetEntries - worksheet entries found in the archive;
 *   not mutated.
 * @returns {Array<{name: string, entryName: string}>} sheets in reading order.
 */
function parseWorkbookSheets(workbookXml, relsXml, worksheetEntries) {
    const rels = relsXml === undefined ? new Map() : parseRelationships(relsXml);
    const sheets = [];
    if (workbookXml !== undefined) {
        for (const match of workbookXml.matchAll(/<sheet\b[^>]*>/gi)) {
            const tag = match[0];
            const name = attribute(tag, "name") ?? "Sheet";
            const relId = attribute(tag, "r:id");
            const entryName = relId === undefined ? undefined : rels.get(relId);
            // A sheet whose relationship cannot be resolved is dropped rather than guessed.
            if (entryName !== undefined) {
                sheets.push({ name, entryName });
            }
        }
    }
    if (sheets.length > 0) {
        return sheets;
    }
    return worksheetEntries
        .map((entry) => entry.name)
        .sort((a, b) => a.localeCompare(b, "en", { numeric: true }))
        .map((entryName, index) => ({ name: `Sheet${String(index + 1)}`, entryName }));
}
|
|
248
|
+
/**
 * Returns the column letters for a cell.
 *
 * The leading letters of the cell reference are used (upper-cased) when there are any;
 * otherwise the letters are generated from a zero-based position in bijective base 26
 * (0 -> "A", 25 -> "Z", 26 -> "AA").
 *
 * @param {string | undefined} ref - cell reference such as "B12", if the cell has one.
 * @param {number} fallbackIndex - zero-based position of the cell within its row.
 * @returns {string} column letters; empty when no reference letters exist and the
 *   fallback index is negative.
 */
function columnName(ref, fallbackIndex) {
    const letters = ref === undefined ? undefined : /^([A-Z]+)/iu.exec(ref)?.[1];
    if (letters !== undefined) {
        return letters.toUpperCase();
    }
    const digits = [];
    for (let remaining = fallbackIndex + 1; remaining > 0; remaining = Math.floor((remaining - 1) / 26)) {
        digits.unshift(String.fromCharCode(65 + ((remaining - 1) % 26)));
    }
    return digits.join("");
}
|
|
263
|
+
/**
 * Reads the one-based row number from a `<row>` start tag.
 *
 * @param {string} rowTag - text of the row start tag.
 * @param {number} fallback - value returned when the `r` attribute is missing or does not
 *   parse to a positive integer.
 * @returns {number} the declared row number, or `fallback`.
 */
function rowNumber(rowTag, fallback) {
    const declared = attribute(rowTag, "r");
    if (declared !== undefined) {
        const value = Number.parseInt(declared, 10);
        if (Number.isFinite(value) && value > 0) {
            return value;
        }
    }
    return fallback;
}
|
|
270
|
+
/**
 * Extracts the text of an inline-string cell.
 *
 * The first `<is>…</is>` element of the cell is located and the decoded content of
 * every `<t>` element inside it is concatenated.
 *
 * @param {string} cellXml - full XML of the cell.
 * @returns {string} the inline text, or "" when the cell has no `<is>` element.
 */
function inlineStringValue(cellXml) {
    const inlineMatch = /<is\b[\s\S]*?<\/is>/iu.exec(cellXml);
    if (inlineMatch === null) {
        return "";
    }
    const runs = inlineMatch[0].matchAll(/<t\b[^>]*>([\s\S]*?)<\/t>/gi);
    return Array.from(runs, (run) => decodeXml(run[1] ?? "")).join("");
}
|
|
278
|
+
/**
 * Converts the content of a cell's `<v>` element into the text to report.
 *
 * For cells of type "s" the decoded content is parsed as an index into the shared-string
 * table; every other type reports the decoded content as-is.
 *
 * @param {string | undefined} type - value of the cell's `t` attribute.
 * @param {string} raw - undecoded content of the `<v>` element.
 * @param {string[]} sharedStrings - shared-string table.
 * @returns {string} the cell text; "" when a shared-string index is unparseable or has
 *   no entry in the table.
 */
function rawCellValue(type, raw, sharedStrings) {
    const text = decodeXml(raw);
    if (type === "s") {
        const position = Number.parseInt(text, 10);
        if (!Number.isFinite(position)) {
            return "";
        }
        return sharedStrings[position] ?? "";
    }
    return text;
}
|
|
285
|
+
/**
 * Renders a cell's formula as text.
 *
 * @param {string} cellXml - full XML of the cell.
 * @returns {string} "=" followed by the decoded content of the first `<f>…</f>` element,
 *   or "" when there is none.
 */
function formulaCellValue(cellXml) {
    const found = /<f\b[^>]*>([\s\S]*?)<\/f>/iu.exec(cellXml);
    const body = found?.[1];
    if (body === undefined) {
        return "";
    }
    return `=${decodeXml(body)}`;
}
|
|
289
|
+
/**
 * Produces the text of one cell.
 *
 * Resolution order: an inline string when the cell type is "inlineStr"; otherwise the
 * stored `<v>` value (resolved through the shared-string table for type "s"); otherwise
 * the formula text.
 *
 * @param {string} cellXml - full XML of the cell, starting with its `<c …>` tag.
 * @param {string[]} sharedStrings - shared-string table.
 * @returns {string} the cell text, possibly empty.
 */
function cellValue(cellXml, sharedStrings) {
    const openTag = /^<c\b[^>]*>/iu.exec(cellXml);
    const type = attribute(openTag === null ? "" : openTag[0], "t");
    if (type === "inlineStr") {
        return inlineStringValue(cellXml);
    }
    const stored = /<v\b[^>]*>([\s\S]*?)<\/v>/iu.exec(cellXml)?.[1];
    return stored === undefined
        ? formulaCellValue(cellXml)
        : rawCellValue(type, stored, sharedStrings);
}
|
|
299
|
+
/**
 * Projects a worksheet's XML into one text line per non-empty row.
 *
 * Fix: empty-element tags are now matched on their own. The previous patterns required
 * a closing `</row>` / `</c>`, so a self-closing `<row r="1"/>` or `<c r="A1" s="1"/>`
 * was merged with the element after it. The next element's value was then reported
 * under the wrong row number, the wrong column, and the wrong cell type (a
 * shared-string index could surface as a bare number).
 *
 * Rows and cells without a usable `r` attribute fall back to positional numbering.
 * Empty cells still advance the positional column counter; rows with no non-empty cell
 * are omitted.
 *
 * @param {string} sheetName - name used as the prefix of each line.
 * @param {string} xml - worksheet XML.
 * @param {string[]} sharedStrings - shared-string table used to resolve cell values.
 * @returns {Array<{sheetName: string, rowNumber: number, text: string}>} rows in document
 *   order; `text` has the form `Sheet!3: A=1 | C=x` followed by a newline.
 */
function projectSheetRows(sheetName, xml, sharedStrings) {
    const rows = [];
    let fallbackRow = 1;
    // Either an empty-element tag or a start tag followed by the nearest matching end tag.
    for (const rowMatch of xml.matchAll(/<row\b[^>]*?(?:\/>|>[\s\S]*?<\/row>)/gi)) {
        const rowXml = rowMatch[0];
        const rowTag = /^<row\b[^>]*>/iu.exec(rowXml)?.[0] ?? "";
        const number = rowNumber(rowTag, fallbackRow);
        fallbackRow = number + 1;
        const cells = [];
        let fallbackCol = 0;
        for (const cellMatch of rowXml.matchAll(/<c\b[^>]*?(?:\/>|>[\s\S]*?<\/c>)/gi)) {
            const cellXml = cellMatch[0];
            const cellTag = /^<c\b[^>]*>/iu.exec(cellXml)?.[0] ?? "";
            const value = cellValue(cellXml, sharedStrings).trim();
            if (value.length > 0) {
                cells.push({ column: columnName(attribute(cellTag, "r"), fallbackCol), value });
            }
            // Empty cells are skipped but still occupy a column position.
            fallbackCol += 1;
        }
        if (cells.length === 0) {
            continue;
        }
        const rendered = cells.map((cell) => `${cell.column}=${cell.value}`).join(" | ");
        rows.push({
            sheetName,
            rowNumber: number,
            text: `${sheetName}!${String(number)}: ${rendered}\n`,
        });
    }
    return rows;
}
|
|
332
|
+
/**
 * Turns projected rows into parser units and the normalized text they point into.
 *
 * Row texts are concatenated in order; each unit records the half-open character range
 * `[characterStart, characterEnd)` its row occupies in that text. Emission stops early
 * when `shouldStop` reports a limit with both a code and a message, and that limit is
 * recorded as an info diagnostic.
 *
 * @param {Array<{sheetName: string, rowNumber: number, text: string}>} rows - projected rows.
 * @param input - parser input; only `documentId` is read.
 * @param options - parser options; `now()` is called once and the object is forwarded to
 *   `shouldStop`.
 * @returns {{units: Array, diagnostics: Array, normalizedText: string}}
 */
function emitUnits(rows, input, options) {
    const units = [];
    const diagnostics = [];
    let normalizedText = "";
    const startedAt = options.now();
    for (const row of rows) {
        const limit = shouldStop(startedAt, options, units.length);
        if (limit.stop && limit.code !== undefined && limit.message !== undefined) {
            diagnostics.push(diagnostic(limit.code, limit.message, input.documentId, "info"));
            break;
        }
        const characterStart = normalizedText.length;
        normalizedText += row.text;
        units.push({
            kind: "csv-row",
            documentId: input.documentId,
            tableName: row.sheetName,
            // Units use a zero-based row index; sheet rows are one-based.
            rowIndex: Math.max(0, row.rowNumber - 1),
            characterStart,
            characterEnd: normalizedText.length,
        });
    }
    return { units, diagnostics, normalizedText };
}
|
|
358
|
+
/**
 * Assembles the parser result from the XML parts read out of the archive.
 *
 * Steps: read the shared-string table, resolve the sheet list, project every sheet into
 * rows, then emit units and normalized text.
 *
 * Fix: rows were appended with `rows.push(...projectSheetRows(...))`. Spreading passes
 * every row as a separate call argument, and a large enough sheet exceeds the engine's
 * argument limit and throws a RangeError. The caller's catch-all then reported a valid
 * workbook as malformed instead of reporting the object limit. Rows are now appended one
 * at a time, and collection stops as soon as the limit is exceeded. The returned value
 * is unchanged for workbooks that did not hit that error.
 *
 * @param {Array<{name: string, xml: string}>} entries - relevant archive entries.
 * @param input - parser input; only `documentId` is read here.
 * @param options - parser options; `maxObjectsPerDocument` is read here and the object
 *   is forwarded to the helpers.
 * @returns the `emptyResult` shape: with only diagnostics when the shared strings report
 *   an error or the row count exceeds `maxObjectsPerDocument`, otherwise with the emitted
 *   units and `normalizedText`.
 */
function parseWorkbook(entries, input, options) {
    const byName = new Map(entries.map((entry) => [entry.name, entry.xml]));
    const shared = byName.get("xl/sharedStrings.xml");
    const sharedResult = shared === undefined
        ? { strings: [], diagnostics: [] }
        : parseSharedStrings(shared, input, options);
    if (sharedResult.diagnostics.some((entry) => entry.severity === "error")) {
        return emptyResult(xlsxParser.capability, input.documentId, options, sharedResult.diagnostics);
    }
    const worksheetEntries = entries.filter((entry) => entry.name.startsWith(SHEET_ENTRY_PREFIX));
    const workbookSheets = parseWorkbookSheets(byName.get("xl/workbook.xml"), byName.get("xl/_rels/workbook.xml.rels"), worksheetEntries);
    const rows = [];
    for (const sheet of workbookSheets) {
        const xml = byName.get(sheet.entryName);
        if (xml === undefined) {
            // The sheet is declared but its part was not found in the archive.
            continue;
        }
        for (const row of projectSheetRows(sheet.name, xml, sharedResult.strings)) {
            rows.push(row);
            if (rows.length > options.maxObjectsPerDocument) {
                return emptyResult(xlsxParser.capability, input.documentId, options, [
                    objectLimitDiagnostic(input.documentId, options.maxObjectsPerDocument),
                ]);
            }
        }
    }
    const emitted = emitUnits(rows, input, options);
    return {
        ...emptyResult(xlsxParser.capability, input.documentId, options, [...sharedResult.diagnostics, ...emitted.diagnostics], emitted.units),
        normalizedText: emitted.normalizedText,
    };
}
|
|
387
|
+
/**
 * The xlsx parser: a frozen object bundling the capability descriptor, a synchronous
 * entry point that only reports "async required", and the real asynchronous parser.
 *
 * `parseAsync` never throws for a bad workbook: any failure while opening, reading or
 * projecting the archive is reported as a single MALFORMED_INPUT error diagnostic. The
 * abort signal is checked once, before the archive is opened.
 */
export const xlsxParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        dependencyVersions: DEPENDENCY_VERSIONS,
        matches: (input) => isXlsx(input),
    }),
    // The synchronous path gets its own descriptor object with the same fields.
    parse: syncFallback({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        dependencyVersions: DEPENDENCY_VERSIONS,
        matches: (input) => isXlsx(input),
    }),
    parseAsync: async (input, options) => {
        const size = input.bytes.byteLength;
        if (size > options.maxBytes) {
            return emptyResult(xlsxParser.capability, input.documentId, options, [
                oversizeDiagnostic(input.documentId, size, options.maxBytes),
            ]);
        }
        if (options.signal?.aborted === true) {
            return cancelled(xlsxParser.capability, input, options);
        }
        try {
            const zip = await openZip(input.bytes);
            try {
                const inflateCap = maxInflatedEntryBytes(options.maxBytes);
                const entries = await readRelevantEntries(zip, inflateCap);
                return parseWorkbook(entries, input, options);
            }
            finally {
                // The archive handle is released whether or not parsing succeeded.
                closeZipQuietly(zip);
            }
        }
        catch {
            const rejected = diagnostic("MALFORMED_INPUT", "xlsx parser rejected malformed or unsupported workbook", input.documentId, "error");
            return emptyResult(xlsxParser.capability, input.documentId, options, [rejected]);
        }
    },
});
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { KnowledgeStore } from "../store.js";
|
|
2
|
+
import type { AuditEventSink, CapsuleAuditEvent } from "./types.js";
|
|
3
|
+
/** Forwards `event` to `sink` (the implementation calls `sink.emit(event)`) and returns nothing. */
export declare function emitCapsuleAuditEvent(event: CapsuleAuditEvent, sink: AuditEventSink): void;
|
|
4
|
+
/** Creates an `AuditEventSink` backed by `store`; the implementation prepares its INSERT statements on the store's internal database handle. */
export declare function createSqliteAuditSink(store: KnowledgeStore): AuditEventSink;
|
|
5
|
+
//# sourceMappingURL=audit-emitter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audit-emitter.d.ts","sourceRoot":"","sources":["../../src/privacy/audit-emitter.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,OAAO,KAAK,EACV,cAAc,EACd,iBAAiB,EAGlB,MAAM,YAAY,CAAC;AAcpB,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,iBAAiB,EAAE,IAAI,EAAE,cAAc,GAAG,IAAI,CAE1F;AAED,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,cAAc,GAAG,cAAc,CAgB3E"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
// emitCapsuleAuditEvent — pure forwarder onto an `AuditEventSink`. The shape lets a caller
|
|
2
|
+
// compose the default node-sqlite sink (writes the schema-compatible event kinds to
|
|
3
|
+
// `capsule_membership_changes`) with any number of additional sinks (external evidence
|
|
4
|
+
// ledger, in-memory test capture, future sibling-table writer) by constructing their own
|
|
5
|
+
// `AuditEventSink` and chaining the calls.
|
|
6
|
+
//
|
|
7
|
+
// The default sink writes every metadata-only event to `capsule_audit_events` and also
|
|
8
|
+
// mirrors the source membership variants into the narrower `capsule_membership_changes`
|
|
9
|
+
// table that #263 already introduced for composition history.
|
|
10
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
11
|
+
// Parameterised INSERT for `capsule_membership_changes`. The named placeholders correspond to
// the object keys that `insertMembershipRow` passes to `statement.run`.
const INSERT_MEMBERSHIP_SQL = "INSERT INTO capsule_membership_changes (id, capsule_id, change_kind, source_id, details_json, occurred_at) VALUES (:id, :capsule_id, :change_kind, :source_id, :details_json, :occurred_at)";
|
|
12
|
+
// Parameterised INSERT for `capsule_audit_events`. The named placeholders correspond to the
// object keys that `insertAuditEventRow` passes to `statement.run`.
const INSERT_AUDIT_SQL = "INSERT INTO capsule_audit_events (id, capsule_id, kind, source_id, job_id, error_code, processed_documents, failed_documents, deleted_vector_count, deleted_extracted_text_count, details_json, occurred_at) VALUES (:id, :capsule_id, :kind, :source_id, :job_id, :error_code, :processed_documents, :failed_documents, :deleted_vector_count, :deleted_extracted_text_count, :details_json, :occurred_at)";
|
|
13
|
+
/**
 * Hands a capsule audit event to a sink.
 *
 * @param {object} event - Audit event; passed through untouched.
 * @param {{ emit: (event: object) => unknown }} sink - Receiver, invoked as `sink.emit(event)`.
 * @returns {void} Whatever the sink returns is discarded.
 */
export function emitCapsuleAuditEvent(event, sink) {
    // `void` makes it explicit that the sink's return value is not propagated.
    void sink.emit(event);
}
|
|
16
|
+
/**
 * Builds the SQLite-backed audit sink.
 *
 * Both INSERT statements are prepared once, up front, on `store._internal.db`. The returned
 * sink writes every event to the audit table and additionally mirrors `source-added` /
 * `source-removed` events into the membership table.
 *
 * @param {object} store - Knowledge store exposing `_internal.db.prepare(sql)`.
 * @returns {{ emit: (event: object) => void }} Sink whose `emit` performs the inserts.
 */
export function createSqliteAuditSink(store) {
    const { db } = store._internal;
    const auditStatement = db.prepare(INSERT_AUDIT_SQL);
    const membershipStatement = db.prepare(INSERT_MEMBERSHIP_SQL);
    const emit = (event) => {
        // The audit row is written for every event kind.
        insertAuditEventRow(auditStatement, event);
        // Only the two source membership kinds get a mirror row.
        switch (event.kind) {
            case "source-added":
                insertMembershipRow(membershipStatement, event, "add-source");
                break;
            case "source-removed":
                insertMembershipRow(membershipStatement, event, "remove-source");
                break;
            default:
                break;
        }
    };
    return { emit };
}
|
|
33
|
+
/**
 * Writes one row describing `event` through the prepared audit INSERT statement.
 *
 * Fields that only some event variants carry are bound as `null` when the property is
 * absent from the event object.
 *
 * @param {{ run: (params: object) => unknown }} statement - Prepared INSERT statement.
 * @param {object} event - Audit event to persist.
 * @returns {void}
 */
function insertAuditEventRow(statement, event) {
    // Presence is tested with `in`, so a property that exists is bound as-is.
    const optional = (key) => (key in event ? event[key] : null);
    const row = {
        id: randomUUID(),
        capsule_id: event.capsuleId,
        kind: event.kind,
        source_id: optional("sourceId"),
        job_id: optional("jobId"),
        error_code: optional("errorCode"),
        processed_documents: optional("processedDocuments"),
        failed_documents: optional("failedDocuments"),
        deleted_vector_count: optional("deletedVectorCount"),
        deleted_extracted_text_count: optional("deletedExtractedTextCount"),
        details_json: buildAuditDetailsJson(event),
        occurred_at: event.occurredAt,
    };
    statement.run(row);
}
|
|
49
|
+
/**
 * Replaces each chunk id with the first 16 hex characters of its SHA-256 digest.
 *
 * @param {readonly string[]} chunkIds - Raw chunk identifiers.
 * @returns {string[]} One 16-character hex fingerprint per input id, in the same order.
 */
function redactChunkIds(chunkIds) {
    const fingerprint = (chunkId) => {
        const digestHex = createHash("sha256").update(chunkId).digest("hex");
        return digestHex.slice(0, 16);
    };
    return chunkIds.map((chunkId) => fingerprint(chunkId));
}
|
|
52
|
+
/**
 * Builds the structured details payload for an audit event, keyed on `event.kind`.
 *
 * - indexing-job-* and retention-applied: a copy of `sourceIds`.
 * - retrieval-performed: source ids, hashed chunk ids, `referenceCount`, `noEvidence`.
 * - answer-context-assembled / model-context-sent: source ids, hashed chunk ids,
 *   `referenceCount`, `citationCount`, plus `modelId` when the event has that property.
 * - any other kind: `null`.
 *
 * @param {object} event - Audit event.
 * @returns {object | null} Details object, or `null` when the kind carries no details.
 */
function buildAuditDetails(event) {
    switch (event.kind) {
        case "indexing-job-started":
        case "indexing-job-completed":
        case "indexing-job-failed":
        case "retention-applied":
            return { sourceIds: [...event.sourceIds] };
        case "retrieval-performed":
            return {
                sourceIds: [...event.sourceIds],
                chunkIds: redactChunkIds(event.chunkIds),
                referenceCount: event.referenceCount,
                noEvidence: event.noEvidence,
            };
        case "answer-context-assembled":
        case "model-context-sent": {
            const details = {
                sourceIds: [...event.sourceIds],
                chunkIds: redactChunkIds(event.chunkIds),
                referenceCount: event.referenceCount,
                citationCount: event.citationCount,
            };
            // `modelId` is appended last, and only when the property exists on the event.
            if ("modelId" in event) {
                details.modelId = event.modelId;
            }
            return details;
        }
        default:
            return null;
    }
}
|
|
80
|
+
/**
 * Serialises the details payload of an audit event for the `details_json` column.
 *
 * @param {object} event - Audit event.
 * @returns {string | null} JSON text, or `null` when `buildAuditDetails` yields `null`.
 */
function buildAuditDetailsJson(event) {
    const payload = buildAuditDetails(event);
    if (payload === null) {
        return null;
    }
    return JSON.stringify(payload);
}
|
|
84
|
+
/**
 * Writes one membership-change row through the prepared membership INSERT statement.
 *
 * @param {{ run: (params: object) => unknown }} statement - Prepared INSERT statement.
 * @param {object} event - Event supplying `capsuleId`, `sourceId` and `occurredAt`.
 * @param {string} changeKind - Value stored in the `change_kind` column.
 * @returns {void}
 */
function insertMembershipRow(statement, event, changeKind) {
    const row = {
        id: randomUUID(),
        capsule_id: event.capsuleId,
        change_kind: changeKind,
        source_id: event.sourceId,
        // Membership rows never carry a details payload.
        details_json: null,
        occurred_at: event.occurredAt,
    };
    statement.run(row);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diagnostic-redactor.d.ts","sourceRoot":"","sources":["../../src/privacy/diagnostic-redactor.ts"],"names":[],"mappings":"AAsCA,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM,CAcnF"}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// redactDiagnosticMessage — privacy-aware redactor for parser diagnostic messages before
|
|
2
|
+
// they reach a `parser_diagnostics` row. Defense-in-depth around the existing #265
|
|
3
|
+
// `redactPathInDiagnostic` helper: that helper is path-shaped (it home-rewrites a full
|
|
4
|
+
// path string), but a diagnostic message often embeds a path inside prose
|
|
5
|
+
// ("failed to parse /Users/foo/secret.pdf at offset 42"). This module:
|
|
6
|
+
//
|
|
7
|
+
// 1. Replaces any in-message occurrence of `homePrefix` with `~` so the raw home path
|
|
8
|
+
// cannot leak even when wrapped in surrounding text.
|
|
9
|
+
// 2. Defers to `redactPathInDiagnostic` for control-char stripping, NUL truncation, and
|
|
10
|
+
// drive-letter masking. That helper's tests in keiko-contracts already pin each step.
|
|
11
|
+
// 3. Hard-caps the output at 1024 chars (structural backstop in case a misbehaving parser
|
|
12
|
+
// hands us 5 KB of raw extracted text in the message field — the parser SHOULD cap
|
|
13
|
+
// already, but we cannot trust an unbounded message into the audit ledger).
|
|
14
|
+
//
|
|
15
|
+
// The cap is plain slice (no trailing ellipsis) so the result length is `<= 1024`, not the
|
|
16
|
+
// `<= 1025` you would get from "…"-suffix on a 1024-char prefix.
|
|
17
|
+
import { redactPathInDiagnostic } from "@oscharko-dev/keiko-contracts";
|
|
18
|
+
// Upper bound on the length (as measured by `String.prototype.length`) of the message
// returned by `redactDiagnosticMessage`.
const HARD_CAP_CHARS = 1024;
|
|
19
|
+
// Characters that end a path candidate: `pathCandidateEnd` stops scanning at the first one.
const PATH_BREAK_CHARS = new Set([" ", "\n", "\r", "\t", '"', "'", "<", ">", "|"]);
|
|
20
|
+
// Characters that may immediately precede the start of a path candidate (checked by
// `hasPathBoundary`). Built from a string: a Set constructed from a string holds its
// individual characters, in order.
const PATH_BOUNDARY_CHARS = new Set(" \n\r\t\"'([{,;:=");
|
|
35
|
+
// Characters that `pathCandidateEnd` trims from the end of a path candidate, so sentence
// punctuation following a path is left in the surrounding text.
const TRAILING_PUNCTUATION = new Set([".", ",", ";", ":", "!", "?", ")", "]"]);
|
|
36
|
+
/**
 * Redacts filesystem paths from a parser diagnostic message and bounds its length.
 *
 * Steps, in order:
 *  1. Pass the whole message through `redactPathInDiagnostic` (per this module's header
 *     notes: control-character stripping, NUL truncation, drive-letter masking —
 *     NOTE(review): behaviour of that helper is defined in keiko-contracts, confirm there).
 *  2. Rewrite path-shaped tokens embedded in the prose via `redactPathCandidates`, using the
 *     forward-slash, trailing-slash-free form of `homePrefix`.
 *  3. Cap the result at `HARD_CAP_CHARS` UTF-16 code units without a trailing ellipsis.
 *     If the cut would fall between the two halves of a surrogate pair, the dangling high
 *     surrogate is dropped as well, so the stored text never ends in half a character.
 *
 * @param {string} message - Raw diagnostic message; any non-string yields `""`.
 * @param {string} homePrefix - Home directory prefix to rewrite to `~`.
 * @returns {string} Redacted message, at most `HARD_CAP_CHARS` code units long.
 */
export function redactDiagnosticMessage(message, homePrefix) {
    if (typeof message !== "string")
        return "";
    // Step 1: contracts helper over the whole message.
    const sanitised = redactPathInDiagnostic(message, { homePrefix });
    // Step 2: redact path-shaped tokens that appear inside surrounding prose.
    const normalisedPrefix = stripTrailingSlash(toForwardSlash(homePrefix));
    const homeRewritten = redactPathCandidates(sanitised, normalisedPrefix);
    // Step 3: hard cap.
    if (homeRewritten.length <= HARD_CAP_CHARS)
        return homeRewritten;
    // A plain slice at the cap can separate a high surrogate (0xD800-0xDBFF) from its low
    // surrogate (0xDC00-0xDFFF), leaving a lone surrogate that becomes U+FFFD once the text
    // is encoded as UTF-8. Back off by one unit in exactly that case.
    const lastKept = homeRewritten.charCodeAt(HARD_CAP_CHARS - 1);
    const firstDropped = homeRewritten.charCodeAt(HARD_CAP_CHARS);
    const splitsSurrogatePair = lastKept >= 0xd800 && lastKept <= 0xdbff &&
        firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    return homeRewritten.slice(0, splitsSurrogatePair ? HARD_CAP_CHARS - 1 : HARD_CAP_CHARS);
}
|
|
53
|
+
/**
 * Walks `message` left to right, replacing every path candidate with its redacted form
 * and copying everything between candidates unchanged.
 *
 * @param {string} message - Text to scan.
 * @param {string} homePrefix - Normalised home prefix handed to `redactCandidate`.
 * @returns {string} The message with each path candidate redacted.
 */
function redactPathCandidates(message, homePrefix) {
    let output = "";
    let cursor = 0;
    while (cursor < message.length) {
        const candidateStart = pathCandidateStart(message, cursor);
        if (candidateStart === -1) {
            // No further candidates: keep the remainder verbatim and stop.
            output += message.slice(cursor);
            break;
        }
        const candidateEnd = pathCandidateEnd(message, candidateStart);
        // Text before the candidate is copied as-is; the candidate itself is redacted.
        output += message.slice(cursor, candidateStart);
        output += redactCandidate(message.slice(candidateStart, candidateEnd), homePrefix);
        cursor = candidateEnd;
    }
    return output;
}
|
|
70
|
+
/**
 * Finds the first index at or after `from` where a path candidate begins.
 *
 * A candidate opens with `/`, with two backslashes, or with a drive-letter prefix
 * (see `isDriveLetterPrefix`), and only counts when `hasPathBoundary` accepts the position.
 *
 * @param {string} message - Text to scan.
 * @param {number} from - Index to start scanning from.
 * @returns {number} Index of the candidate's first character, or `-1` if there is none.
 */
function pathCandidateStart(message, from) {
    for (let position = from; position < message.length; position += 1) {
        const here = message[position];
        const following = message[position + 1];
        const opensCandidate = here === "/" ||
            (here === "\\" && following === "\\") ||
            isDriveLetterPrefix(here, following, message[position + 2]);
        if (opensCandidate && hasPathBoundary(message, position)) {
            return position;
        }
    }
    return -1;
}
|
|
84
|
+
/**
 * Computes the exclusive end index of the path candidate starting at `start`.
 *
 * The candidate runs up to the first character in `PATH_BREAK_CHARS` (or the end of the
 * message); characters in `TRAILING_PUNCTUATION` are then trimmed off its tail.
 *
 * @param {string} message - Text being scanned.
 * @param {number} start - Index of the candidate's first character.
 * @returns {number} Exclusive end index of the candidate.
 */
function pathCandidateEnd(message, start) {
    let stop = start;
    // Extend forward until a break character or the end of the message.
    for (; stop < message.length; stop += 1) {
        if (PATH_BREAK_CHARS.has(message[stop])) {
            break;
        }
    }
    // Give trailing punctuation back to the surrounding text.
    while (stop > start && TRAILING_PUNCTUATION.has(message[stop - 1])) {
        stop -= 1;
    }
    return stop;
}
|
|
100
|
+
/**
 * Redacts a single path candidate.
 *
 * After converting backslashes to forward slashes:
 *  - a path under `homePrefix` becomes `~` plus the remainder;
 *  - a path starting with `//` becomes `<unc>/<basename>`;
 *  - a path starting with a drive letter and `:/` becomes `<drive>/<basename>`;
 *  - any other path starting with `/` becomes `<path>/<basename>`;
 *  - anything else is returned in its forward-slash form.
 *
 * @param {string} candidate - Raw candidate text.
 * @param {string} homePrefix - Normalised home prefix; an empty string disables the `~` rewrite.
 * @returns {string} Redacted replacement text.
 */
function redactCandidate(candidate, homePrefix) {
    const path = toForwardSlash(candidate);
    if (homePrefix.length > 0 && isPrefixedPath(path, homePrefix)) {
        return `~${path.slice(homePrefix.length)}`;
    }
    // Pick the placeholder for the directory part; the order of the checks matters because a
    // `//` path also starts with `/`.
    let placeholder;
    if (path.startsWith("//")) {
        placeholder = "<unc>";
    }
    else if (/^[A-Za-z]:\//.test(path)) {
        placeholder = "<drive>";
    }
    else if (path.startsWith("/")) {
        placeholder = "<path>";
    }
    else {
        return path;
    }
    return `${placeholder}/${basenameOf(path)}`;
}
|
|
120
|
+
/**
 * Converts every backslash in `value` to a forward slash.
 *
 * @param {string} value - Text to normalise.
 * @returns {string} `value` with all `\` replaced by `/`.
 */
function toForwardSlash(value) {
    const everyBackslash = /\\/g;
    return value.replace(everyBackslash, "/");
}
|
|
123
|
+
/**
 * Returns the last `/`-separated segment of `value`, ignoring trailing slashes.
 *
 * @param {string} value - Forward-slash path.
 * @returns {string} Final segment; the whole trimmed value when it contains no `/`.
 */
function basenameOf(value) {
    const withoutTrailing = stripTrailingSlash(value);
    // lastIndexOf yields -1 when there is no slash, so the slice then starts at 0.
    return withoutTrailing.slice(withoutTrailing.lastIndexOf("/") + 1);
}
|
|
128
|
+
/**
 * Tells whether three consecutive characters form a drive-letter path prefix such as
 * `C:/` or `c:\`.
 *
 * @param {string | undefined} current - Candidate drive letter.
 * @param {string | undefined} next - Character after it; must be `:`.
 * @param {string | undefined} afterColon - Character after the colon; must be `/` or `\`.
 * @returns {boolean} `true` only when all three positions match.
 */
function isDriveLetterPrefix(current, next, afterColon) {
    if (current === undefined || afterColon === undefined) {
        return false;
    }
    if (next !== ":") {
        return false;
    }
    const isUpper = current >= "A" && current <= "Z";
    const isLower = current >= "a" && current <= "z";
    if (!isUpper && !isLower) {
        return false;
    }
    return afterColon === "/" || afterColon === "\\";
}
|
|
135
|
+
/**
 * Tells whether `value` is `prefix` itself or lies underneath it, i.e. the prefix is
 * followed by `/` or by nothing. A longer sibling name such as `/home/user2` does not
 * match the prefix `/home/user`.
 *
 * @param {string} value - Forward-slash path to test.
 * @param {string} prefix - Forward-slash prefix without trailing slash.
 * @returns {boolean} `true` when `value` equals `prefix` or starts with `prefix + "/"`.
 */
function isPrefixedPath(value, prefix) {
    return value === prefix || value.startsWith(`${prefix}/`);
}
|
|
141
|
+
/**
 * Tells whether a path candidate may start at `index`: either at the very beginning of
 * the message or directly after a character in `PATH_BOUNDARY_CHARS`.
 *
 * @param {string} message - Text being scanned.
 * @param {number} index - Position of the candidate's first character.
 * @returns {boolean} `true` when the position is an acceptable candidate start.
 */
function hasPathBoundary(message, index) {
    // A missing preceding character (undefined) is never in the set, so it yields false.
    return index === 0 || PATH_BOUNDARY_CHARS.has(message[index - 1]);
}
|
|
147
|
+
/**
 * Removes every trailing `/` from `value`.
 *
 * @param {string} value - Forward-slash path.
 * @returns {string} `value` without trailing slashes; `""` when it consists only of slashes.
 */
function stripTrailingSlash(value) {
    let kept = value.length;
    for (; kept > 0; kept -= 1) {
        if (value[kept - 1] !== "/") {
            break;
        }
    }
    return value.slice(0, kept);
}
|