@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
interface CompiledGlob {
|
|
2
|
+
readonly source: string;
|
|
3
|
+
readonly regex: RegExp;
|
|
4
|
+
}
|
|
5
|
+
export declare function compileGlob(glob: string): CompiledGlob;
|
|
6
|
+
export declare function matchesGlob(glob: CompiledGlob, relativePath: string): boolean;
|
|
7
|
+
export declare function matchesAny(globs: readonly CompiledGlob[], relativePath: string, defaultWhenEmpty: boolean): boolean;
|
|
8
|
+
export declare function compileGlobList(globs: readonly string[] | undefined): readonly CompiledGlob[];
|
|
9
|
+
export type { CompiledGlob };
|
|
10
|
+
//# sourceMappingURL=glob.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"glob.d.ts","sourceRoot":"","sources":["../../src/discovery/glob.ts"],"names":[],"mappings":"AAYA,UAAU,YAAY;IACpB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAmCD,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAKtD;AAED,wBAAgB,WAAW,CAAC,IAAI,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAE7E;AAMD,wBAAgB,UAAU,CACxB,KAAK,EAAE,SAAS,YAAY,EAAE,EAC9B,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,OAAO,GACxB,OAAO,CAUT;AAED,wBAAgB,eAAe,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,GAAG,SAAS,GAAG,SAAS,YAAY,EAAE,CAK7F;AAED,YAAY,EAAE,YAAY,EAAE,CAAC"}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// Minimal glob matcher for KnowledgeSourceScope include/exclude lists (Issue #194).
|
|
2
|
+
//
|
|
3
|
+
// We intentionally support ONLY the safe subset: `*` matches any sequence of characters
|
|
4
|
+
// EXCEPT `/`; `**` matches any sequence including `/`; `?` matches any single non-`/`
|
|
5
|
+
// character. All other characters are literal. Brace expansion and character classes are
|
|
6
|
+
// NOT supported — adding them now would let a malformed glob slip past the path-safety
|
|
7
|
+
// gate, and callers who really need them can keep multiple entries instead.
|
|
8
|
+
//
|
|
9
|
+
// Pure function — no FS access, no clock. The glob text is never handed to `new RegExp`
|
|
10
|
+
// unescaped (which would risk regex injection): literals are escaped and wildcards map to
|
|
11
|
+
// fixed fragments, compiled once per glob into a single anchored RegExp.
|
|
12
|
+
// Regex metacharacters that must be backslash-escaped when a glob character is emitted as a
// literal. `*` and `?` are absent on purpose: compileGlobSource consumes them as wildcards
// before the literal branch is reached.
const SPECIAL_RE = /[.+^${}()|[\]\\]/g;
// Translates a glob into the source text of an anchored regular expression in one pass:
//   `**/` -> `(?:.*/)?`  zero or more whole path segments, so `**/*.md` matches `foo.md`
//                        AND `a/b/foo.md`
//   `**`  -> `.*`        any sequence, `/` included (when not followed by `/`)
//   `*`   -> `[^/]*`     any sequence that stays inside one path segment
//   `?`   -> `[^/]`      exactly one non-`/` character
// Every other character is emitted as an escaped literal, so the glob can never inject
// regex syntax of its own. Returns the source string, wrapped in `^…$`.
function compileGlobSource(glob) {
    let out = "^";
    for (let i = 0; i < glob.length; i += 1) {
        const ch = glob[i] ?? "";
        if (ch === "*") {
            if (glob[i + 1] === "*") {
                if (glob[i + 2] === "/") {
                    out += "(?:.*/)?";
                    // Skip the second `*` and the `/`; the loop increment skips the first `*`.
                    i += 2;
                    continue;
                }
                out += ".*";
                // Skip the second `*` of the pair.
                i += 1;
                continue;
            }
            out += "[^/]*";
            continue;
        }
        if (ch === "?") {
            out += "[^/]";
            continue;
        }
        out += ch.replace(SPECIAL_RE, "\\$&");
    }
    return `${out}$`;
}
// Compiles `glob` into `{ source, regex }`: `source` is the anchored regex source produced
// by compileGlobSource and `regex` is the RegExp built from it.
//
// The RegExp is created with the `s` (dotAll) flag. Without it `.` refuses line terminators
// (`\n`, `\r`, U+2028, U+2029), so `**` / `**/` would silently fail on a path containing one
// even though `*` (`[^/]*`) accepts it — letting such a path slip past an exclude glob.
//
// NOTE(review): a glob with many wildcards yields several unbounded quantifiers, so match
// time is not guaranteed to be linear in the path length. If globs can be hostile, bound
// their length / wildcard count before calling this.
export function compileGlob(glob) {
    const source = compileGlobSource(glob);
    return { source, regex: new RegExp(source, "s") };
}
|
|
49
|
+
// Tests a single pre-compiled glob against `relativePath`. Returns whatever the glob's
// anchored regex reports for that path.
export function matchesGlob(glob, relativePath) {
    const { regex } = glob;
    return regex.test(relativePath);
}
|
|
52
|
+
// Reports whether `relativePath` is matched by at least one entry of `globs`.
// With an empty list there is nothing to test, so the caller-supplied `defaultWhenEmpty`
// is returned instead; that is how the two scope lists differ:
//   * includeGlobs: pass `true`  — an unset list means "match everything".
//   * excludeGlobs: pass `false` — an unset list means "match nothing".
export function matchesAny(globs, relativePath, defaultWhenEmpty) {
    return globs.length === 0
        ? defaultWhenEmpty
        : globs.some((candidate) => matchesGlob(candidate, relativePath));
}
|
|
67
|
+
// Compiles every glob of an optional list. An `undefined` or empty list produces an empty
// array; otherwise each entry is compiled with compileGlob, preserving order.
export function compileGlobList(globs) {
    return globs === undefined
        ? []
        : globs.map((pattern) => compileGlob(pattern));
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { DEFAULT_DISCOVERY_OPTIONS, documentIdFor, type DiscoveredFile, type DiscoveryError, type DiscoveryErrorCode, type DiscoveryOptions, type ExtractionEvent, type ExtractionOutcome, type ExtractionResult, } from "./types.js";
|
|
2
|
+
export { walkSource, type WalkYield } from "./walk.js";
|
|
3
|
+
export { extractDocument, type ExtractDocumentDeps, type ExtractDocumentParams, } from "./extract.js";
|
|
4
|
+
export { discoverAndExtract, type DiscoverAndExtractDeps, type DiscoverAndExtractParams, } from "./discovery-runner.js";
|
|
5
|
+
export { extensionOf, mediaTypeFor } from "./media-type.js";
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/discovery/index.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,yBAAyB,EACzB,aAAa,EACb,KAAK,cAAc,EACnB,KAAK,cAAc,EACnB,KAAK,kBAAkB,EACvB,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,GACtB,MAAM,YAAY,CAAC;AAEpB,OAAO,EAAE,UAAU,EAAE,KAAK,SAAS,EAAE,MAAM,WAAW,CAAC;AAEvD,OAAO,EACL,eAAe,EACf,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,GAC3B,MAAM,cAAc,CAAC;AAEtB,OAAO,EACL,kBAAkB,EAClB,KAAK,sBAAsB,EAC3B,KAAK,wBAAwB,GAC9B,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Public surface of the discovery + extraction bridge (Epic #189, Issue #194). Consumers
|
|
2
|
+
// import everything from `@oscharko-dev/keiko-local-knowledge`; this module is the single
|
|
3
|
+
// re-export point so the package barrel stays a flat list of names.
|
|
4
|
+
export { DEFAULT_DISCOVERY_OPTIONS, documentIdFor, } from "./types.js";
|
|
5
|
+
export { walkSource } from "./walk.js";
|
|
6
|
+
export { extractDocument, } from "./extract.js";
|
|
7
|
+
export { discoverAndExtract, } from "./discovery-runner.js";
|
|
8
|
+
export { extensionOf, mediaTypeFor } from "./media-type.js";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"media-type.d.ts","sourceRoot":"","sources":["../../src/discovery/media-type.ts"],"names":[],"mappings":"AAgDA,wBAAgB,WAAW,CAAC,YAAY,EAAE,MAAM,GAAG,MAAM,CAQxD;AAED,wBAAgB,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAEtD;AAED,wBAAgB,UAAU,CAAC,YAAY,EAAE,MAAM,GAAG,MAAM,CAGvD"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// Pure extension/media-type lookup table (Issue #194). The discovery layer must give the
|
|
2
|
+
// parser registry a `(extension, mediaType)` hint without sniffing the bytes. This table
|
|
3
|
+
// covers the formats supported by the shipped adapters; unknown extensions fall through to
|
|
4
|
+
// the unsupported sentinel, which then sniffs magic bytes.
|
|
5
|
+
//
|
|
6
|
+
// The mapping is intentionally NOT exhaustive — it's the minimal set that lets the parser
|
|
7
|
+
// registry route correctly. Adding a new entry is a one-line change.
|
|
8
|
+
// Extension -> media-type lookup table, keyed by lower-case extension without the dot.
// It is built on a null-prototype object: the keys looked up here come from file names, and
// on a plain object literal an extension such as `constructor` or `__proto__` would resolve
// to an inherited Object.prototype member instead of being reported as unknown.
const MEDIA_TYPES = Object.freeze(Object.assign(Object.create(null), {
    // text-like
    txt: "text/plain",
    md: "text/markdown",
    markdown: "text/markdown",
    rst: "text/x-rst",
    log: "text/plain",
    ts: "text/x-typescript",
    tsx: "text/x-typescript",
    js: "text/javascript",
    jsx: "text/javascript",
    mjs: "text/javascript",
    cjs: "text/javascript",
    py: "text/x-python",
    go: "text/x-go",
    rs: "text/x-rust",
    java: "text/x-java",
    yaml: "text/yaml",
    yml: "text/yaml",
    // json
    json: "application/json",
    jsonl: "application/x-ndjson",
    ndjson: "application/x-ndjson",
    // csv / tsv
    csv: "text/csv",
    tsv: "text/tab-separated-values",
    // html
    html: "text/html",
    htm: "text/html",
    xhtml: "application/xhtml+xml",
    // binary formats (office documents and images); listed so they are classified
    // explicitly rather than falling through as unknown extensions
    pdf: "application/pdf",
    docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    png: "image/png",
    jpg: "image/jpeg",
    jpeg: "image/jpeg",
    gif: "image/gif",
}));
|
|
47
|
+
// Returns the lower-cased extension of the last path segment, without the leading dot.
// Yields "" when the segment has no dot, starts with its only dot (dotfiles such as
// `.gitignore`), or ends with a dot.
export function extensionOf(relativePath) {
    // lastIndexOf gives -1 when there is no `/`, so `+ 1` keeps the whole string.
    const fileName = relativePath.slice(relativePath.lastIndexOf("/") + 1);
    const dotAt = fileName.lastIndexOf(".");
    const hasExtension = dotAt > 0 && dotAt < fileName.length - 1;
    return hasExtension ? fileName.slice(dotAt + 1).toLowerCase() : "";
}
|
|
56
|
+
// Returns the media type registered for `extension` in MEDIA_TYPES, or "" when the
// extension is not in the table. Keys in the table are lower-case and carry no dot.
export function mediaTypeFor(extension) {
    // Own-property guard: a bare indexed read also sees inherited members, so an extension
    // like `constructor` or `__proto__` would return a function/object instead of a string.
    if (!Object.prototype.hasOwnProperty.call(MEDIA_TYPES, extension)) {
        return "";
    }
    return MEDIA_TYPES[extension] ?? "";
}
|
|
59
|
+
// Returns the last `/`-separated segment of `relativePath`; a path without any `/` is
// returned unchanged and a path ending in `/` yields "".
export function basenameOf(relativePath) {
    // lastIndexOf gives -1 when there is no `/`, so `+ 1` keeps the whole string.
    return relativePath.slice(relativePath.lastIndexOf("/") + 1);
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import type { DocumentId, KnowledgeCapsuleId, KnowledgeSourceId, PageRecord, ParsedUnit, ParserDiagnostic, SectionRecord } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { DatabaseSync } from "node:sqlite";
|
|
3
|
+
import type { StoreContentCipher } from "../store-content-cipher.js";
|
|
4
|
+
export interface DocumentInsertRow {
|
|
5
|
+
readonly id: DocumentId;
|
|
6
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
7
|
+
readonly sourceId: string;
|
|
8
|
+
readonly documentPath: string;
|
|
9
|
+
readonly sizeBytes: number;
|
|
10
|
+
readonly mediaType: string;
|
|
11
|
+
readonly contentHash: string;
|
|
12
|
+
readonly parserId: string;
|
|
13
|
+
readonly parserVersion: string;
|
|
14
|
+
readonly lastExtractedAt: number;
|
|
15
|
+
readonly status: string;
|
|
16
|
+
readonly safeDisplayName: string;
|
|
17
|
+
}
|
|
18
|
+
export declare function insertDocumentRow(db: DatabaseSync, row: DocumentInsertRow): void;
|
|
19
|
+
export declare function deleteDependentRows(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): void;
|
|
20
|
+
export declare function insertDocumentTextRow(db: DatabaseSync, cipher: StoreContentCipher, capsuleId: KnowledgeCapsuleId, documentId: DocumentId, normalizedText: string): void;
|
|
21
|
+
export declare function readDocumentTextRow(db: DatabaseSync, cipher: StoreContentCipher, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): string | undefined;
|
|
22
|
+
export interface DocumentTextWindowInsertRow {
|
|
23
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
24
|
+
readonly documentId: DocumentId;
|
|
25
|
+
readonly windowIndex: number;
|
|
26
|
+
readonly characterStart: number;
|
|
27
|
+
readonly characterEnd: number;
|
|
28
|
+
readonly normalizedText: string;
|
|
29
|
+
}
|
|
30
|
+
export declare function insertDocumentTextWindowRow(db: DatabaseSync, cipher: StoreContentCipher, row: DocumentTextWindowInsertRow): void;
|
|
31
|
+
export declare function deleteDocumentTextWindows(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): void;
|
|
32
|
+
export declare function readDocumentTextSpan(db: DatabaseSync, cipher: StoreContentCipher, capsuleId: KnowledgeCapsuleId, documentId: DocumentId, charStart: number, charEnd: number): string | undefined;
|
|
33
|
+
export interface PersistedSourceDocumentRow {
|
|
34
|
+
readonly id: DocumentId;
|
|
35
|
+
readonly document_path: string;
|
|
36
|
+
}
|
|
37
|
+
export declare function listPersistedDocumentsForSource(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, sourceId: KnowledgeSourceId): readonly PersistedSourceDocumentRow[];
|
|
38
|
+
export declare function deleteDocumentRow(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): void;
|
|
39
|
+
export declare function updateDocumentStatusRow(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId, status: DocumentInsertRow["status"]): void;
|
|
40
|
+
export declare function insertPageRow(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, page: PageRecord): void;
|
|
41
|
+
export declare function insertSectionRow(db: DatabaseSync, cipher: StoreContentCipher, capsuleId: KnowledgeCapsuleId, section: SectionRecord): void;
|
|
42
|
+
export declare function insertParsedUnitRow(db: DatabaseSync, cipher: StoreContentCipher, capsuleId: KnowledgeCapsuleId, unitId: string, unit: ParsedUnit): void;
|
|
43
|
+
export declare function insertDiagnosticRow(db: DatabaseSync, params: {
|
|
44
|
+
readonly id: string;
|
|
45
|
+
readonly capsuleId: KnowledgeCapsuleId;
|
|
46
|
+
readonly diagnostic: ParserDiagnostic;
|
|
47
|
+
readonly createdAt: number;
|
|
48
|
+
}): void;
|
|
49
|
+
interface ExistingDocumentRow {
|
|
50
|
+
readonly content_hash: string;
|
|
51
|
+
readonly status: string;
|
|
52
|
+
readonly size_bytes: number;
|
|
53
|
+
readonly media_type: string;
|
|
54
|
+
readonly parser_id: string;
|
|
55
|
+
readonly parser_version: string;
|
|
56
|
+
readonly last_extracted_at: number;
|
|
57
|
+
readonly safe_display_name: string;
|
|
58
|
+
readonly document_path: string;
|
|
59
|
+
readonly source_id: string;
|
|
60
|
+
}
|
|
61
|
+
export declare function readExistingDocumentRow(db: DatabaseSync, capsuleId: KnowledgeCapsuleId, documentId: DocumentId): ExistingDocumentRow | undefined;
|
|
62
|
+
export {};
|
|
63
|
+
//# sourceMappingURL=persist.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"persist.d.ts","sourceRoot":"","sources":["../../src/discovery/persist.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EACV,UAAU,EACV,kBAAkB,EAClB,iBAAiB,EACjB,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,aAAa,EACd,MAAM,+BAA+B,CAAC;AACvC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAGhD,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAuLrE,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,EAAE,UAAU,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;CAClC;AAED,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,iBAAiB,GAAG,IAAI,CAehF;AAED,wBAAgB,mBAAmB,CACjC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,IAAI,CASN;AAED,wBAAgB,qBAAqB,CACnC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,EACtB,cAAc,EAAE,MAAM,GACrB,IAAI,CAMN;AAMD,wBAAgB,mBAAmB,CACjC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,MAAM,GAAG,SAAS,CAMpB;AAGD,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;CACjC;AAED,wBAAgB,2BAA2B,CACzC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,GAAG,EAAE,2BAA2B,GAC/B,IAAI,CASN;AAED,wBAAgB,yBAAyB,CACvC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,IAAI,CAEN;AAoBD,wBAAgB,oBAAoB,CAClC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,EACtB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CA8BpB;AAED,MAAM,WAAW,0BAA0B;IACzC,QAAQ,CAAC,EAAE,EAAE,UAAU,CAAC;IACxB,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;CAChC;AAED,wBAAgB,+BAA+B,CAC7C,
EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,QAAQ,EAAE,iBAAiB,GAC1B,SAAS,0BAA0B,EAAE,CAMvC;AAED,wBAAgB,iBAAiB,CAC/B,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,IAAI,CAGN;AAED,wBAAgB,uBAAuB,CACrC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,EACtB,MAAM,EAAE,iBAAiB,CAAC,QAAQ,CAAC,GAClC,IAAI,CAMN;AAED,wBAAgB,aAAa,CAC3B,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,IAAI,EAAE,UAAU,GACf,IAAI,CAaN;AAED,wBAAgB,gBAAgB,CAC9B,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,OAAO,EAAE,aAAa,GACrB,IAAI,CAUN;AAkFD,wBAAgB,mBAAmB,CACjC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE,kBAAkB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,UAAU,GACf,IAAI,CAEN;AAED,wBAAgB,mBAAmB,CACjC,EAAE,EAAE,YAAY,EAChB,MAAM,EAAE;IACN,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,UAAU,EAAE,gBAAgB,CAAC;IACtC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B,GACA,IAAI,CAWN;AAED,UAAU,mBAAmB;IAC3B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;IACnC,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;IACnC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,wBAAgB,uBAAuB,CACrC,EAAE,EAAE,YAAY,EAChB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,GACrB,mBAAmB,GAAG,SAAS,CAKjC"}
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
// SQLite persistence helpers for the discovery layer (Issue #194). Every helper here is a
|
|
2
|
+
// prepared-statement wrapper around a single table; the transaction boundary lives in
|
|
3
|
+
// extract.ts so a per-file failure rolls back exactly the rows from that file.
|
|
4
|
+
//
|
|
5
|
+
// The document row itself is written with INSERT OR REPLACE (PRIMARY KEY id), but the
|
|
6
|
+
// dependent rows (pages, sections, parsed_units, parser_diagnostics) are deleted first via
|
|
7
|
+
// the documents-cascade chain — see deleteDependentRows. That keeps a re-extract idempotent:
|
|
8
|
+
// running extract twice on the same file leaves exactly one set of rows on disk.
|
|
9
|
+
import { sectionPathHash } from "../section-path-hash.js";
|
|
10
|
+
// Upserts one `documents` row. INSERT OR REPLACE lets a re-extract overwrite the row that
// already exists for the same primary key instead of failing on the conflict.
const INSERT_DOCUMENT_SQL = [
    "INSERT OR REPLACE INTO documents (",
    " id, capsule_id, source_id, document_path, size_bytes, media_type,",
    " content_hash, parser_id, parser_version, last_extracted_at, status, safe_display_name",
    ") VALUES (",
    " :id, :capsule_id, :source_id, :document_path, :size_bytes, :media_type,",
    " :content_hash, :parser_id, :parser_version, :last_extracted_at, :status, :safe_display_name",
    ")",
].join(" ");
// Upserts the normalized text stored for a document in `document_texts`.
const INSERT_DOCUMENT_TEXT_SQL = [
    "INSERT OR REPLACE INTO document_texts (",
    " capsule_id, document_id, normalized_text",
    ") VALUES (",
    " :capsule_id, :document_id, :normalized_text",
    ")",
].join(" ");
// Plain INSERT (no REPLACE): rows from a previous extract are expected to have been removed
// first — see deleteDependentRows.
const INSERT_PAGE_SQL = [
    "INSERT INTO pages (",
    " capsule_id, document_id, page_number, page_label, character_start, character_end,",
    " bbox_x, bbox_y, bbox_w, bbox_h",
    ") VALUES (",
    " :capsule_id, :document_id, :page_number, :page_label, :character_start, :character_end,",
    " :bbox_x, :bbox_y, :bbox_w, :bbox_h",
    ")",
].join(" ");
// Plain INSERT of one section row; stores both the JSON path and its hash.
const INSERT_SECTION_SQL = [
    "INSERT INTO sections (",
    " capsule_id, document_id, section_path_json, section_path_hash, character_start, character_end",
    ") VALUES (",
    " :capsule_id, :document_id, :section_path_json, :section_path_hash, :character_start, :character_end",
    ")",
].join(" ");
// Plain INSERT of one parsed unit. The table is a wide row covering every unit kind; columns
// that do not apply to a given kind are bound as NULL (see parsedUnitParams).
const INSERT_PARSED_UNIT_SQL = [
    "INSERT INTO parsed_units (",
    " id, capsule_id, document_id, kind, page_number, page_label, section_path_json,",
    " json_pointer, table_name, row_index, heading_path_json, unsupported_reason,",
    " character_start, character_end",
    ") VALUES (",
    " :id, :capsule_id, :document_id, :kind, :page_number, :page_label, :section_path_json,",
    " :json_pointer, :table_name, :row_index, :heading_path_json, :unsupported_reason,",
    " :character_start, :character_end",
    ")",
].join(" ");
// Plain INSERT of one parser diagnostic row.
const INSERT_DIAGNOSTIC_SQL = [
    "INSERT INTO parser_diagnostics (",
    " id, capsule_id, document_id, severity, code, message, page_number, created_at",
    ") VALUES (",
    " :id, :capsule_id, :document_id, :severity, :code, :message, :page_number, :created_at",
    ")",
].join(" ");
// Per-document cleanup statements. Each one is scoped by capsule (:c) and document (:d).
const DELETE_PAGES_SQL = "DELETE FROM pages WHERE capsule_id = :c AND document_id = :d";
const DELETE_SECTIONS_SQL = "DELETE FROM sections WHERE capsule_id = :c AND document_id = :d";
const DELETE_DOCUMENT_TEXT_SQL = "DELETE FROM document_texts WHERE capsule_id = :c AND document_id = :d";
const DELETE_PARSED_UNITS_SQL = "DELETE FROM parsed_units WHERE capsule_id = :c AND document_id = :d";
const DELETE_DIAGNOSTICS_SQL = "DELETE FROM parser_diagnostics WHERE capsule_id = :c AND document_id = :d";
// Reads back the whole stored normalized_text column for one document.
const SELECT_DOCUMENT_TEXT_SQL = "SELECT normalized_text FROM document_texts WHERE capsule_id = :c AND document_id = :d";
|
|
66
|
+
// ─── Bounded large-document text windows (Epic #1160, Issue #1286) ─────────────
// Upserts one text window: a slice of a document's normalized text together with the
// document-relative character range [character_start, character_end] it covers.
const INSERT_DOCUMENT_TEXT_WINDOW_SQL = [
    "INSERT OR REPLACE INTO document_text_windows (",
    " capsule_id, document_id, window_index, character_start, character_end, normalized_text",
    ") VALUES (",
    " :capsule_id, :document_id, :window_index, :character_start, :character_end, :normalized_text",
    ")",
].join(" ");
// Per-document cleanup for the window and checkpoint tables, scoped by capsule (:c) and document (:d).
const DELETE_DOCUMENT_TEXT_WINDOWS_SQL = "DELETE FROM document_text_windows WHERE capsule_id = :c AND document_id = :d";
const DELETE_EXTRACTION_CHECKPOINT_SQL = "DELETE FROM extraction_checkpoints WHERE capsule_id = :c AND document_id = :d";
// SQLite SUBSTR is 1-indexed; a document-relative char offset `s` maps to SUBSTR position `s + 1`.
const SELECT_DOCUMENT_TEXT_SPAN_SQL = "SELECT SUBSTR(normalized_text, :start + 1, :len) AS span FROM document_texts WHERE capsule_id = :c AND document_id = :d";
// Window variant of the span read: picks the lowest-indexed window whose range fully contains
// [:start, :end] and rebases :start onto that window's own text before slicing.
const SELECT_DOCUMENT_TEXT_WINDOW_SPAN_SQL = [
    "SELECT SUBSTR(normalized_text, :start - character_start + 1, :len) AS span",
    "FROM document_text_windows",
    "WHERE capsule_id = :c AND document_id = :d AND character_start <= :start AND character_end >= :end",
    "ORDER BY window_index ASC LIMIT 1",
].join(" ");
// Encrypted-store span read: SQLite SUBSTR cannot slice a sealed envelope, so the encrypted path
// fetches the one bounded window that contains the span (with its character_start), decrypts that
// single window, and slices in JS. Returns the whole sealed window text so the caller decrypts once.
const SELECT_DOCUMENT_TEXT_WINDOW_FULL_SQL = [
    "SELECT normalized_text, character_start AS character_start",
    "FROM document_text_windows",
    "WHERE capsule_id = :c AND document_id = :d AND character_start <= :start AND character_end >= :end",
    "ORDER BY window_index ASC LIMIT 1",
].join(" ");
// Lists the (id, document_path) pairs persisted for one source, ordered by path.
const SELECT_DOCUMENTS_FOR_SOURCE_SQL = [
    "SELECT id, document_path FROM documents",
    "WHERE capsule_id = :c AND source_id = :s",
    "ORDER BY document_path ASC",
].join(" ");
// Removes a single document row, scoped by capsule (:c) and document id (:d).
const DELETE_DOCUMENT_SQL = "DELETE FROM documents WHERE capsule_id = :c AND id = :d";
// Overwrites only the status column of a single document row.
const UPDATE_DOCUMENT_STATUS_SQL = "UPDATE documents SET status = :status WHERE capsule_id = :c AND id = :d";
|
|
100
|
+
// Prepared statements are compiled once per database handle. The WeakMap keys on the handle,
// so the cache entry does not keep a discarded handle alive.
const statementsByDb = new WeakMap();
/**
 * Returns the prepared-statement bundle for `db`, preparing every statement on the first
 * call for a given handle and serving the cached bundle afterwards.
 */
function statements(db) {
    let bundle = statementsByDb.get(db);
    if (bundle === undefined) {
        bundle = {
            insertDocument: db.prepare(INSERT_DOCUMENT_SQL),
            insertDocumentText: db.prepare(INSERT_DOCUMENT_TEXT_SQL),
            insertPage: db.prepare(INSERT_PAGE_SQL),
            insertSection: db.prepare(INSERT_SECTION_SQL),
            insertParsedUnit: db.prepare(INSERT_PARSED_UNIT_SQL),
            insertDiagnostic: db.prepare(INSERT_DIAGNOSTIC_SQL),
            deletePages: db.prepare(DELETE_PAGES_SQL),
            deleteSections: db.prepare(DELETE_SECTIONS_SQL),
            deleteDocumentText: db.prepare(DELETE_DOCUMENT_TEXT_SQL),
            deleteParsedUnits: db.prepare(DELETE_PARSED_UNITS_SQL),
            deleteDiagnostics: db.prepare(DELETE_DIAGNOSTICS_SQL),
            deleteDocument: db.prepare(DELETE_DOCUMENT_SQL),
            selectDocumentText: db.prepare(SELECT_DOCUMENT_TEXT_SQL),
            insertDocumentTextWindow: db.prepare(INSERT_DOCUMENT_TEXT_WINDOW_SQL),
            deleteDocumentTextWindows: db.prepare(DELETE_DOCUMENT_TEXT_WINDOWS_SQL),
            deleteExtractionCheckpoint: db.prepare(DELETE_EXTRACTION_CHECKPOINT_SQL),
            selectDocumentTextSpan: db.prepare(SELECT_DOCUMENT_TEXT_SPAN_SQL),
            selectDocumentTextWindowSpan: db.prepare(SELECT_DOCUMENT_TEXT_WINDOW_SPAN_SQL),
            selectDocumentTextWindowFull: db.prepare(SELECT_DOCUMENT_TEXT_WINDOW_FULL_SQL),
            selectDocumentsForSource: db.prepare(SELECT_DOCUMENTS_FOR_SOURCE_SQL),
        };
        statementsByDb.set(db, bundle);
    }
    return bundle;
}
|
|
131
|
+
/**
 * Writes (or overwrites) the `documents` row described by `row`.
 *
 * @param db  Database handle used to look up the cached prepared statements.
 * @param row Camel-cased document record; each field is mapped onto its snake_case column.
 */
export function insertDocumentRow(db, row) {
    const { insertDocument } = statements(db);
    const bindings = {
        id: row.id,
        capsule_id: row.capsuleId,
        source_id: row.sourceId,
        document_path: row.documentPath,
        size_bytes: row.sizeBytes,
        media_type: row.mediaType,
        content_hash: row.contentHash,
        parser_id: row.parserId,
        parser_version: row.parserVersion,
        last_extracted_at: row.lastExtractedAt,
        status: row.status,
        safe_display_name: row.safeDisplayName,
    };
    insertDocument.run(bindings);
}
|
|
147
|
+
/**
 * Removes every row that hangs off one document — stored text, text windows, the extraction
 * checkpoint, pages, sections, parsed units and diagnostics — leaving the `documents` row
 * itself in place.
 *
 * @param db         Database handle.
 * @param capsuleId  Capsule the document belongs to.
 * @param documentId Document whose dependent rows are removed.
 */
export function deleteDependentRows(db, capsuleId, documentId) {
    const stmts = statements(db);
    const scope = { c: capsuleId, d: documentId };
    // Order matches the original sequence of deletes.
    const cleanup = [
        stmts.deleteDocumentText,
        stmts.deleteDocumentTextWindows,
        stmts.deleteExtractionCheckpoint,
        stmts.deletePages,
        stmts.deleteSections,
        stmts.deleteParsedUnits,
        stmts.deleteDiagnostics,
    ];
    for (const stmt of cleanup) {
        stmt.run(scope);
    }
}
|
|
157
|
+
/**
 * Stores the normalized text of a document, passing it through `cipher.sealText` before
 * it is bound to the statement.
 *
 * @param db             Database handle.
 * @param cipher         Object providing `sealText`, applied to the text before it is written.
 * @param capsuleId      Capsule the document belongs to.
 * @param documentId     Document the text belongs to.
 * @param normalizedText Text to persist.
 */
export function insertDocumentTextRow(db, cipher, capsuleId, documentId, normalizedText) {
    const { insertDocumentText } = statements(db);
    const sealed = cipher.sealText(normalizedText);
    insertDocumentText.run({
        capsule_id: capsuleId,
        document_id: documentId,
        normalized_text: sealed,
    });
}
|
|
164
|
+
/**
 * Loads the stored text of a document and passes it through `cipher.openText`.
 *
 * @returns The opened text, or `undefined` when no `document_texts` row exists.
 */
export function readDocumentTextRow(db, cipher, capsuleId, documentId) {
    const lookup = { c: capsuleId, d: documentId };
    const stored = statements(db).selectDocumentText.get(lookup);
    if (stored === undefined) {
        return undefined;
    }
    return cipher.openText(stored.normalized_text);
}
|
|
171
|
+
/**
 * Stores one text window of a document. The ids are coerced with `String(...)` and the window
 * text goes through `cipher.sealText` before being bound.
 *
 * @param db     Database handle.
 * @param cipher Object providing `sealText`.
 * @param row    Window record: ids, window index, character range and the window text.
 */
export function insertDocumentTextWindowRow(db, cipher, row) {
    const { insertDocumentTextWindow } = statements(db);
    const bindings = {
        capsule_id: String(row.capsuleId),
        document_id: String(row.documentId),
        window_index: row.windowIndex,
        character_start: row.characterStart,
        character_end: row.characterEnd,
        normalized_text: cipher.sealText(row.normalizedText),
    };
    insertDocumentTextWindow.run(bindings);
}
|
|
181
|
+
/** Removes every stored text window of one document. */
export function deleteDocumentTextWindows(db, capsuleId, documentId) {
    const scope = { c: capsuleId, d: documentId };
    statements(db).deleteDocumentTextWindows.run(scope);
}
|
|
184
|
+
// Reads a bounded, document-relative character span [charStart, charEnd) without loading more
// text than one bounded unit. The `document_texts` row is tried first; when the document has no
// such row, the lowest-indexed `document_text_windows` row whose range contains the whole span
// is used. Returns undefined when neither lookup yields a row.
//
// Plaintext stores let SQLite slice via SUBSTR so only the span reaches JS. Encrypted stores
// cannot SUBSTR a sealed envelope, so exactly one unit (the document row or the containing
// window) is opened and sliced in JS.
//
// Inputs are floored; a negative start is clamped to 0 and an end before the start is clamped
// to the start, giving an empty span.
//
// NOTE(review): SQLite SUBSTR counts characters of a TEXT value while String.prototype.slice
// counts UTF-16 code units — presumably the two paths can differ on text with non-BMP
// characters; confirm which unit the stored offsets use.
export function readDocumentTextSpan(db, cipher, capsuleId, documentId, charStart, charEnd) {
    const start = Math.max(0, Math.floor(charStart));
    const end = Math.max(start, Math.floor(charEnd));
    const len = end - start;
    const c = String(capsuleId);
    const d = String(documentId);
    const sealedStore = cipher.isEncrypted;
    const stmts = statements(db);
    if (sealedStore) {
        // Encrypted path: open one unit, then slice in JS.
        const wholeRow = stmts.selectDocumentText.get({ c, d });
        if (wholeRow !== undefined) {
            const text = cipher.openText(wholeRow.normalized_text);
            return text.slice(start, start + len);
        }
        const sealedWindow = stmts.selectDocumentTextWindowFull.get({ c, d, start, end });
        if (sealedWindow === undefined) {
            return undefined;
        }
        // Rebase the document-relative start onto the window's own text.
        const offset = start - sealedWindow.character_start;
        const windowText = cipher.openText(sealedWindow.normalized_text);
        return windowText.slice(offset, offset + len);
    }
    // Plaintext path: SQLite does the slicing.
    const fromDocument = stmts.selectDocumentTextSpan.get({ c, d, start, len });
    if (fromDocument !== undefined) {
        return fromDocument.span ?? "";
    }
    const fromWindow = stmts.selectDocumentTextWindowSpan.get({ c, d, start, end, len });
    if (fromWindow === undefined) {
        return undefined;
    }
    return fromWindow.span ?? "";
}
|
|
218
|
+
/**
 * Lists the documents persisted for one source.
 *
 * @returns Rows carrying `id` and `document_path`, ordered by `document_path` ascending.
 */
export function listPersistedDocumentsForSource(db, capsuleId, sourceId) {
    const filter = { c: capsuleId, s: sourceId };
    return statements(db).selectDocumentsForSource.all(filter);
}
|
|
225
|
+
/**
 * Deletes a document completely: its dependent rows first, then the `documents` row itself.
 */
export function deleteDocumentRow(db, capsuleId, documentId) {
    deleteDependentRows(db, capsuleId, documentId);
    const target = { c: capsuleId, d: documentId };
    statements(db).deleteDocument.run(target);
}
|
|
229
|
+
// Per-connection cache for the status UPDATE statement. It is kept separate from the shared
// statement bundle so a status update prepares only this one statement.
const updateDocumentStatusStatementByDb = new WeakMap();
/**
 * Overwrites the `status` column of one document row.
 *
 * The UPDATE statement is prepared once per database handle and reused afterwards, instead
 * of being re-prepared on every call.
 *
 * @param db         Database handle.
 * @param capsuleId  Capsule the document belongs to.
 * @param documentId Document whose status is updated.
 * @param status     New status value.
 */
export function updateDocumentStatusRow(db, capsuleId, documentId, status) {
    let update = updateDocumentStatusStatementByDb.get(db);
    if (update === undefined) {
        update = db.prepare(UPDATE_DOCUMENT_STATUS_SQL);
        updateDocumentStatusStatementByDb.set(db, update);
    }
    update.run({
        status,
        c: capsuleId,
        d: documentId,
    });
}
|
|
236
|
+
/**
 * Inserts one `pages` row. An absent page label, or an absent bounding box (or any absent
 * component of it), is stored as NULL.
 *
 * @param db        Database handle.
 * @param capsuleId Capsule the page's document belongs to.
 * @param page      Page record: document id, page number, optional label, character range and
 *                  optional bounding box with `x`, `y`, `w`, `h`.
 */
export function insertPageRow(db, capsuleId, page) {
    const { insertPage } = statements(db);
    const bindings = {
        capsule_id: capsuleId,
        document_id: page.documentId,
        page_number: page.pageNumber,
        page_label: page.pageLabel ?? null,
        character_start: page.characterStart,
        character_end: page.characterEnd,
        bbox_x: page.boundingBox?.x ?? null,
        bbox_y: page.boundingBox?.y ?? null,
        bbox_w: page.boundingBox?.w ?? null,
        bbox_h: page.boundingBox?.h ?? null,
    };
    insertPage.run(bindings);
}
|
|
250
|
+
/**
 * Inserts one `sections` row. The section path is stored twice: as JSON passed through
 * `cipher.sealText`, and as the value returned by `sectionPathHash` for the same path.
 *
 * @param db        Database handle.
 * @param cipher    Object providing `sealText`.
 * @param capsuleId Capsule the section's document belongs to.
 * @param section   Section record: document id, section path and character range.
 */
export function insertSectionRow(db, cipher, capsuleId, section) {
    const serializedPath = JSON.stringify(section.sectionPath);
    const { insertSection } = statements(db);
    const bindings = {
        capsule_id: capsuleId,
        document_id: section.documentId,
        section_path_json: cipher.sealText(serializedPath),
        section_path_hash: sectionPathHash(section.sectionPath),
        character_start: section.characterStart,
        character_end: section.characterEnd,
    };
    insertSection.run(bindings);
}
|
|
261
|
+
/**
 * Builds the full binding object for one `parsed_units` insert: the identity columns are
 * filled here, every kind-specific column starts as NULL, and `populateUnitFields` then fills
 * the columns that apply to `unit.kind`.
 */
function parsedUnitParams(cipher, capsuleId, unitId, unit) {
    // Kind-specific columns; all NULL until populateUnitFields overrides the relevant ones.
    const kindSpecificDefaults = {
        page_number: null,
        page_label: null,
        section_path_json: null,
        json_pointer: null,
        table_name: null,
        row_index: null,
        heading_path_json: null,
        unsupported_reason: null,
        character_start: null,
        character_end: null,
    };
    const skeleton = {
        id: unitId,
        capsule_id: String(capsuleId),
        document_id: String(unit.documentId),
        kind: unit.kind,
        ...kindSpecificDefaults,
    };
    return populateUnitFields(skeleton, unit, cipher);
}
|
|
280
|
+
function populateUnitFields(base, unit, cipher) {
|
|
281
|
+
if (unit.kind === "page") {
|
|
282
|
+
return {
|
|
283
|
+
...base,
|
|
284
|
+
page_number: unit.pageNumber,
|
|
285
|
+
page_label: unit.pageLabel ?? null,
|
|
286
|
+
character_start: unit.characterStart,
|
|
287
|
+
character_end: unit.characterEnd,
|
|
288
|
+
};
|
|
289
|
+
}
|
|
290
|
+
if (unit.kind === "section") {
|
|
291
|
+
return {
|
|
292
|
+
...base,
|
|
293
|
+
section_path_json: cipher.sealText(JSON.stringify(unit.sectionPath)),
|
|
294
|
+
character_start: unit.characterStart,
|
|
295
|
+
character_end: unit.characterEnd,
|
|
296
|
+
};
|
|
297
|
+
}
|
|
298
|
+
if (unit.kind === "json-path") {
|
|
299
|
+
return {
|
|
300
|
+
...base,
|
|
301
|
+
json_pointer: unit.jsonPointer,
|
|
302
|
+
character_start: unit.characterStart,
|
|
303
|
+
character_end: unit.characterEnd,
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
if (unit.kind === "csv-row") {
|
|
307
|
+
return {
|
|
308
|
+
...base,
|
|
309
|
+
table_name: unit.tableName,
|
|
310
|
+
row_index: unit.rowIndex,
|
|
311
|
+
character_start: unit.characterStart,
|
|
312
|
+
character_end: unit.characterEnd,
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
if (unit.kind === "html-block") {
|
|
316
|
+
return {
|
|
317
|
+
...base,
|
|
318
|
+
heading_path_json: unit.headingPath !== undefined ? cipher.sealText(JSON.stringify(unit.headingPath)) : null,
|
|
319
|
+
character_start: unit.characterStart,
|
|
320
|
+
character_end: unit.characterEnd,
|
|
321
|
+
};
|
|
322
|
+
}
|
|
323
|
+
return { ...base, unsupported_reason: unit.reason };
|
|
324
|
+
}
|
|
325
|
+
/** Inserts one `parsed_units` row, using the bindings that `parsedUnitParams` builds for `unit`. */
export function insertParsedUnitRow(db, cipher, capsuleId, unitId, unit) {
    const { insertParsedUnit } = statements(db);
    const bindings = parsedUnitParams(cipher, capsuleId, unitId, unit);
    insertParsedUnit.run(bindings);
}
|
|
328
|
+
/**
 * Inserts one `parser_diagnostics` row. A diagnostic without a document id or page number
 * stores NULL in the corresponding column.
 *
 * @param db     Database handle.
 * @param params Carries `id`, `capsuleId`, `createdAt` and the `diagnostic` itself
 *               (severity, code, message, optional documentId and pageNumber).
 */
export function insertDiagnosticRow(db, params) {
    const { insertDiagnostic } = statements(db);
    const bindings = {
        id: params.id,
        capsule_id: params.capsuleId,
        document_id: params.diagnostic.documentId ?? null,
        severity: params.diagnostic.severity,
        code: params.diagnostic.code,
        message: params.diagnostic.message,
        page_number: params.diagnostic.pageNumber ?? null,
        created_at: params.createdAt,
    };
    insertDiagnostic.run(bindings);
}
|
|
340
|
+
export function readExistingDocumentRow(db, capsuleId, documentId) {
|
|
341
|
+
const row = db
|
|
342
|
+
.prepare("SELECT * FROM documents WHERE capsule_id = :c AND id = :d")
|
|
343
|
+
.get({ c: capsuleId, d: documentId });
|
|
344
|
+
return row === undefined ? undefined : row;
|
|
345
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { KnowledgeSourceScope } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { WorkspaceFs } from "@oscharko-dev/keiko-workspace";
|
|
3
|
+
/**
 * Description of one file supplied to `memoryFs`.
 *
 * NOTE(review): only the declaration is visible here; the per-field notes below are inferred
 * from names and types and should be confirmed against the implementation.
 */
export interface MemoryFsFile {
    /** Path of the file — presumably relative to the `root` passed to `memoryFs`; confirm. */
    readonly relativePath: string;
    /** File content, either as text or as raw bytes. */
    readonly content: string | Uint8Array;
    /** Optional override — presumably for the real path reported for this file; confirm. */
    readonly realPathOverride?: string;
    /** Optional hard-link count — presumably what the fake file system reports for this file; confirm. */
    readonly hardLinkCount?: number;
    /** Optional flag — presumably makes the file appear as a symbolic link; confirm. */
    readonly isSymbolicLink?: boolean;
}
|
|
10
|
+
/**
 * Builds a `WorkspaceFs` from a root path and a list of file descriptions.
 * NOTE(review): presumably an in-memory test double (this lives in test-support) — confirm.
 */
export declare function memoryFs(root: string, files: readonly MemoryFsFile[]): WorkspaceFs;
|
|
11
|
+
/**
 * Builds a `KnowledgeSourceScope` for a folder rooted at `rootPath`.
 *
 * @param rootPath Root folder of the scope.
 * @param options  Optional `recursive` flag and include/exclude glob lists.
 *                 NOTE(review): defaults applied when an option is omitted are not visible
 *                 in this declaration — confirm against the implementation.
 */
export declare function folderScope(rootPath: string, options?: {
    readonly recursive?: boolean;
    readonly includeGlobs?: readonly string[];
    readonly excludeGlobs?: readonly string[];
}): KnowledgeSourceScope;
|
|
16
|
+
//# sourceMappingURL=test-support.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-support.d.ts","sourceRoot":"","sources":["../../src/discovery/test-support.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,+BAA+B,CAAC;AAC1E,OAAO,KAAK,EAAqB,WAAW,EAAiB,MAAM,+BAA+B,CAAC;AAmDnG,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,CAAC;IACtC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IACnC,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;CACnC;AA4CD,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,WAAW,CA4ClF;AAED,wBAAgB,WAAW,CACzB,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE;IACP,QAAQ,CAAC,SAAS,CAAC,EAAE,OAAO,CAAC;IAC7B,QAAQ,CAAC,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAC1C,QAAQ,CAAC,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CACtC,GACL,oBAAoB,CAWtB"}
|