@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
// HTML parser adapter (Epic #189, Issue #266). Single-pass scanner — NO regex, NO DOM, NO
|
|
2
|
+
// new deps. Pure-string traversal.
|
|
3
|
+
//
|
|
4
|
+
// Security posture:
|
|
5
|
+
// * Embedded JavaScript is NEVER executed. We treat `<script>...</script>` content as a
|
|
6
|
+
// dropped substring; nothing parsed inside it lands in any unit. Same for `<style>` and
|
|
7
|
+
// `<noscript>` whose bodies could contain encoded payloads we do not want to surface to
|
|
8
|
+
// the chunker.
|
|
9
|
+
// * Tag content is consumed by raw string scanning. No `new Function`, no `eval`, no DOM
|
|
10
|
+
// APIs are touched; we cannot accidentally trigger sandbox escape because there is no
|
|
11
|
+
// execution path.
|
|
12
|
+
// * CodeQL `js/bad-tag-filter` does NOT fire: we never use regex to filter tags. We
|
|
13
|
+
// `indexOf("<")` then scan character-by-character to find the matching `>`.
|
|
14
|
+
//
|
|
15
|
+
// Emits one `html-block` ParsedUnit per visible-content run between heading boundaries.
|
|
16
|
+
// `headingPath` is set from the most recent `<h1>`-`<h6>` stack at the moment the block
|
|
17
|
+
// opens. Inline text outside any heading produces a `html-block` with `headingPath: []`.
|
|
18
|
+
import { decodeUtf8, diagnostic, emptyResult, oversizeDiagnostic, shouldStop, } from "./_internal.js";
|
|
19
|
+
// Normalises a block's visible text into flowing prose: every run of space / tab / LF / CR
// becomes a single space, and leading and trailing runs are dropped. Raw HTML carries source
// indentation and newlines between inline tags that should not survive into the projection.
function collapseWhitespace(value) {
    const words = [];
    let wordStart = -1;
    // Walk one position past the end so the final word is closed by the same code path.
    for (let idx = 0; idx <= value.length; idx += 1) {
        const ch = idx === value.length ? 0x20 : value.charCodeAt(idx);
        const isBreak = ch === 0x20 || ch === 0x09 || ch === 0x0a || ch === 0x0d;
        if (!isBreak) {
            if (wordStart === -1)
                wordStart = idx;
            continue;
        }
        if (wordStart !== -1) {
            words.push(value.slice(wordStart, idx));
            wordStart = -1;
        }
    }
    return words.join(" ");
}
|
|
39
|
+
// Identity reported through the parser capability.
const PARSER_ID = "html";
const PARSER_VERSION = "1";
// File extensions and media types that are claimed without looking at the bytes.
const HTML_EXTENSIONS = new Set(["html", "htm", "xhtml"]);
const HTML_MEDIA_TYPES = new Set(["text/html", "application/xhtml+xml"]);
// Decides whether this adapter claims `input`. Checks, in order: the (case-insensitive)
// extension, the (case-insensitive) media type, and finally a sniff of the first 64 bytes
// for a leading `<!DOCTYPE html` or `<html`. Only those 64 bytes are decoded here; the full
// decode happens inside `parse`.
function isHtml(input) {
    const sniffLeadingMarkup = () => {
        const prefix = new TextDecoder("utf-8", { fatal: false })
            .decode(input.bytes.subarray(0, 64))
            .trimStart()
            .toLowerCase();
        return prefix.startsWith("<!doctype html") || prefix.startsWith("<html");
    };
    return (HTML_EXTENSIONS.has(input.extension.toLowerCase()) ||
        HTML_MEDIA_TYPES.has(input.mediaType.toLowerCase()) ||
        sniffLeadingMarkup());
}
|
|
55
|
+
// Lower-case names of elements whose bodies are dropped wholesale: when `handleTag` sees an
// opening tag from this set it jumps straight to the matching close tag without tokenising
// anything in between.
const RAW_TEXT_TAGS = new Set(["script", "style", "noscript"]);
|
|
56
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
57
|
+
// True when `code` is the UTF-16 code unit of an ASCII letter (A-Z or a-z).
function isAlpha(code) {
    const isUpper = code >= 0x41 && code <= 0x5a;
    const isLower = code >= 0x61 && code <= 0x7a;
    return isUpper || isLower;
}
|
|
60
|
+
// True when `code` may appear inside a tag name as scanned here: ASCII letter, ASCII digit,
// or hyphen.
function isNameChar(code) {
    if (isAlpha(code))
        return true;
    if (code >= 0x30 && code <= 0x39)
        return true;
    return code === 0x2d; /* - */
}
|
|
63
|
+
// Reads the run of name characters starting at `from`. Returns the lower-cased name and the
// index of the first character after it (`after === from` when no name character is found).
function readTagName(text, from) {
    let end = from;
    for (; end < text.length; end += 1) {
        if (!isNameChar(text.charCodeAt(end)))
            break;
    }
    return { name: text.slice(from, end).toLowerCase(), after: end };
}
|
|
69
|
+
// Recognises the non-element constructs that can follow a `<` and returns the index just
// past them, or `null` when the `<` does not start one of them.
//
//   text  - the full document text.
//   lt    - index of the `<` (unused here; kept so the call shape stays stable).
//   after - index of the character immediately after the `<`.
//
// Handled forms:
//   * `<!-- ... -->` comments. An unterminated comment swallows the rest of the document.
//   * Any other `<!...>` or `<?...>` construct (DOCTYPE, processing instruction, `<![CDATA[`),
//     which is skipped up to the first `>`; unterminated ones also run to end of text.
function skipSpecialMarker(text, lt, after) {
    if (text.startsWith("!--", after)) {
        // Start the search at the first dash of the opener: `<!-->` and `<!--->` are complete
        // (empty) comments in HTML, so the opener's own dashes may be part of the terminator.
        // Starting after the opener instead would swallow all visible content up to some
        // later `-->` (or the end of the document).
        const close = text.indexOf("-->", after + 1);
        return close === -1 ? text.length : close + 3;
    }
    const ch = text.charCodeAt(after);
    if (ch === 0x21 /* ! */ || ch === 0x3f /* ? */) {
        const gt = text.indexOf(">", after);
        return gt === -1 ? text.length : gt + 1;
    }
    return null;
}
|
|
82
|
+
// Finds the `>` that terminates a tag whose attribute area starts at `from`, ignoring any
// `>` inside a quoted attribute value. A quote only opens a value when it directly follows
// `=` (optionally separated by whitespace); a stray quote elsewhere is ordinary attribute
// text. Returns -1 when no terminator is found, including the unterminated-quote case.
function findTagEnd(text, from) {
    let quote = 0; // 0 while outside a quoted value, otherwise the opening quote's code unit
    let afterEquals = false;
    for (let i = from; i < text.length; i += 1) {
        const code = text.charCodeAt(i);
        if (quote !== 0) {
            if (code === quote)
                quote = 0;
            continue;
        }
        if (code === 0x3e /* > */)
            return i;
        if (afterEquals && (code === 0x22 /* " */ || code === 0x27 /* ' */)) {
            quote = code;
            afterEquals = false;
            continue;
        }
        if (code === 0x3d /* = */) {
            afterEquals = true;
            continue;
        }
        const isSpace = code === 0x20 || code === 0x09 || code === 0x0a || code === 0x0c || code === 0x0d;
        if (!isSpace)
            afterEquals = false;
    }
    return -1;
}
// Parses the tag whose `<` sits at index `lt`.
//
// Returns `{ name, kind, start, end }` where `name` is lower-cased, `kind` is "open",
// "close" or "self-closing", `start` is `lt` and `end` is the index just past the closing
// `>`. Returns `null` when the `<` is not followed by a tag name (optionally after `/`) or
// when the tag is never terminated.
//
// The terminator search is quote-aware so that `<a title="a > b">` is consumed as one tag;
// cutting it at the first `>` would surface the remainder of the attribute as visible text.
// If the quote-aware scan finds nothing (e.g. an unterminated quote), the first plain `>`
// is used instead so malformed input still makes progress the way it did before.
function readTagAt(text, lt) {
    const after = lt + 1;
    const isClose = text.charCodeAt(after) === 0x2f; /* / */
    const nameStart = isClose ? after + 1 : after;
    if (!isAlpha(text.charCodeAt(nameStart)))
        return null;
    const { name, after: afterName } = readTagName(text, nameStart);
    let gt = findTagEnd(text, afterName);
    if (gt === -1)
        gt = text.indexOf(">", afterName);
    if (gt === -1)
        return null;
    // A `/` directly before the `>` marks a self-closing tag (never for close tags).
    const selfClosing = !isClose && text.charCodeAt(gt - 1) === 0x2f;
    const kind = isClose ? "close" : selfClosing ? "self-closing" : "open";
    return { name, kind, start: lt, end: gt + 1 };
}
|
|
96
|
+
// Produces the next scanner event at `from`: "eof", "text", "marker" or "tag". Text events
// only ever cover runs between tags/markers; the bytes of a `<!DOCTYPE>`, `<!-- -->` or
// `<tag>` literal are never reported as text, which keeps `<html>` / `<body>` literals out
// of every html-block span. A `<` that starts neither a marker nor a tag is reported as a
// one-character text event.
function nextEvent(text, from) {
    if (from >= text.length)
        return { kind: "eof" };
    const lt = text.indexOf("<", from);
    if (lt !== from) {
        // Plain text up to the next `<`, or to the end of the document when there is none.
        const end = lt === -1 ? text.length : lt;
        return { kind: "text", start: from, end, next: end };
    }
    const markerEnd = skipSpecialMarker(text, lt, lt + 1);
    if (markerEnd !== null)
        return { kind: "marker", next: markerEnd };
    const tag = readTagAt(text, lt);
    return tag === null
        ? { kind: "text", start: lt, end: lt + 1, next: lt + 1 }
        : { kind: "tag", tag, next: tag.end };
}
|
|
116
|
+
// True when `code` may directly follow `</name` in a genuine end tag: whitespace, `/` or `>`.
function isEndTagNameBoundary(code) {
    return (code === 0x3e /* > */ ||
        code === 0x2f /* / */ ||
        code === 0x20 ||
        code === 0x09 ||
        code === 0x0a ||
        code === 0x0c ||
        code === 0x0d);
}
// Finds `targetLower` (which starts with `<` and is already lower-case) in `text` at or
// after `from`, comparing ASCII letters case-insensitively. Returns -1 when absent.
function indexOfAsciiInsensitive(text, targetLower, from) {
    let at = text.indexOf("<", from);
    while (at !== -1) {
        let matched = at + targetLower.length <= text.length;
        for (let k = 1; matched && k < targetLower.length; k += 1) {
            let code = text.charCodeAt(at + k);
            if (code >= 0x41 && code <= 0x5a)
                code += 0x20;
            matched = code === targetLower.charCodeAt(k);
        }
        if (matched)
            return at;
        at = text.indexOf("<", at + 1);
    }
    return -1;
}
// For a raw-text tag we MUST skip until the matching close tag with the same name, without
// interpreting the inner bytes. This is what neutralises `<script>...</script>` payloads.
//
//   text      - the document text.
//   textLower - `text.toLowerCase()`, precomputed once by the caller so that a document
//               with N script/style tags does not lower-case the whole text N times.
//   tagName   - lower-case name of the raw-text element.
//   from      - index just past the opening tag.
//
// Returns the index just past the closing tag's `>`, or `text.length` when the element is
// never closed (the rest of the document is then treated as raw text).
//
// Two details keep the skip from ending early or in the wrong place:
//   * `</script` only counts as the end tag when followed by whitespace, `/` or `>`.
//     A bare prefix match would stop at `</scripty>` inside a string literal and expose the
//     remainder of the script body as visible text.
//   * `toLowerCase()` can change the string's length (U+0130 lower-cases to two code
//     units), after which offsets in `textLower` no longer line up with `text`. The
//     precomputed view is only used when the lengths agree; otherwise `text` itself is
//     scanned with an ASCII case-insensitive comparison.
function skipRawText(text, textLower, tagName, from) {
    const target = `</${tagName}`;
    const aligned = textLower.length === text.length;
    let cursor = from;
    while (cursor < text.length) {
        const close = aligned
            ? textLower.indexOf(target, cursor)
            : indexOfAsciiInsensitive(text, target, cursor);
        if (close === -1)
            return text.length;
        if (isEndTagNameBoundary(text.charCodeAt(close + target.length))) {
            const gt = text.indexOf(">", close);
            return gt === -1 ? text.length : gt + 1;
        }
        // Longer name sharing the prefix (e.g. `</scripty`): keep looking.
        cursor = close + target.length;
    }
    return text.length;
}
|
|
129
|
+
// ─── Heading stack ───────────────────────────────────────────────────────────
|
|
130
|
+
// Maps a lower-case tag name to its heading level: "h1".."h6" give 1..6, anything else 0.
function headingLevel(name) {
    const looksLikeHeading = name.length === 2 && name.charCodeAt(0) === 0x68; /* h */
    if (!looksLikeHeading)
        return 0;
    const digit = name.charCodeAt(1) - 0x30;
    return digit >= 1 && digit <= 6 ? digit : 0;
}
|
|
138
|
+
function pushHeading(state, level, label) {
|
|
139
|
+
while (state.stack.length >= level)
|
|
140
|
+
state.stack.pop();
|
|
141
|
+
state.stack.push(label);
|
|
142
|
+
}
|
|
143
|
+
// True when every code unit of `text` in [start, end) is a space, tab, LF or CR. An empty
// range is whitespace-only.
function isWhitespaceOnly(text, start, end) {
    let pos = start;
    while (pos < end) {
        switch (text.charCodeAt(pos)) {
            case 0x20:
            case 0x09:
            case 0x0a:
            case 0x0d:
                pos += 1;
                break;
            default:
                return false;
        }
    }
    return true;
}
|
|
151
|
+
function resetBlock(state) {
|
|
152
|
+
state.pendingBlockStart = null;
|
|
153
|
+
state.pendingBlockHasText = false;
|
|
154
|
+
state.blockText = "";
|
|
155
|
+
}
|
|
156
|
+
// Closes the in-progress block at raw offset `end` and, if it contains visible text, emits
// it as one `html-block` unit. In every outcome the pending block is reset afterwards.
//
// Nothing is emitted when no block is open, when the block is empty or whitespace-only, or
// when `shouldStop` reports a limit; in the limit case an "info" diagnostic is recorded
// and `state.stopped` is set so the scan loop ends.
//
// Unit offsets index the CLEANED projection (the whitespace-collapsed block texts joined by
// single newlines), not the raw HTML, so citation slices and chunk spans never expose tags,
// <script>/<style> bodies, or embedded secrets.
function flushBlock(state, end) {
    const blockStart = state.pendingBlockStart;
    if (blockStart === null)
        return;
    if (end <= blockStart || !state.pendingBlockHasText) {
        resetBlock(state);
        return;
    }
    const limit = shouldStop(state.startedAt, state.options, state.units.length);
    const limitReached = limit.stop && limit.code !== undefined && limit.message !== undefined;
    if (limitReached) {
        const note = diagnostic(limit.code, limit.message, state.input.documentId, "info");
        state.diagnostics.push(note);
        state.stopped = true;
        resetBlock(state);
        return;
    }
    const cleaned = collapseWhitespace(state.blockText);
    if (cleaned.length === 0) {
        resetBlock(state);
        return;
    }
    const isFirstBlock = state.cleanedParts.length === 0;
    if (!isFirstBlock) {
        // One newline separates consecutive blocks; it lies in the gap between unit ranges.
        state.cleanedParts.push("\n");
        state.cleanedOffset += 1;
    }
    const characterStart = state.cleanedOffset;
    state.cleanedParts.push(cleaned);
    state.cleanedOffset = characterStart + cleaned.length;
    state.units.push({
        kind: "html-block",
        documentId: state.input.documentId,
        headingPath: [...state.heading.stack],
        characterStart,
        characterEnd: state.cleanedOffset,
    });
    resetBlock(state);
}
|
|
194
|
+
function openBlock(state, at, hasText) {
|
|
195
|
+
if (state.pendingBlockStart === null) {
|
|
196
|
+
state.pendingBlockStart = at;
|
|
197
|
+
state.blockText = "";
|
|
198
|
+
}
|
|
199
|
+
if (hasText)
|
|
200
|
+
state.pendingBlockHasText = true;
|
|
201
|
+
}
|
|
202
|
+
// Handles an opening `<h1>`-`<h6>` tag: the block in progress is flushed at the tag's start,
// then heading-label capture begins (empty label, remembered level). Returns the offset
// just past the tag.
function handleHeadingOpen(state, tag, level) {
    const { start, end } = tag;
    flushBlock(state, start);
    state.pendingHeadingLabel = "";
    state.pendingHeadingLevel = level;
    return end;
}
|
|
208
|
+
// Handles a closing heading tag: the captured label (trimmed) is pushed onto the heading
// stack when it is non-empty and a heading level was recorded, label capture ends, and the
// next block is marked as starting right after the tag. Returns the offset past the tag.
function handleHeadingClose(state, tag) {
    const level = state.pendingHeadingLevel;
    const label = (state.pendingHeadingLabel ?? "").trim();
    const hasUsableHeading = label.length > 0 && level > 0;
    if (hasUsableHeading)
        pushHeading(state.heading, level, label);
    state.pendingHeadingLabel = null;
    state.pendingHeadingLevel = 0;
    state.pendingBlockStart = tag.end;
    return tag.end;
}
|
|
218
|
+
// Routes the text run [from, to) of the raw document. While a heading is being captured the
// run extends the heading label; otherwise it is appended to the current block, opening one
// if necessary. Only inter-tag runs arrive here, so inline tag literals (<b>, <a …>) never
// enter the buffer and the cleaned projection is tag-free by construction.
function appendTextRun(state, from, to) {
    if (from >= to)
        return;
    const run = state.text.slice(from, to);
    if (state.pendingHeadingLabel !== null) {
        state.pendingHeadingLabel += run;
        return;
    }
    const hasVisibleText = !isWhitespaceOnly(state.text, from, to);
    openBlock(state, from, hasVisibleText);
    state.blockText += run;
}
|
|
230
|
+
// Dispatches one tag and returns the offset at which scanning resumes.
//   * Opening raw-text tag (script/style/noscript): the current block is flushed at the tag
//     boundary so the raw bytes never reach the unit stream, and scanning resumes after the
//     matching close tag; the next text run reopens a block.
//   * Opening / closing heading tag: delegated to the heading handlers.
//   * Anything else is skipped; scanning resumes right after the tag.
function handleTag(state, tag) {
    const isOpening = tag.kind === "open";
    if (isOpening && RAW_TEXT_TAGS.has(tag.name)) {
        flushBlock(state, tag.start);
        return skipRawText(state.text, state.textLower, tag.name, tag.end);
    }
    const level = headingLevel(tag.name);
    if (level > 0) {
        if (isOpening)
            return handleHeadingOpen(state, tag, level);
        if (tag.kind === "close")
            return handleHeadingClose(state, tag);
    }
    return tag.end;
}
|
|
244
|
+
// Consumes one scanner event at `cursor`, applies it to `state`, and returns the next
// cursor position. At end of input the pending block is flushed.
function step(state, cursor) {
    const event = nextEvent(state.text, cursor);
    switch (event.kind) {
        case "eof":
            flushBlock(state, state.text.length);
            return state.text.length;
        case "text":
            appendTextRun(state, event.start, event.end);
            return event.next;
        case "marker":
            return event.next;
        default:
            return handleTag(state, event.tag);
    }
}
|
|
258
|
+
// Scans the decoded document once and returns the emitted units, the diagnostics, and the
// cleaned text projection (block texts joined by the separators recorded while flushing).
// Scanning ends at end of input or as soon as a limit marks the state as stopped; whatever
// block is still pending is flushed before returning.
function emitHtml(text, input, options) {
    // Lower-cased view computed once and shared with every raw-text skip.
    const textLower = text.toLowerCase();
    const startedAt = options.now();
    const state = {
        text,
        textLower,
        input,
        options,
        startedAt,
        units: [],
        diagnostics: [],
        heading: { stack: [] },
        pendingBlockStart: null,
        pendingBlockHasText: false,
        pendingHeadingLabel: null,
        pendingHeadingLevel: 0,
        stopped: false,
        blockText: "",
        cleanedParts: [],
        cleanedOffset: 0,
    };
    for (let cursor = 0; cursor < text.length && !state.stopped;) {
        cursor = step(state, cursor);
    }
    flushBlock(state, text.length);
    const { units, diagnostics, cleanedParts } = state;
    return { units, diagnostics, normalizedText: cleanedParts.join("") };
}
|
|
288
|
+
// The HTML parser adapter: a frozen object exposing its capability descriptor and `parse`.
//
// `parse` rejects inputs larger than `options.maxBytes` with an empty result carrying an
// oversize diagnostic. Otherwise it decodes the bytes, runs the scanner, and returns the
// units and diagnostics together with the cleaned text projection.
export const htmlParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        matches: (input) => isHtml(input),
    }),
    parse: (input, options) => {
        const { documentId, bytes } = input;
        const byteLength = bytes.byteLength;
        if (byteLength > options.maxBytes) {
            const oversize = oversizeDiagnostic(documentId, byteLength, options.maxBytes);
            return emptyResult(htmlParser.capability, documentId, options, [oversize]);
        }
        const { text } = decodeUtf8(bytes);
        const emission = emitHtml(text, input, options);
        const base = emptyResult(htmlParser.capability, documentId, options, emission.diagnostics, emission.units);
        return {
            ...base,
            // GRD-003: persist the cleaned, tag/script/style-free projection so extract.ts NEVER
            // falls back to raw bytes (which carried <script> bodies + embedded secrets).
            normalizedText: emission.normalizedText,
        };
    },
});
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export { DEFAULT_MAX_BYTES, DEFAULT_MAX_NESTING_DEPTH, DEFAULT_MAX_OBJECTS, DEFAULT_MAX_UNITS, DEFAULT_TIMEOUT_MS, type AsyncParserAdapter, PARSER_ERROR_CODES, type ParserAdapter, type ParserCapability, type ParserErrorCode, type ParserOptions, type ParserRegistry, type ParserResolution, type ParserSelectionInput, } from "./types.js";
|
|
2
|
+
export type { ParsedUnit, ParserDiagnostic, ParserResult } from "./types.js";
|
|
3
|
+
export { buildParserOptions, createParserRegistry, registerParser, resolveParser, unsupportedParser, } from "./registry.js";
|
|
4
|
+
export { textParser } from "./text-parser.js";
|
|
5
|
+
export { jsonParser } from "./json-parser.js";
|
|
6
|
+
export { csvParser } from "./csv-parser.js";
|
|
7
|
+
export { htmlParser } from "./html-parser.js";
|
|
8
|
+
export { pdfParser } from "./pdf-parser.js";
|
|
9
|
+
export { docxParser } from "./docx-parser.js";
|
|
10
|
+
export { xlsxParser } from "./xlsx-parser.js";
|
|
11
|
+
import type { ParserRegistry } from "./types.js";
|
|
12
|
+
/**
 * Creates a parser registry that already has the package's shipped format adapters
 * registered.
 *
 * @returns The pre-populated registry.
 */
export declare function createDefaultParserRegistry(): ParserRegistry;
|
|
13
|
+
export { nullOcrAdapter, createOcrPipelineParser, type OcrAdapter, type OcrPageResult, type OcrPipelineAdapter, } from "./ocr/index.js";
|
|
14
|
+
export { runProgressiveExtraction, createProgressivePdfExtractor, syntheticStreamingSource, syntheticProgressiveExtractor, discoverExtractionCapabilities, probeOcrCapability, probeMultimodalCapability, nullMultimodalAdapter, classifyLargeDocument, usesProgressivePath, isLegacyBinaryOfficeFormat, legacyFormatGuidance, legacyFormatDiagnostic, largeDocumentDiagnostic, type ProgressiveExtractor, type ProgressiveExtractionSource, type ProgressiveExtractionWindow, type ProgressiveExtractionOptions, type ProgressiveExtractionSink, type ProgressiveExtractionSummary, type ProgressiveStopReason, type OcrPageFn, type ProgressivePdfExtractorDeps, type SyntheticStreamingConfig, type CapabilityProbeDeps, type MultimodalAdapter, type MultimodalResult, type PreflightInput, } from "./large-document/index.js";
|
|
15
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parsers/index.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,iBAAiB,EACjB,yBAAyB,EACzB,mBAAmB,EACnB,iBAAiB,EACjB,kBAAkB,EAClB,KAAK,kBAAkB,EACvB,kBAAkB,EAClB,KAAK,aAAa,EAClB,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,KAAK,aAAa,EAClB,KAAK,cAAc,EACnB,KAAK,gBAAgB,EACrB,KAAK,oBAAoB,GAC1B,MAAM,YAAY,CAAC;AACpB,YAAY,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE7E,OAAO,EACL,kBAAkB,EAClB,oBAAoB,EACpB,cAAc,EACd,aAAa,EACb,iBAAiB,GAClB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAY9C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAGjD,wBAAgB,2BAA2B,IAAI,cAAc,CAa5D;AAGD,OAAO,EACL,cAAc,EACd,uBAAuB,EACvB,KAAK,UAAU,EACf,KAAK,aAAa,EAClB,KAAK,kBAAkB,GACxB,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EACL,wBAAwB,EACxB,6BAA6B,EAC7B,wBAAwB,EACxB,6BAA6B,EAC7B,8BAA8B,EAC9B,kBAAkB,EAClB,yBAAyB,EACzB,qBAAqB,EACrB,qBAAqB,EACrB,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,sBAAsB,EACtB,uBAAuB,EACvB,KAAK,oBAAoB,EACzB,KAAK,2BAA2B,EAChC,KAAK,2BAA2B,EAChC,KAAK,4BAA4B,EACjC,KAAK,yBAAyB,EAC9B,KAAK,4BAA4B,EACjC,KAAK,qBAAqB,EAC1B,KAAK,SAAS,EACd,KAAK,2BAA2B,EAChC,KAAK,wBAAwB,EAC7B,KAAK,mBAAmB,EACxB,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,EACrB,KAAK,cAAc,GACpB,MAAM,2BAA2B,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// Public surface of the parser registry + format adapters (Epic #189, Issue #266). Exposed
|
|
2
|
+
// from the package root via `packages/keiko-local-knowledge/src/index.ts` so callers can
|
|
3
|
+
// import everything as `@oscharko-dev/keiko-local-knowledge`.
|
|
4
|
+
export { DEFAULT_MAX_BYTES, DEFAULT_MAX_NESTING_DEPTH, DEFAULT_MAX_OBJECTS, DEFAULT_MAX_UNITS, DEFAULT_TIMEOUT_MS, PARSER_ERROR_CODES, } from "./types.js";
|
|
5
|
+
export { buildParserOptions, createParserRegistry, registerParser, resolveParser, unsupportedParser, } from "./registry.js";
|
|
6
|
+
export { textParser } from "./text-parser.js";
|
|
7
|
+
export { jsonParser } from "./json-parser.js";
|
|
8
|
+
export { csvParser } from "./csv-parser.js";
|
|
9
|
+
export { htmlParser } from "./html-parser.js";
|
|
10
|
+
export { pdfParser } from "./pdf-parser.js";
|
|
11
|
+
export { docxParser } from "./docx-parser.js";
|
|
12
|
+
export { xlsxParser } from "./xlsx-parser.js";
|
|
13
|
+
// Convenience: a registry pre-populated with every shipped adapter. Resolution order is
|
|
14
|
+
// JSON → CSV/TSV → HTML → PDF → DOCX → XLSX → text. Text registers last because its `matches` predicate is
|
|
15
|
+
// the most permissive (accepts any `text/*`); registering it first would shadow CSV and HTML.
|
|
16
|
+
import { csvParser } from "./csv-parser.js";
|
|
17
|
+
import { docxParser } from "./docx-parser.js";
|
|
18
|
+
import { htmlParser } from "./html-parser.js";
|
|
19
|
+
import { jsonParser } from "./json-parser.js";
|
|
20
|
+
import { pdfParser } from "./pdf-parser.js";
|
|
21
|
+
import { createParserRegistry, registerParser } from "./registry.js";
|
|
22
|
+
import { textParser } from "./text-parser.js";
|
|
23
|
+
import { xlsxParser } from "./xlsx-parser.js";
|
|
24
|
+
/**
 * Builds a parser registry pre-populated with every adapter imported by this module.
 *
 * Adapters are registered in this order: JSON, CSV, HTML, PDF, DOCX, XLSX, text.
 *
 * @returns The registry produced by folding `registerParser` over the adapter list.
 */
export function createDefaultParserRegistry() {
    // The text parser stays at the end of the list: its `matches` predicate is the most
    // permissive (it accepts any `text/*` media type), so it must not shadow the
    // structured adapters registered before it.
    const adaptersInOrder = [
        jsonParser,
        csvParser,
        htmlParser,
        pdfParser,
        docxParser,
        xlsxParser,
        textParser,
    ];
    return adaptersInOrder.reduce((registry, adapter) => registerParser(registry, adapter), createParserRegistry());
}
|
|
38
|
+
// OCR adapter seam (Issue #202).
|
|
39
|
+
export { nullOcrAdapter, createOcrPipelineParser, } from "./ocr/index.js";
|
|
40
|
+
// Bounded large-document ingestion parser layer (Epic #1160, Issue #1286).
|
|
41
|
+
export { runProgressiveExtraction, createProgressivePdfExtractor, syntheticStreamingSource, syntheticProgressiveExtractor, discoverExtractionCapabilities, probeOcrCapability, probeMultimodalCapability, nullMultimodalAdapter, classifyLargeDocument, usesProgressivePath, isLegacyBinaryOfficeFormat, legacyFormatGuidance, legacyFormatDiagnostic, largeDocumentDiagnostic, } from "./large-document/index.js";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/json-parser.ts"],"names":[],"mappings":"AAqBA,OAAO,KAAK,EAEV,aAAa,EAGd,MAAM,YAAY,CAAC;AAoLpB,eAAO,MAAM,UAAU,EAAE,aAkDvB,CAAC"}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
// JSON parser adapter (Epic #189, Issue #266). Pure: bytes -> ParserResult.
|
|
2
|
+
//
|
|
3
|
+
// Emits one ParsedUnit { kind: "json-path", jsonPointer } per LEAF value (string / number /
|
|
4
|
+
// boolean / null / empty array / empty object). Pointers follow RFC 6901 — keys are encoded
|
|
5
|
+
// with `~0` for `~` and `~1` for `/`. Object keys with the same name are NOT deduplicated;
|
|
6
|
+
// callers see one unit per traversal step so duplicate keys (which `JSON.parse` itself
|
|
7
|
+
// resolves by last-wins) collapse the same way they do in the parsed object.
|
|
8
|
+
//
|
|
9
|
+
// Character offsets target a normalized line-oriented projection, not the raw JSON bytes:
|
|
10
|
+
// /json/pointer: <leaf-json>
|
|
11
|
+
// This keeps downstream chunking bounded by the selected leaf instead of multiplying the whole
|
|
12
|
+
// document once per leaf while preserving the JSON Pointer citation.
|
|
13
|
+
import { decodeUtf8, diagnostic, emptyResult, oversizeDiagnostic, shouldStop, } from "./_internal.js";
|
|
14
|
+
const PARSER_ID = "json";
|
|
15
|
+
const PARSER_VERSION = "1";
|
|
16
|
+
const JSON_EXTENSIONS = new Set(["json", "jsonl", "ndjson"]);
// Media types routed to this parser. `application/x-ndjson` is included so that inputs the
// JSON Lines detection in this file recognises by media type are also matched here; without
// it, NDJSON identified only by media type would never reach the parser.
const JSON_MEDIA_TYPES = new Set([
    "application/json",
    "application/ld+json",
    "application/x-ndjson",
]);
/**
 * Decides whether an input should be handled by the JSON parser.
 *
 * @param input Object exposing string `extension` and `mediaType` fields; both are compared
 *   case-insensitively.
 * @returns `true` when the extension is `json`, `jsonl` or `ndjson`, or when the media type is
 *   `application/json`, `application/ld+json` or `application/x-ndjson`.
 */
function isJson(input) {
    if (JSON_EXTENSIONS.has(input.extension.toLowerCase()))
        return true;
    return JSON_MEDIA_TYPES.has(input.mediaType.toLowerCase());
}
|
|
24
|
+
/**
 * Escapes one object key for use as a JSON Pointer (RFC 6901) reference token.
 *
 * @param key Raw object key.
 * @returns The key with every `~` rewritten to `~0` and every `/` rewritten to `~1`.
 */
function encodePointerSegment(key) {
    // Tildes are escaped first; doing slashes first would re-escape the `~` of each `~1`.
    const tildesEscaped = key.split("~").join("~0");
    return tildesEscaped.split("/").join("~1");
}
|
|
28
|
+
/**
 * Appends an already-encoded reference token to a parent JSON Pointer.
 *
 * @param parent Parent pointer (the empty string denotes the document root).
 * @param segment Reference token to append.
 * @returns `parent`, a `/` separator, and `segment` concatenated.
 */
function joinPointer(parent, segment) {
    return "".concat(parent, "/", segment);
}
|
|
31
|
+
/**
 * Serialises a leaf value to its JSON text form.
 *
 * @param value Leaf value to serialise.
 * @returns The result of `JSON.stringify(value)`.
 */
function stringifyLeaf(value) {
    return JSON.stringify(value);
}
/**
 * Appends one `<pointer>: <leaf-json>` line to the normalized projection held on `ctx`.
 *
 * @param ctx Parse context; `normalizedParts` receives the line and `normalizedLength` is
 *   advanced by the line's length.
 * @param pointer JSON Pointer of the leaf; the empty (root) pointer is rendered as `/`.
 * @param value Leaf value to serialise.
 * @returns `{ start, end }` character offsets of the appended line (end exclusive, newline
 *   included) within the normalized text.
 */
function appendNormalizedLeaf(ctx, pointer, value) {
    const rendered = (pointer.length > 0 ? pointer : "/") + ": " + stringifyLeaf(value) + "\n";
    const start = ctx.normalizedLength;
    const end = start + rendered.length;
    ctx.normalizedParts.push(rendered);
    ctx.normalizedLength = end;
    return { start, end };
}
|
|
42
|
+
/**
 * Records one leaf value as a `json-path` unit unless the limit check says to stop.
 *
 * When `shouldStop` reports a stop together with both a code and a message, an `info`
 * diagnostic is appended, `ctx.stopped` is set, and nothing is emitted for this leaf.
 * Otherwise the leaf is appended to the normalized projection and a unit carrying the
 * resulting character span is pushed onto `ctx.units`.
 *
 * @param ctx Parse context (units, diagnostics, options, start time, input).
 * @param pointer JSON Pointer of the leaf.
 * @param value Leaf value.
 */
function pushLeaf(ctx, pointer, value) {
    const verdict = shouldStop(ctx.startedAt, ctx.options, ctx.units.length);
    const mustHalt = verdict.stop && verdict.code !== undefined && verdict.message !== undefined;
    if (mustHalt) {
        const notice = diagnostic(verdict.code, verdict.message, ctx.input.documentId, "info");
        ctx.diagnostics.push(notice);
        ctx.stopped = true;
        return;
    }
    const { start, end } = appendNormalizedLeaf(ctx, pointer, value);
    ctx.units.push({
        kind: "json-path",
        documentId: ctx.input.documentId,
        jsonPointer: pointer,
        characterStart: start,
        characterEnd: end,
    });
}
|
|
58
|
+
/**
 * Tells whether a parsed JSON value is a leaf, i.e. has nothing to descend into.
 *
 * @param value Any parsed value.
 * @returns `true` for `null`, every non-object value, empty arrays and objects without own
 *   enumerable keys; `false` for non-empty arrays and objects.
 */
function isLeaf(value) {
    if (typeof value !== "object" || value === null)
        return true;
    const childCount = Array.isArray(value) ? value.length : Object.keys(value).length;
    return childCount === 0;
}
|
|
70
|
+
/**
 * Records an `error` diagnostic for exceeding `maxNestingDepth` and halts the traversal.
 *
 * @param ctx Parse context; receives the diagnostic and has `stopped` set to `true`.
 * @param pointer JSON Pointer where the limit was hit; the root pointer is shown as `/`.
 */
function pushNestingLimit(ctx, pointer) {
    const configuredLimit = String(ctx.options.maxNestingDepth);
    const location = pointer.length === 0 ? "/" : pointer;
    const message = `JSON nesting depth exceeds maxNestingDepth=${configuredLimit} at ${location}`;
    ctx.diagnostics.push(diagnostic("NESTING_LIMIT_REACHED", message, ctx.input.documentId, "error"));
    ctx.stopped = true;
}
|
|
75
|
+
/**
 * Walks every element of a non-empty array, using the element index as the pointer token.
 *
 * Iteration ends as soon as `ctx.stopped` becomes true.
 *
 * @param ctx Parse context.
 * @param value Array to traverse.
 * @param pointer JSON Pointer of the array itself.
 * @param depth Nesting depth of the array; children are walked at `depth + 1`.
 */
function descendArray(ctx, value, pointer, depth) {
    const childDepth = depth + 1;
    let index = 0;
    while (index < value.length && !ctx.stopped) {
        walk(ctx, value[index], joinPointer(pointer, String(index)), childDepth);
        index += 1;
    }
}
|
|
82
|
+
/**
 * Walks every own enumerable key of a non-empty object, using the RFC 6901-encoded key as
 * the pointer token.
 *
 * Iteration ends as soon as `ctx.stopped` becomes true.
 *
 * @param ctx Parse context.
 * @param value Object to traverse.
 * @param pointer JSON Pointer of the object itself.
 * @param depth Nesting depth of the object; children are walked at `depth + 1`.
 */
function descendObject(ctx, value, pointer, depth) {
    const childDepth = depth + 1;
    for (const key of Object.keys(value)) {
        if (ctx.stopped)
            break;
        const childPointer = joinPointer(pointer, encodePointerSegment(key));
        walk(ctx, value[key], childPointer, childDepth);
    }
}
|
|
89
|
+
/**
 * Recursive traversal step: emits a unit for a leaf, otherwise descends into the container.
 *
 * Does nothing once `ctx.stopped` is set. The leaf check runs before the depth check, so a
 * leaf is still emitted at a depth where a container would trip the nesting limit.
 *
 * @param ctx Parse context.
 * @param value Current parsed value.
 * @param pointer JSON Pointer of `value`.
 * @param depth Nesting depth of `value`.
 */
function walk(ctx, value, pointer, depth) {
    if (ctx.stopped) {
        return;
    }
    if (isLeaf(value)) {
        pushLeaf(ctx, pointer, value);
    }
    else if (depth >= ctx.options.maxNestingDepth) {
        pushNestingLimit(ctx, pointer);
    }
    else if (Array.isArray(value)) {
        descendArray(ctx, value, pointer, depth);
    }
    else {
        descendObject(ctx, value, pointer, depth);
    }
}
|
|
106
|
+
/**
 * Runs `JSON.parse` and reports the outcome as a result object instead of throwing.
 *
 * @param text JSON text to parse.
 * @returns `{ ok: true, value }` on success, or `{ ok: false, error }` where `error` is the
 *   thrown error's `name` (or `"SyntaxError"` when the thrown value is not an `Error`).
 */
function parseJsonValue(text) {
    let value;
    try {
        value = JSON.parse(text);
    }
    catch (cause) {
        // Only the error TYPE name is surfaced, never the raw parser message: modern Node
        // embeds a fragment of the surrounding document text in JSON.parse error messages,
        // which would leak indexed content into the persisted diagnostic shown in the
        // capsule detail UI.
        const error = cause instanceof Error ? cause.name : "SyntaxError";
        return { ok: false, error };
    }
    return { ok: true, value };
}
|
|
117
|
+
// GRD-011: JSON Lines / NDJSON is a sequence of independent JSON values, one per line — NOT a
|
|
118
|
+
// single JSON document. Whole-document `JSON.parse` always throws on a multi-record file, so
|
|
119
|
+
// `.jsonl`/`.ndjson` (advertised as supported) were 100% unparseable. Detect and parse per line.
|
|
120
|
+
const JSONL_EXTENSIONS = new Set(["jsonl", "ndjson"]);
/**
 * Decides whether an input is JSON Lines / NDJSON rather than a single JSON document.
 *
 * @param input Object exposing string `extension` and `mediaType` fields; both are compared
 *   case-insensitively.
 * @returns `true` when the extension is `jsonl` or `ndjson`, or the media type is
 *   `application/x-ndjson`.
 */
function isJsonLines(input) {
    const matchesExtension = JSONL_EXTENSIONS.has(input.extension.toLowerCase());
    return matchesExtension || input.mediaType.toLowerCase() === "application/x-ndjson";
}
|
|
126
|
+
// Parse each non-empty line independently and walk it under an `/<lineIndex>` pointer prefix
|
|
127
|
+
// (RFC 6901 array-index style), so a record on file line N cites as `/N/...`. A malformed line
|
|
128
|
+
// is surfaced as a non-fatal `warning` diagnostic and skipped — one bad line never discards the
|
|
129
|
+
// whole file's good records.
|
|
130
|
+
/**
 * Parses `ctx.text` as JSON Lines: each non-blank line is parsed on its own and walked under
 * the pointer prefix `/<lineIndex>`, where the index is the zero-based position of the line
 * after splitting on `\n` (blank lines still consume an index).
 *
 * A line that fails to parse adds a `warning` diagnostic and is skipped. The loop ends as soon
 * as `ctx.stopped` is set.
 *
 * @param ctx Parse context holding the decoded text, diagnostics and traversal state.
 */
function walkJsonLines(ctx) {
    for (const [lineIndex, rawLine] of ctx.text.split("\n").entries()) {
        if (ctx.stopped)
            return;
        if (rawLine.trim().length === 0)
            continue;
        const outcome = parseJsonValue(rawLine);
        if (outcome.ok) {
            walk(ctx, outcome.value, joinPointer("", String(lineIndex)), 0);
            continue;
        }
        ctx.diagnostics.push(diagnostic("MALFORMED_INPUT", `JSONL line ${String(lineIndex)} parse failed: ${outcome.error}`, ctx.input.documentId, "warning"));
    }
}
|
|
146
|
+
/**
 * JSON / JSON Lines parser adapter.
 *
 * `capability.matches` delegates to `isJson`. `parse` proceeds as follows:
 *  1. Inputs larger than `options.maxBytes` yield an empty result with an oversize diagnostic.
 *  2. Bytes are decoded; a non-JSONL document is parsed as a whole, and a parse failure
 *     yields an empty result with a `MALFORMED_INPUT` error diagnostic.
 *  3. The value (or each JSONL line) is walked, collecting one unit per leaf.
 *  4. If a `NESTING_LIMIT_REACHED` diagnostic was recorded, the collected units are discarded
 *     and only the diagnostics are returned; otherwise the units and the joined normalized
 *     text are returned.
 */
export const jsonParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        matches: (input) => isJson(input),
    }),
    parse: (input, options) => {
        const { capability } = jsonParser;
        const { documentId } = input;
        const byteLength = input.bytes.byteLength;
        if (byteLength > options.maxBytes) {
            const tooLarge = oversizeDiagnostic(documentId, byteLength, options.maxBytes);
            return emptyResult(capability, documentId, options, [tooLarge]);
        }
        const { text } = decodeUtf8(input.bytes);
        const jsonLines = isJsonLines(input);
        // Whole-document parse only for true JSON; JSONL is parsed line-by-line below.
        let root;
        if (!jsonLines) {
            const outcome = parseJsonValue(text);
            if (!outcome.ok) {
                const malformed = diagnostic("MALFORMED_INPUT", `JSON parse failed: ${outcome.error}`, documentId, "error");
                return emptyResult(capability, documentId, options, [malformed]);
            }
            root = outcome.value;
        }
        // The clock is read only after the whole-document parse has succeeded.
        const ctx = {
            text,
            input,
            options,
            startedAt: options.now(),
            units: [],
            diagnostics: [],
            normalizedParts: [],
            normalizedLength: 0,
            stopped: false,
        };
        if (jsonLines) {
            walkJsonLines(ctx);
        }
        else {
            walk(ctx, root, "", 0);
        }
        const hitNestingLimit = ctx.diagnostics.some((entry) => entry.code === "NESTING_LIMIT_REACHED");
        if (hitNestingLimit) {
            return emptyResult(capability, documentId, options, ctx.diagnostics);
        }
        const base = emptyResult(capability, documentId, options, ctx.diagnostics, ctx.units);
        return { ...base, normalizedText: ctx.normalizedParts.join("") };
    },
});
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { ExtractionCapabilityAvailability, ExtractionCapabilityStatus } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { OcrAdapter } from "../ocr/types.js";
|
|
3
|
+
/**
 * Outcome of a multimodal image description, discriminated on `ok`:
 * success carries the produced `text`, failure carries a `reason` code.
 */
export type MultimodalResult =
    | { readonly ok: true; readonly text: string }
    | { readonly ok: false; readonly reason: "not-configured" | "timeout" | "unsupported-input" };
|
|
10
|
+
/** Adapter contract for describing an image; tagged with the literal `kind: "multimodal"`. */
export interface MultimodalAdapter {
    /** Literal discriminator for this adapter kind. */
    readonly kind: "multimodal";
    /** Takes image bytes plus a page number and resolves to a `MultimodalResult`. */
    readonly describeImage: (input: { readonly bytes: Uint8Array; readonly pageNumber: number }) => Promise<MultimodalResult>;
}
|
|
17
|
+
/**
 * A ready-made `MultimodalAdapter` value.
 * NOTE(review): the name suggests a no-op placeholder for when no multimodal backend is
 * configured; the implementation is not visible in this declaration file — confirm.
 */
export declare const nullMultimodalAdapter: MultimodalAdapter;
|
|
18
|
+
/**
 * Optional dependencies accepted by `discoverExtractionCapabilities`; every member may be
 * omitted.
 */
export interface CapabilityProbeDeps {
    /** OCR adapter to probe. */
    readonly ocr?: OcrAdapter;
    /** Multimodal adapter to probe. */
    readonly multimodal?: MultimodalAdapter;
    /** Clock function returning a number; presumably epoch milliseconds — TODO confirm. */
    readonly now?: () => number;
    /** Probe timeout; presumably milliseconds, going by the name — TODO confirm. */
    readonly probeTimeoutMs?: number;
}
|
|
24
|
+
/**
 * Probes an OCR adapter and resolves to its `ExtractionCapabilityStatus`.
 *
 * @param adapter OCR adapter to probe; may be `undefined`.
 * @param timeoutMs Timeout for the probe (presumably milliseconds, per the name — confirm).
 * @returns A promise of the capability status. How an `undefined` adapter or a timeout is
 *   reported is not visible in this declaration file.
 */
export declare function probeOcrCapability(adapter: OcrAdapter | undefined, timeoutMs: number): Promise<ExtractionCapabilityStatus>;
|
|
25
|
+
/**
 * Probes a multimodal adapter and resolves to its `ExtractionCapabilityStatus`.
 *
 * @param adapter Multimodal adapter to probe; may be `undefined`.
 * @param timeoutMs Timeout for the probe (presumably milliseconds, per the name — confirm).
 * @returns A promise of the capability status. How an `undefined` adapter or a timeout is
 *   reported is not visible in this declaration file.
 */
export declare function probeMultimodalCapability(adapter: MultimodalAdapter | undefined, timeoutMs: number): Promise<ExtractionCapabilityStatus>;
|
|
26
|
+
/**
 * Resolves to an `ExtractionCapabilityAvailability` describing the available extraction
 * capabilities.
 *
 * @param deps Optional probe dependencies (`ocr`, `multimodal`, `now`, `probeTimeoutMs`).
 *   NOTE(review): presumably the OCR and multimodal probes are run with these — the
 *   implementation is not visible in this declaration file.
 * @returns A promise of the combined availability.
 */
export declare function discoverExtractionCapabilities(deps?: CapabilityProbeDeps): Promise<ExtractionCapabilityAvailability>;
|
|
27
|
+
//# sourceMappingURL=capability-discovery.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"capability-discovery.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/capability-discovery.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EACV,gCAAgC,EAChC,0BAA0B,EAC3B,MAAM,+BAA+B,CAAC;AAEvC,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAKlD,MAAM,MAAM,gBAAgB,GACxB;IAAE,QAAQ,CAAC,EAAE,EAAE,IAAI,CAAC;IAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;CAAE,GAC5C;IAAE,QAAQ,CAAC,EAAE,EAAE,KAAK,CAAC;IAAC,QAAQ,CAAC,MAAM,EAAE,gBAAgB,GAAG,SAAS,GAAG,mBAAmB,CAAA;CAAE,CAAC;AAEhG,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC;IAC5B,QAAQ,CAAC,aAAa,EAAE,CAAC,KAAK,EAAE;QAC9B,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;QAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;KAC7B,KAAK,OAAO,CAAC,gBAAgB,CAAC,CAAC;CACjC;AAED,eAAO,MAAM,qBAAqB,EAAE,iBAIlC,CAAC;AAEH,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,GAAG,CAAC,EAAE,UAAU,CAAC;IAC1B,QAAQ,CAAC,UAAU,CAAC,EAAE,iBAAiB,CAAC;IACxC,QAAQ,CAAC,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;IAC5B,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,CAAC;CAClC;AAkBD,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,UAAU,GAAG,SAAS,EAC/B,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,0BAA0B,CAAC,CAqBrC;AAED,wBAAsB,yBAAyB,CAC7C,OAAO,EAAE,iBAAiB,GAAG,SAAS,EACtC,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,0BAA0B,CAAC,CAoBrC;AAED,wBAAsB,8BAA8B,CAClD,IAAI,GAAE,mBAAwB,GAC7B,OAAO,CAAC,gCAAgC,CAAC,CAO3C"}
|