@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
// Plain-text + markdown adapter (Epic #189, Issue #266). Pure: takes bytes, returns a
|
|
2
|
+
// ParserResult. Plain text emits a single section unit covering the whole document; markdown
|
|
3
|
+
// is split on ATX-style headings (`^#{1,6}\\s`) into one section per heading run with the
|
|
4
|
+
// hierarchical sectionPath populated from heading text.
|
|
5
|
+
//
|
|
6
|
+
// Source code (.ts/.js/.py/.yaml/etc) routes here too, matched by extension or a `text/*` media type. The lexical scan
|
|
7
|
+
// runs once over the decoded string and stops emitting units the moment any limit (deadline
|
|
8
|
+
// / cancellation / unit count) trips.
|
|
9
|
+
import { decodeUtf8, diagnostic, emptyResult, oversizeDiagnostic, shouldStop, } from "./_internal.js";
|
|
10
|
+
const PARSER_ID = "text";
|
|
11
|
+
const PARSER_VERSION = "1";
|
|
12
|
+
// Extensions accepted by the text adapter. The list is intentionally explicit so unknown
|
|
13
|
+
// binary extensions fall through to the unsupported adapter rather than being mis-parsed as
|
|
14
|
+
// "plain text". `languageHint` is emitted for source / config files so #195 (chunker) can
|
|
15
|
+
// route them through code-aware splitters when those land.
|
|
16
|
+
const TEXT_EXTENSIONS = new Set([
|
|
17
|
+
"txt",
|
|
18
|
+
"log",
|
|
19
|
+
"md",
|
|
20
|
+
"markdown",
|
|
21
|
+
"rst",
|
|
22
|
+
"adoc",
|
|
23
|
+
"asciidoc",
|
|
24
|
+
"ts",
|
|
25
|
+
"tsx",
|
|
26
|
+
"js",
|
|
27
|
+
"jsx",
|
|
28
|
+
"mjs",
|
|
29
|
+
"cjs",
|
|
30
|
+
"py",
|
|
31
|
+
"rb",
|
|
32
|
+
"go",
|
|
33
|
+
"rs",
|
|
34
|
+
"java",
|
|
35
|
+
"kt",
|
|
36
|
+
"swift",
|
|
37
|
+
"c",
|
|
38
|
+
"cc",
|
|
39
|
+
"cpp",
|
|
40
|
+
"h",
|
|
41
|
+
"hpp",
|
|
42
|
+
"cs",
|
|
43
|
+
"php",
|
|
44
|
+
"sh",
|
|
45
|
+
"bash",
|
|
46
|
+
"zsh",
|
|
47
|
+
"fish",
|
|
48
|
+
"ps1",
|
|
49
|
+
"yaml",
|
|
50
|
+
"yml",
|
|
51
|
+
"toml",
|
|
52
|
+
"ini",
|
|
53
|
+
"cfg",
|
|
54
|
+
"conf",
|
|
55
|
+
"env",
|
|
56
|
+
"properties",
|
|
57
|
+
"sql",
|
|
58
|
+
"graphql",
|
|
59
|
+
"gql",
|
|
60
|
+
]);
|
|
61
|
+
const TEXT_MEDIA_PREFIXES = ["text/"];
|
|
62
|
+
// Reports whether the input should be split on markdown headings.
//
// An input is markdown when its extension is `md` / `markdown` (case-insensitive) or when
// its media type is `text/markdown`. Media-type parameters are ignored for the comparison:
// RFC 7763 makes `charset` a required parameter of `text/markdown`, so a conformant value
// looks like `text/markdown; charset=UTF-8` and would never equal the bare type.
//
// `input.extension` and `input.mediaType` are read; both are expected to be strings.
// Returns a boolean.
function isMarkdown(input) {
    const ext = input.extension.toLowerCase();
    if (ext === "md" || ext === "markdown") {
        return true;
    }
    // Keep only the `type/subtype` part: drop any `; name=value` tail and surrounding padding.
    const [essence] = input.mediaType.toLowerCase().split(";");
    return essence.trim() === "text/markdown";
}
|
|
68
|
+
// Decides whether the text adapter claims this input. The lower-cased extension is looked up
// in TEXT_EXTENSIONS first; only when that misses is the lower-cased media type compared
// against each entry of TEXT_MEDIA_PREFIXES with a prefix match.
function isTextLike(input) {
    if (TEXT_EXTENSIONS.has(input.extension.toLowerCase())) {
        return true;
    }
    const mediaType = input.mediaType.toLowerCase();
    return TEXT_MEDIA_PREFIXES.some((candidate) => mediaType.startsWith(candidate));
}
|
|
79
|
+
// Recognises a line that starts a fenced-code-block delimiter: at most three leading spaces
// followed by a run of three or more backticks or tildes. Returns `{ char, length, rest }`
// (fence character, run length, and whatever follows the run on the line) or `null`.
function matchFenceMarker(line) {
    const found = /^ {0,3}(`{3,}|~{3,})/.exec(line);
    if (found === null) {
        return null;
    }
    const run = found[1];
    return { char: run[0], length: run.length, rest: line.slice(found[0].length) };
}
// Walks `text` line by line (split on "\n") and collects every ATX heading.
//
// Each result is the object returned by `matchAtxHeading` for that line plus
// `characterStart`, the offset of the first character of the line within `text`.
//
// Lines inside a fenced code block are skipped, so `# comment` lines in embedded shell or
// Python snippets do not open bogus sections. Following CommonMark, a fence is closed only
// by a bare run of the same character that is at least as long as the opening run, and an
// unclosed fence extends to the end of the document.
function scanHeadings(text) {
    const out = [];
    let openFence = null;
    let cursor = 0;
    while (cursor < text.length) {
        const newline = text.indexOf("\n", cursor);
        const lineEnd = newline === -1 ? text.length : newline;
        const line = text.slice(cursor, lineEnd);
        const marker = matchFenceMarker(line);
        if (openFence !== null) {
            const closes = marker !== null &&
                marker.char === openFence.char &&
                marker.length >= openFence.length &&
                marker.rest.trim() === "";
            if (closes) {
                openFence = null;
            }
        }
        else if (marker !== null && !(marker.char === "`" && marker.rest.includes("`"))) {
            // A backtick run whose remainder contains another backtick is inline code, not a fence.
            openFence = marker;
        }
        else {
            const heading = matchAtxHeading(line);
            if (heading !== null) {
                out.push({ ...heading, characterStart: cursor });
            }
        }
        // Step past the newline; on the last line this moves the cursor beyond the text.
        cursor = lineEnd + 1;
    }
    return out;
}
|
|
94
|
+
const HASH = 0x23;
|
|
95
|
+
const SPACE = 0x20;
|
|
96
|
+
const TAB = 0x09;
|
|
97
|
+
// True when `code` (a UTF-16 code unit) is horizontal whitespace, i.e. SPACE or TAB.
function isHSpace(code) {
    switch (code) {
        case SPACE:
        case TAB:
            return true;
        default:
            return false;
    }
}
|
|
100
|
+
// Counts the `#` characters at the very start of `line`, never reporting more than six.
function countHashes(line) {
    const limit = Math.min(line.length, 6);
    let count = 0;
    while (count < limit && line.charCodeAt(count) === HASH) {
        count += 1;
    }
    return count;
}
|
|
106
|
+
// Returns the index of the first character at or after `from` that is not horizontal
// whitespace. When `from` is already at or past the end of the line it is returned unchanged.
function skipHSpace(line, from) {
    let index = from;
    for (; index < line.length; index += 1) {
        if (!isHSpace(line.charCodeAt(index))) {
            break;
        }
    }
    return index;
}
|
|
112
|
+
// Computes the exclusive end index of an ATX heading's title inside `line`, never going
// below `start` (the index where the title begins).
//
// Three things are removed from the end of the line, in order:
//   1. trailing spaces / tabs, plus a carriage return left behind by CRLF input (lines are
//      split on "\n" only, so a "\r" would otherwise end up inside the title);
//   2. an optional closing run of `#` — but only when that run is separated from the title
//      by a space or tab, or when it is all that remains. CommonMark requires the separator,
//      so `# C#` keeps its `#` while `# Title ##` loses the closing run;
//   3. the spaces / tabs in front of that closing run.
//
// NOTE(review): whether the decoder upstream already normalises line endings is not visible
// from this module; if it does, the carriage-return handling is simply a no-op.
function trimAtxTrailing(line, start) {
    const CARRIAGE_RETURN = 0x0d;
    let end = line.length;
    while (end > start) {
        const code = line.charCodeAt(end - 1);
        if (!isHSpace(code) && code !== CARRIAGE_RETURN) {
            break;
        }
        end -= 1;
    }
    let hashStart = end;
    while (hashStart > start && line.charCodeAt(hashStart - 1) === HASH) {
        hashStart -= 1;
    }
    const hasHashRun = hashStart < end;
    const isClosingSequence = hasHashRun && (hashStart === start || isHSpace(line.charCodeAt(hashStart - 1)));
    if (!isClosingSequence) {
        // Either there is no trailing `#`, or it is glued to the title and therefore part of it.
        return end;
    }
    end = hashStart;
    while (end > start && isHSpace(line.charCodeAt(end - 1))) {
        end -= 1;
    }
    return end;
}
|
|
122
|
+
// Parses one line as an ATX heading. A match needs one to six leading `#`, then at least one
// space or tab, then a non-empty title once trailing decoration has been trimmed.
// Returns `{ level, text }` for a heading and `null` for any other line.
function matchAtxHeading(line) {
    const hashCount = countHashes(line);
    const levelInRange = hashCount >= 1 && hashCount <= 6;
    const hasSeparator = levelInRange &&
        hashCount < line.length &&
        isHSpace(line.charCodeAt(hashCount));
    if (!hasSeparator) {
        return null;
    }
    const from = skipHSpace(line, hashCount + 1);
    const to = trimAtxTrailing(line, from);
    return to > from ? { level: hashCount, text: line.slice(from, to) } : null;
}
|
|
134
|
+
// Builds one `section` unit for `input.documentId` covering the character range
// [`start`, `end`) and tagged with the heading path `path`.
function sectionUnit(input, path, start, end) {
    const { documentId } = input;
    const unit = {
        kind: "section",
        documentId,
        sectionPath: path,
        characterStart: start,
        characterEnd: end,
    };
    return unit;
}
|
|
143
|
+
// Updates the open-heading stack for `current` and returns the section unit it starts.
//
// `stack` holds `{ level, text }` entries for the headings that currently enclose the cursor.
// Every entry whose level is greater than or equal to the new heading's level is closed
// before the new heading is pushed. Comparing recorded levels (rather than the stack depth)
// keeps siblings as siblings when a document skips levels: `## X` followed by `## Y` yields
// the paths [X] and [Y], and `# A`, `### C`, `### D` yields [A], [A, C], [A, D].
//
// The section runs from the heading's own line up to the start of `next`, or to the end of
// `text` when there is no following heading.
function pushHeading(stack, current, next, text, input) {
    while (stack.length > 0 && stack[stack.length - 1].level >= current.level) {
        stack.pop();
    }
    stack.push({ level: current.level, text: current.text });
    const sectionEnd = next === undefined ? text.length : next.characterStart;
    const path = stack.map((ancestor) => ancestor.text);
    return sectionUnit(input, path, current.characterStart, sectionEnd);
}
// Splits markdown `text` into one section unit per ATX heading.
//
// - With no headings at all the document is handed to `emitPlainSection`.
// - Text in front of the first heading becomes a section with an empty path.
// - `shouldStop` is consulted before each heading with the number of units emitted so far;
//   when it reports a stop that carries both a code and a message, a single "info"
//   diagnostic is recorded and the remaining headings are dropped.
//
// Returns `{ units, diagnostics }`.
function emitMarkdownSections(text, input, options, startedAt) {
    const headings = scanHeadings(text);
    if (headings.length === 0) {
        return emitPlainSection(text, input, options, startedAt);
    }
    const units = [];
    const diagnostics = [];
    const stack = [];
    const firstStart = headings[0].characterStart;
    if (firstStart > 0) {
        units.push(sectionUnit(input, [], 0, firstStart));
    }
    for (const [index, current] of headings.entries()) {
        const limit = shouldStop(startedAt, options, units.length);
        if (limit.stop && limit.code !== undefined && limit.message !== undefined) {
            diagnostics.push(diagnostic(limit.code, limit.message, input.documentId, "info"));
            break;
        }
        units.push(pushHeading(stack, current, headings[index + 1], text, input));
    }
    return { units, diagnostics };
}
|
|
174
|
+
// Emits the whole document as a single section with an empty path.
//
// `shouldStop` is asked once, with zero units emitted. If it reports a stop that carries both
// a code and a message, no unit is produced and that stop is returned as one "info"
// diagnostic instead.
function emitPlainSection(text, input, options, startedAt) {
    const verdict = shouldStop(startedAt, options, 0);
    const halted = verdict.stop && verdict.code !== undefined && verdict.message !== undefined;
    if (halted) {
        const notice = diagnostic(verdict.code, verdict.message, input.documentId, "info");
        return { units: [], diagnostics: [notice] };
    }
    const wholeDocument = {
        kind: "section",
        documentId: input.documentId,
        sectionPath: [],
        characterStart: 0,
        characterEnd: text.length,
    };
    return { units: [wholeDocument], diagnostics: [] };
}
|
|
195
|
+
// Text adapter. `capability.matches` delegates to `isTextLike`. `parse` rejects inputs larger
// than `options.maxBytes` with an oversize diagnostic; otherwise it reads the clock once,
// decodes the bytes, and emits markdown sections or a single plain section depending on
// `isMarkdown`. Both outcomes are wrapped by `emptyResult`.
export const textParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        matches: (input) => isTextLike(input),
    }),
    parse: (input, options) => {
        const byteCount = input.bytes.byteLength;
        if (byteCount > options.maxBytes) {
            const tooLarge = oversizeDiagnostic(input.documentId, byteCount, options.maxBytes);
            return emptyResult(textParser.capability, input.documentId, options, [tooLarge]);
        }
        // The clock is sampled before decoding so decode time counts against the deadline.
        const startedAt = options.now();
        const { text } = decodeUtf8(input.bytes);
        const emit = isMarkdown(input) ? emitMarkdownSections : emitPlainSection;
        const { diagnostics, units } = emit(text, input, options, startedAt);
        return emptyResult(textParser.capability, input.documentId, options, diagnostics, units);
    },
});
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { DocumentId, ParsedUnit, ParserDiagnostic, ParserDependencyVersion, ParserResult } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
export interface ParserOptions {
|
|
3
|
+
readonly maxBytes: number;
|
|
4
|
+
readonly maxUnitsPerDocument: number;
|
|
5
|
+
readonly maxNestingDepth: number;
|
|
6
|
+
readonly maxObjectsPerDocument: number;
|
|
7
|
+
readonly timeoutMs: number;
|
|
8
|
+
readonly signal?: AbortSignal;
|
|
9
|
+
readonly now: () => number;
|
|
10
|
+
}
|
|
11
|
+
export declare const DEFAULT_MAX_BYTES: number;
|
|
12
|
+
export declare const DEFAULT_MAX_UNITS = 50000;
|
|
13
|
+
export declare const DEFAULT_MAX_NESTING_DEPTH = 128;
|
|
14
|
+
export declare const DEFAULT_MAX_OBJECTS = 25000000;
|
|
15
|
+
export declare const DEFAULT_TIMEOUT_MS: number;
|
|
16
|
+
export interface ParserSelectionInput {
|
|
17
|
+
readonly documentId: DocumentId;
|
|
18
|
+
readonly bytes: Uint8Array;
|
|
19
|
+
readonly extension: string;
|
|
20
|
+
readonly mediaType: string;
|
|
21
|
+
readonly languageHint?: string;
|
|
22
|
+
}
|
|
23
|
+
export interface ParserCapability {
|
|
24
|
+
readonly parserId: string;
|
|
25
|
+
readonly parserVersion: string;
|
|
26
|
+
readonly dependencyVersions?: readonly ParserDependencyVersion[];
|
|
27
|
+
readonly matches: (input: ParserSelectionInput) => boolean;
|
|
28
|
+
}
|
|
29
|
+
export interface ParserAdapter {
|
|
30
|
+
readonly capability: ParserCapability;
|
|
31
|
+
readonly parse: (input: ParserSelectionInput, options: ParserOptions) => ParserResult;
|
|
32
|
+
}
|
|
33
|
+
export interface AsyncParserAdapter extends ParserAdapter {
|
|
34
|
+
readonly parseAsync: (input: ParserSelectionInput, options: ParserOptions) => Promise<ParserResult>;
|
|
35
|
+
}
|
|
36
|
+
export interface InternalParserResult extends ParserResult {
|
|
37
|
+
readonly normalizedText?: string;
|
|
38
|
+
}
|
|
39
|
+
export interface ParserRegistry {
|
|
40
|
+
readonly list: () => readonly ParserAdapter[];
|
|
41
|
+
readonly resolve: (input: ParserSelectionInput) => ParserResolution;
|
|
42
|
+
}
|
|
43
|
+
export type ParserResolution = {
|
|
44
|
+
readonly kind: "matched";
|
|
45
|
+
readonly adapter: ParserAdapter;
|
|
46
|
+
} | {
|
|
47
|
+
readonly kind: "unsupported";
|
|
48
|
+
readonly reason: string;
|
|
49
|
+
};
|
|
50
|
+
export type ParserErrorCode = "OVERSIZED_FILE" | "UNIT_LIMIT_REACHED" | "PARSER_TIMEOUT" | "PARSER_CANCELLED" | "NESTING_LIMIT_REACHED" | "OBJECT_LIMIT_REACHED" | "MALFORMED_INPUT" | "UNSUPPORTED_FORMAT";
|
|
51
|
+
export declare const PARSER_ERROR_CODES: readonly ParserErrorCode[];
|
|
52
|
+
export type { ParserResult, ParsedUnit, ParserDiagnostic };
|
|
53
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/parsers/types.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EACV,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,uBAAuB,EACvB,YAAY,EACb,MAAM,+BAA+B,CAAC;AAIvC,MAAM,WAAW,aAAa;IAG5B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE1B,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC;IAGrC,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IAGjC,QAAQ,CAAC,qBAAqB,EAAE,MAAM,CAAC;IAEvC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAG3B,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAG9B,QAAQ,CAAC,GAAG,EAAE,MAAM,MAAM,CAAC;CAC5B;AAED,eAAO,MAAM,iBAAiB,QAAqB,CAAC;AACpD,eAAO,MAAM,iBAAiB,QAAS,CAAC;AACxC,eAAO,MAAM,yBAAyB,MAAM,CAAC;AAC7C,eAAO,MAAM,mBAAmB,WAAa,CAAC;AAC9C,eAAO,MAAM,kBAAkB,QAAiB,CAAC;AAIjD,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAEhC,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAG3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAG3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAG3B,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;CAChC;AAED,MAAM,WAAW,gBAAgB;IAE/B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAG1B,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAG/B,QAAQ,CAAC,kBAAkB,CAAC,EAAE,SAAS,uBAAuB,EAAE,CAAC;IAGjE,QAAQ,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,oBAAoB,KAAK,OAAO,CAAC;CAC5D;AAID,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,UAAU,EAAE,gBAAgB,CAAC;IAKtC,QAAQ,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,oBAAoB,EAAE,OAAO,EAAE,aAAa,KAAK,YAAY,CAAC;CACvF;AAED,MAAM,WAAW,kBAAmB,SAAQ,aAAa;IACvD,QAAQ,CAAC,UAAU,EAAE,CACnB,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,aAAa,KACnB,OAAO,CAAC,YAAY,CAAC,CAAC;CAC5B;AAOD,MAAM,WAAW,oBAAqB,SAAQ,YAAY;IACxD,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,CAAC;CAClC;AAID,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,IAAI,EAAE,MAAM,SAAS,aAAa,EAAE,CAAC;IAI9C,QAAQ,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,oBAAoB,KAAK,gBAAgB,CAAC;CACrE;AAED,MAAM,MAAM,gBAAgB,GACxB;IAAE,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC;IAAC,QAAQ,CAAC,OAAO,EAAE,aAAa,CAAA;CAAE,GAC7D;IAAE,QAAQ,CAAC,IAAI,EAAE,aAAa,CAAC;IAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC;AAI9D,MAAM,MAAM,eAAe,GACvB,gBAAgB,GAChB,oBAAoB,GACpB,gBAAgB,GAChB,kBAAkB,GAClB,uBAAuB,GACvB,sBAAsB,GACtB,iBAAiB,GACjB,oBAAoB,CAAC;AAEzB,eAAO,MAAM,kBAAkB,EAAE,SAAS,eAAe,EAS/C,CAAC;AAIX,YAAY,EAAE,YAAY,EAAE,UAA
U,EAAE,gBAAgB,EAAE,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
// Public types for the parser registry + format adapters (Epic #189, Issue #266).
|
|
2
|
+
//
|
|
3
|
+
// Adapters are pure: `(input, options) -> ParserResult`. No FS, no clock beyond a single
|
|
4
|
+
// injected `now()`, no randomness, and no implicit runtime services. The runtime layer at
|
|
5
|
+
// #194 reads bytes from disk and hands them to a parser; that keeps parsers trivially
|
|
6
|
+
// testable with synthetic strings.
|
|
7
|
+
export const DEFAULT_MAX_BYTES = 1024 * 1024 * 1024;
|
|
8
|
+
export const DEFAULT_MAX_UNITS = 50_000;
|
|
9
|
+
export const DEFAULT_MAX_NESTING_DEPTH = 128;
|
|
10
|
+
export const DEFAULT_MAX_OBJECTS = 25_000_000;
|
|
11
|
+
export const DEFAULT_TIMEOUT_MS = 60 * 60 * 1000;
|
|
12
|
+
// Every diagnostic code a parser adapter may emit. The array is frozen so the shared list
// cannot be reordered or extended at runtime by an importing module; the published type
// already declares it as a readonly array.
export const PARSER_ERROR_CODES = Object.freeze([
    "OVERSIZED_FILE",
    "UNIT_LIMIT_REACHED",
    "PARSER_TIMEOUT",
    "PARSER_CANCELLED",
    "NESTING_LIMIT_REACHED",
    "OBJECT_LIMIT_REACHED",
    "MALFORMED_INPUT",
    "UNSUPPORTED_FORMAT",
]);
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { ParserAdapter, ParserSelectionInput } from "./types.js";
|
|
2
|
+
export declare function classifyUnsupported(input: ParserSelectionInput): string | undefined;
|
|
3
|
+
export declare const unsupportedParser: ParserAdapter;
|
|
4
|
+
//# sourceMappingURL=unsupported-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"unsupported-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/unsupported-parser.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,aAAa,EAAiB,oBAAoB,EAAE,MAAM,YAAY,CAAC;AAwErF,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,oBAAoB,GAAG,MAAM,GAAG,SAAS,CASnF;AAED,eAAO,MAAM,iBAAiB,EAAE,aAuB9B,CAAC"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
// Unsupported-format parser (Epic #189, Issue #266). Emits a single
|
|
2
|
+
// `unsupported-media` ParsedUnit plus a typed `UNSUPPORTED_FORMAT` diagnostic so
|
|
3
|
+
// downstream layers (chunker #195, indexer #196, UI #198) can render a stable signal
|
|
4
|
+
// without sniffing the bytes themselves.
|
|
5
|
+
//
|
|
6
|
+
// CRITICAL: archives (.zip / .tar / .gz / .tar.gz) hit THIS adapter unchanged. We never
|
|
7
|
+
// open them — zip-bomb protection lives in not-decompressing, not in size caps.
|
|
8
|
+
import { diagnostic, emptyResult } from "./_internal.js";
|
|
9
|
+
const PARSER_ID = "unsupported";
|
|
10
|
+
const PARSER_VERSION = "1";
|
|
11
|
+
// Extensions we recognise but cannot handle in this PR. Each maps to a stable reason string
|
|
12
|
+
// so UI surfaces can render distinct guidance (e.g. "PDF — extract via OCR adapter").
|
|
13
|
+
const UNSUPPORTED_EXTENSIONS = Object.freeze({
|
|
14
|
+
pdf: "pdf-not-implemented",
|
|
15
|
+
docx: "docx-not-implemented",
|
|
16
|
+
doc: "legacy-doc-not-implemented",
|
|
17
|
+
png: "image-not-supported",
|
|
18
|
+
jpg: "image-not-supported",
|
|
19
|
+
jpeg: "image-not-supported",
|
|
20
|
+
gif: "image-not-supported",
|
|
21
|
+
bmp: "image-not-supported",
|
|
22
|
+
tif: "image-not-supported",
|
|
23
|
+
tiff: "image-not-supported",
|
|
24
|
+
webp: "image-not-supported",
|
|
25
|
+
mp3: "audio-not-supported",
|
|
26
|
+
wav: "audio-not-supported",
|
|
27
|
+
flac: "audio-not-supported",
|
|
28
|
+
ogg: "audio-not-supported",
|
|
29
|
+
mp4: "video-not-supported",
|
|
30
|
+
mov: "video-not-supported",
|
|
31
|
+
mkv: "video-not-supported",
|
|
32
|
+
avi: "video-not-supported",
|
|
33
|
+
webm: "video-not-supported",
|
|
34
|
+
zip: "archive-not-decompressed",
|
|
35
|
+
tar: "archive-not-decompressed",
|
|
36
|
+
gz: "archive-not-decompressed",
|
|
37
|
+
tgz: "archive-not-decompressed",
|
|
38
|
+
bz2: "archive-not-decompressed",
|
|
39
|
+
"7z": "archive-not-decompressed",
|
|
40
|
+
rar: "archive-not-decompressed",
|
|
41
|
+
exe: "binary-not-supported",
|
|
42
|
+
dll: "binary-not-supported",
|
|
43
|
+
bin: "binary-not-supported",
|
|
44
|
+
so: "binary-not-supported",
|
|
45
|
+
dylib: "binary-not-supported",
|
|
46
|
+
});
|
|
47
|
+
// Magic-byte signatures for content sniffing. Each entry is `(bytes prefix) -> reason`.
|
|
48
|
+
// We compare against the first few bytes so we still classify a `.txt`-named PDF as
|
|
49
|
+
// "pdf-not-implemented" rather than misclassifying it as plain text.
|
|
50
|
+
const MAGIC_BYTES = Object.freeze([
|
|
51
|
+
{ prefix: [0x25, 0x50, 0x44, 0x46], reason: "pdf-not-implemented" }, // %PDF
|
|
52
|
+
{ prefix: [0x50, 0x4b, 0x03, 0x04], reason: "archive-not-decompressed" }, // PK\3\4 (zip / docx / xlsx)
|
|
53
|
+
{ prefix: [0x1f, 0x8b], reason: "archive-not-decompressed" }, // gzip
|
|
54
|
+
{ prefix: [0x89, 0x50, 0x4e, 0x47], reason: "image-not-supported" }, // PNG
|
|
55
|
+
{ prefix: [0xff, 0xd8, 0xff], reason: "image-not-supported" }, // JPEG
|
|
56
|
+
{ prefix: [0x47, 0x49, 0x46, 0x38], reason: "image-not-supported" }, // GIF8
|
|
57
|
+
{ prefix: [0x42, 0x4d], reason: "image-not-supported" }, // BMP
|
|
58
|
+
]);
|
|
59
|
+
// Returns the reason of the first MAGIC_BYTES entry whose prefix equals the leading bytes of
// `bytes`, or `undefined` when no entry matches. Entries longer than the input are skipped.
function magicByteReason(bytes) {
    const hit = MAGIC_BYTES.find(({ prefix }) => bytes.length >= prefix.length &&
        prefix.every((expected, offset) => bytes[offset] === expected));
    return hit === undefined ? undefined : hit.reason;
}
|
|
75
|
+
// Classifies an input the package cannot parse. Returns the reason string, or `undefined`
// when neither the extension nor the leading bytes identify an unsupported format.
//
// The extension table is consulted first so that known formats (e.g. .docx, which IS a ZIP)
// get their specific reason rather than the generic magic-byte one. The lookup is restricted
// to the table's own keys: the `in` operator also sees inherited properties, so an extension
// such as `constructor` or `__proto__` would match and hand back a non-string value from
// Object.prototype as the "reason".
export function classifyUnsupported(input) {
    const lower = input.extension.toLowerCase();
    if (Object.prototype.hasOwnProperty.call(UNSUPPORTED_EXTENSIONS, lower)) {
        return UNSUPPORTED_EXTENSIONS[lower];
    }
    // Unknown or absent extension: fall back to sniffing the leading bytes.
    return magicByteReason(input.bytes);
}
|
|
85
|
+
// Adapter for formats the package does not parse. `capability.matches` is true whenever
// `classifyUnsupported` yields a reason. `parse` never inspects the content beyond that
// classification: it returns one `unsupported-media` unit and one "info"
// `UNSUPPORTED_FORMAT` diagnostic, using "unknown-format" when no reason is available.
export const unsupportedParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        matches: (input) => classifyUnsupported(input) !== undefined,
    }),
    parse: (input, options) => {
        const reason = classifyUnsupported(input) ?? "unknown-format";
        const notice = diagnostic("UNSUPPORTED_FORMAT", `format not parseable in this build (${reason})`, input.documentId, "info");
        const placeholder = { kind: "unsupported-media", documentId: input.documentId, reason };
        return emptyResult(unsupportedParser.capability, input.documentId, options, [notice], [placeholder]);
    },
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"xlsx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/xlsx-parser.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EACV,kBAAkB,EAMnB,MAAM,YAAY,CAAC;AAugBpB,eAAO,MAAM,UAAU,EAAE,kBAyCvB,CAAC"}
|