@gmickel/gno 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/assets/skill/SKILL.md +112 -0
- package/assets/skill/cli-reference.md +327 -0
- package/assets/skill/examples.md +234 -0
- package/assets/skill/mcp-reference.md +159 -0
- package/package.json +90 -0
- package/src/app/constants.ts +313 -0
- package/src/cli/colors.ts +65 -0
- package/src/cli/commands/ask.ts +545 -0
- package/src/cli/commands/cleanup.ts +105 -0
- package/src/cli/commands/collection/add.ts +120 -0
- package/src/cli/commands/collection/index.ts +10 -0
- package/src/cli/commands/collection/list.ts +108 -0
- package/src/cli/commands/collection/remove.ts +64 -0
- package/src/cli/commands/collection/rename.ts +95 -0
- package/src/cli/commands/context/add.ts +67 -0
- package/src/cli/commands/context/check.ts +153 -0
- package/src/cli/commands/context/index.ts +10 -0
- package/src/cli/commands/context/list.ts +109 -0
- package/src/cli/commands/context/rm.ts +52 -0
- package/src/cli/commands/doctor.ts +393 -0
- package/src/cli/commands/embed.ts +462 -0
- package/src/cli/commands/get.ts +356 -0
- package/src/cli/commands/index-cmd.ts +119 -0
- package/src/cli/commands/index.ts +102 -0
- package/src/cli/commands/init.ts +328 -0
- package/src/cli/commands/ls.ts +217 -0
- package/src/cli/commands/mcp/config.ts +300 -0
- package/src/cli/commands/mcp/index.ts +24 -0
- package/src/cli/commands/mcp/install.ts +203 -0
- package/src/cli/commands/mcp/paths.ts +470 -0
- package/src/cli/commands/mcp/status.ts +222 -0
- package/src/cli/commands/mcp/uninstall.ts +158 -0
- package/src/cli/commands/mcp.ts +20 -0
- package/src/cli/commands/models/clear.ts +103 -0
- package/src/cli/commands/models/index.ts +32 -0
- package/src/cli/commands/models/list.ts +214 -0
- package/src/cli/commands/models/path.ts +51 -0
- package/src/cli/commands/models/pull.ts +199 -0
- package/src/cli/commands/models/use.ts +85 -0
- package/src/cli/commands/multi-get.ts +400 -0
- package/src/cli/commands/query.ts +220 -0
- package/src/cli/commands/ref-parser.ts +108 -0
- package/src/cli/commands/reset.ts +191 -0
- package/src/cli/commands/search.ts +136 -0
- package/src/cli/commands/shared.ts +156 -0
- package/src/cli/commands/skill/index.ts +19 -0
- package/src/cli/commands/skill/install.ts +197 -0
- package/src/cli/commands/skill/paths-cmd.ts +81 -0
- package/src/cli/commands/skill/paths.ts +191 -0
- package/src/cli/commands/skill/show.ts +73 -0
- package/src/cli/commands/skill/uninstall.ts +141 -0
- package/src/cli/commands/status.ts +205 -0
- package/src/cli/commands/update.ts +68 -0
- package/src/cli/commands/vsearch.ts +188 -0
- package/src/cli/context.ts +64 -0
- package/src/cli/errors.ts +64 -0
- package/src/cli/format/search-results.ts +211 -0
- package/src/cli/options.ts +183 -0
- package/src/cli/program.ts +1330 -0
- package/src/cli/run.ts +213 -0
- package/src/cli/ui.ts +92 -0
- package/src/config/defaults.ts +20 -0
- package/src/config/index.ts +55 -0
- package/src/config/loader.ts +161 -0
- package/src/config/paths.ts +87 -0
- package/src/config/saver.ts +153 -0
- package/src/config/types.ts +280 -0
- package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
- package/src/converters/adapters/officeparser/adapter.ts +126 -0
- package/src/converters/canonicalize.ts +89 -0
- package/src/converters/errors.ts +218 -0
- package/src/converters/index.ts +51 -0
- package/src/converters/mime.ts +163 -0
- package/src/converters/native/markdown.ts +115 -0
- package/src/converters/native/plaintext.ts +56 -0
- package/src/converters/path.ts +48 -0
- package/src/converters/pipeline.ts +159 -0
- package/src/converters/registry.ts +74 -0
- package/src/converters/types.ts +123 -0
- package/src/converters/versions.ts +24 -0
- package/src/index.ts +27 -0
- package/src/ingestion/chunker.ts +238 -0
- package/src/ingestion/index.ts +32 -0
- package/src/ingestion/language.ts +276 -0
- package/src/ingestion/sync.ts +671 -0
- package/src/ingestion/types.ts +219 -0
- package/src/ingestion/walker.ts +235 -0
- package/src/llm/cache.ts +467 -0
- package/src/llm/errors.ts +191 -0
- package/src/llm/index.ts +58 -0
- package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
- package/src/llm/nodeLlamaCpp/generation.ts +88 -0
- package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
- package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
- package/src/llm/registry.ts +86 -0
- package/src/llm/types.ts +129 -0
- package/src/mcp/resources/index.ts +151 -0
- package/src/mcp/server.ts +229 -0
- package/src/mcp/tools/get.ts +220 -0
- package/src/mcp/tools/index.ts +160 -0
- package/src/mcp/tools/multi-get.ts +263 -0
- package/src/mcp/tools/query.ts +226 -0
- package/src/mcp/tools/search.ts +119 -0
- package/src/mcp/tools/status.ts +81 -0
- package/src/mcp/tools/vsearch.ts +198 -0
- package/src/pipeline/chunk-lookup.ts +44 -0
- package/src/pipeline/expansion.ts +256 -0
- package/src/pipeline/explain.ts +115 -0
- package/src/pipeline/fusion.ts +185 -0
- package/src/pipeline/hybrid.ts +535 -0
- package/src/pipeline/index.ts +64 -0
- package/src/pipeline/query-language.ts +118 -0
- package/src/pipeline/rerank.ts +223 -0
- package/src/pipeline/search.ts +261 -0
- package/src/pipeline/types.ts +328 -0
- package/src/pipeline/vsearch.ts +348 -0
- package/src/store/index.ts +41 -0
- package/src/store/migrations/001-initial.ts +196 -0
- package/src/store/migrations/index.ts +20 -0
- package/src/store/migrations/runner.ts +187 -0
- package/src/store/sqlite/adapter.ts +1242 -0
- package/src/store/sqlite/index.ts +7 -0
- package/src/store/sqlite/setup.ts +129 -0
- package/src/store/sqlite/types.ts +28 -0
- package/src/store/types.ts +506 -0
- package/src/store/vector/index.ts +13 -0
- package/src/store/vector/sqlite-vec.ts +373 -0
- package/src/store/vector/stats.ts +152 -0
- package/src/store/vector/types.ts +115 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-platform path utilities for deterministic behavior.
|
|
3
|
+
* Uses POSIX path operations to ensure identical results across platforms.
|
|
4
|
+
*
|
|
5
|
+
* CRITICAL: All converter path operations MUST use these utilities.
|
|
6
|
+
* Using node:path directly will produce different results on Windows vs POSIX.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// OK: no Bun path utils, must use node:path/posix for cross-platform determinism
|
|
10
|
+
import {
|
|
11
|
+
basename as posixBasename,
|
|
12
|
+
extname as posixExtname,
|
|
13
|
+
normalize as posixNormalize,
|
|
14
|
+
} from 'node:path/posix';
|
|
15
|
+
|
|
16
|
+
/**
 * Normalize a relative path to POSIX format.
 *
 * Converts Windows separators to `/`, resolves `.` / `..` segments, collapses
 * repeated slashes and strips a trailing slash, so `docs/` and `docs` produce
 * the same string. A bare root (`/`) is returned unchanged.
 *
 * @param p - Path using either `\` or `/` as separator.
 * @returns The normalized POSIX-style path (`.` for an empty input).
 */
export function normalizePath(p: string): string {
  // Convert Windows separators to POSIX
  const posixPath = p.replace(/\\/g, '/');

  // Resolve `..`, `.` and duplicate slashes. posix.normalize keeps a single
  // trailing slash when the input had one, so it is removed explicitly below.
  const normalized = posixNormalize(posixPath);

  // Length guard keeps the root path "/" intact.
  return normalized.length > 1 && normalized.endsWith('/')
    ? normalized.slice(0, -1)
    : normalized;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Return the last segment of a path.
 *
 * The input goes through normalizePath first, so Windows-style separators
 * are handled the same way as POSIX ones.
 *
 * @param p - Path to inspect.
 * @returns The final path segment.
 */
export function basename(p: string): string {
  const normalized = normalizePath(p);
  return posixBasename(normalized);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Return the extension of a path, lowercased.
 *
 * The input goes through normalizePath first, so Windows-style separators
 * are handled the same way as POSIX ones.
 *
 * @param p - Path to inspect.
 * @returns The extension including its leading dot, or `''` when there is none.
 */
export function extname(p: string): string {
  const rawExtension = posixExtname(normalizePath(p));
  return rawExtension.toLowerCase();
}
|
|
40
|
+
|
|
41
|
+
/**
 * Return the file name of a path with its extension removed
 * (used for title derivation).
 *
 * Only the last extension is dropped and the original casing is kept.
 *
 * @param p - Path to inspect.
 * @returns The base name minus its extension, or the full base name when it
 *   has no extension.
 */
export function basenameWithoutExt(p: string): string {
  const fileName = basename(p);
  const suffix = posixExtname(fileName);
  if (suffix.length === 0) {
    return fileName;
  }
  return fileName.slice(0, fileName.length - suffix.length);
}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Conversion pipeline - single entry point for all document conversions.
|
|
3
|
+
* PRD §8.4 - Canonical Markdown conventions
|
|
4
|
+
*
|
|
5
|
+
* The pipeline:
|
|
6
|
+
* 1. Delegates to the registry to find and invoke the appropriate converter
|
|
7
|
+
* 2. Enforces pre-canonicalization output size limit (early bailout)
|
|
8
|
+
* 3. Canonicalizes the raw markdown output (centralized, not per-converter)
|
|
9
|
+
* 4. Enforces post-canonicalization output size limit (zip bomb protection)
|
|
10
|
+
* 5. Computes mirrorHash from canonical markdown
|
|
11
|
+
* 6. Returns ConversionArtifact (not ConvertOutput)
|
|
12
|
+
*
|
|
13
|
+
* CRITICAL: Canonicalization is ONLY done here, not in individual converters.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { canonicalize, mirrorHash } from './canonicalize';
import { internalError, outputTooLargeError } from './errors';
import { type ConverterRegistry, createDefaultRegistry } from './registry';
import {
  type ConversionArtifact,
  type ConvertInput,
  DEFAULT_LIMITS,
  type PipelineResult,
} from './types';
|
|
20
|
+
|
|
21
|
+
/**
 * Runs a document through conversion, size checks, canonicalization and
 * hashing, and reports the outcome as a PipelineResult.
 */
export class ConversionPipeline {
  /** Registry in use; null until injected or lazily created. */
  private registry: ConverterRegistry | null = null;

  /** In-flight creation of the default registry, shared by concurrent callers. */
  private initPromise: Promise<ConverterRegistry> | null = null;

  /**
   * Create a pipeline.
   *
   * @param registry - Registry to use. When omitted, the default registry is
   *   created lazily on first use.
   */
  constructor(registry?: ConverterRegistry) {
    if (registry) {
      this.registry = registry;
    }
  }

  /**
   * Return the registry, creating the default one on first use.
   *
   * Concurrent callers share a single creation attempt. If that attempt
   * fails, the stored promise is cleared so a later call can retry.
   *
   * @throws Whatever createDefaultRegistry rejects with.
   */
  private async ensureRegistry(): Promise<ConverterRegistry> {
    if (this.registry) {
      return this.registry;
    }

    let pending = this.initPromise;
    if (!pending) {
      pending = createDefaultRegistry();
      this.initPromise = pending;
    }

    try {
      const registry = await pending;
      this.registry = registry;
      return registry;
    } catch (err) {
      // Allow a retry on the next call, but only clear the attempt that
      // actually failed: a newer attempt may already have been stored.
      if (this.initPromise === pending) {
        this.initPromise = null;
      }
      throw err;
    }
  }

  /**
   * Build the "output too large" result when `chars` exceeds the limit.
   *
   * @returns A failed PipelineResult, or null when the size is acceptable or
   *   the check is disabled (`maxChars` of 0 or less).
   */
  private checkOutputSize(
    input: ConvertInput,
    chars: number,
    maxChars: number,
    stage: 'raw' | 'canonical'
  ): PipelineResult | null {
    if (maxChars > 0 && chars > maxChars) {
      return {
        ok: false,
        error: outputTooLargeError(input, 'pipeline', {
          outputChars: chars,
          limitChars: maxChars,
          stage,
        }),
      };
    }
    return null;
  }

  /**
   * Convert a file through the pipeline.
   *
   * @param input - Source bytes plus metadata and limits.
   * @returns On success, a ConversionArtifact with canonical markdown and its
   *   mirrorHash. Converter errors are passed through unchanged; any thrown
   *   exception is caught and mapped to an INTERNAL error.
   */
  async convert(input: ConvertInput): Promise<PipelineResult> {
    try {
      // 0. Initialize registry
      const registry = await this.ensureRegistry();

      // 1. Delegate to registry (finds converter + invokes)
      const result = await registry.convert(input);
      if (!result.ok) {
        return result; // Pass through error
      }

      // An omitted limit falls back to the documented default instead of
      // silently disabling the check; an explicit 0 still disables it.
      const maxChars =
        input.limits.maxOutputChars ?? DEFAULT_LIMITS.maxOutputChars;
      const rawMarkdown = result.value.markdown;

      // 2. Pre-canonicalization size check (early bailout to avoid expensive
      //    canonicalization)
      const rawTooLarge = this.checkOutputSize(
        input,
        rawMarkdown.length,
        maxChars,
        'raw'
      );
      if (rawTooLarge) {
        return rawTooLarge;
      }

      // 3. Canonicalize the raw markdown output
      const canonical = canonicalize(rawMarkdown);

      // 4. Post-canonicalization size check (canonicalization may expand slightly)
      const canonicalTooLarge = this.checkOutputSize(
        input,
        canonical.length,
        maxChars,
        'canonical'
      );
      if (canonicalTooLarge) {
        return canonicalTooLarge;
      }

      // 5. Compute content-addressed hash
      const hash = mirrorHash(canonical);

      // 6. Return artifact with all pipeline-computed fields
      const artifact: ConversionArtifact = {
        markdown: canonical,
        mirrorHash: hash,
        title: result.value.title,
        languageHint: result.value.languageHint,
        meta: result.value.meta,
      };

      return { ok: true, value: artifact };
    } catch (cause) {
      // Catch any unhandled exceptions and map to INTERNAL error
      return {
        ok: false,
        error: internalError(
          input,
          'pipeline',
          cause instanceof Error ? cause.message : 'Unknown pipeline error',
          cause
        ),
      };
    }
  }

  /**
   * List the IDs of the converters known to the registry.
   *
   * @throws If the default registry cannot be created.
   */
  async listConverters(): Promise<string[]> {
    const registry = await this.ensureRegistry();
    return registry.listConverters();
  }
}
|
|
145
|
+
|
|
146
|
+
/** Module-level pipeline shared by simple callers; created on demand. */
let defaultPipeline: ConversionPipeline | null = null;

/**
 * Return the shared pipeline, constructing it on the first call.
 */
export function getDefaultPipeline(): ConversionPipeline {
  defaultPipeline ??= new ConversionPipeline();
  return defaultPipeline;
}

/**
 * Drop the shared pipeline so the next getDefaultPipeline() call builds a
 * fresh one (for testing).
 */
export function resetDefaultPipeline(): void {
  defaultPipeline = null;
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Converter registry for routing files to appropriate converters.
|
|
3
|
+
* PRD §8.6 - Converter registry
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { unsupportedError } from './errors';
|
|
7
|
+
import type { Converter, ConvertInput, ConvertResult } from './types';
|
|
8
|
+
|
|
9
|
+
/**
 * Ordered collection of converters; routes an input to the first converter
 * that reports it can handle the input's MIME type / extension.
 */
export class ConverterRegistry {
  /** Registered converters in registration (priority) order. */
  private readonly converters: Converter[] = [];

  /**
   * Append a converter. Earlier registrations take precedence during
   * selection, so registration order is the priority order.
   */
  register(converter: Converter): void {
    this.converters.push(converter);
  }

  /**
   * Find the first registered converter whose canHandle accepts the given
   * MIME type and extension. Both are lowercased before matching.
   *
   * @returns The matching converter, or undefined when none accepts.
   */
  select(mime: string, ext: string): Converter | undefined {
    const normalizedMime = mime.toLowerCase();
    const normalizedExt = ext.toLowerCase();
    for (const candidate of this.converters) {
      if (candidate.canHandle(normalizedMime, normalizedExt)) {
        return candidate;
      }
    }
    return undefined;
  }

  /**
   * Return the IDs of all registered converters, in registration order.
   */
  listConverters(): string[] {
    const ids: string[] = [];
    for (const converter of this.converters) {
      ids.push(converter.id);
    }
    return ids;
  }

  /**
   * Convert the input with the selected converter.
   *
   * @returns The converter's result, or a resolved failure built by
   *   unsupportedError when no converter accepts the input.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    const converter = this.select(input.mime, input.ext);
    if (converter) {
      return converter.convert(input);
    }
    return Promise.resolve({ ok: false, error: unsupportedError(input) });
  }
}
|
|
47
|
+
|
|
48
|
+
/**
 * Build a registry populated with all MVP converters.
 *
 * Registration order is the selection priority (PRD §8.6):
 * 1. native/markdown - handles .md
 * 2. native/plaintext - handles .txt
 * 3. adapter/markitdown-ts - handles .pdf, .docx, .xlsx
 * 4. adapter/officeparser - handles .pptx
 *
 * @returns A new registry with the four converters registered.
 */
export async function createDefaultRegistry(): Promise<ConverterRegistry> {
  // Converters are loaded with dynamic imports to avoid circular deps
  const { markdownConverter } = await import('./native/markdown');
  const { plaintextConverter } = await import('./native/plaintext');
  const { markitdownAdapter } = await import('./adapters/markitdownTs/adapter');
  const { officeparserAdapter } = await import(
    './adapters/officeparser/adapter'
  );

  // First match wins during selection, so this array is the priority list
  const inPriorityOrder = [
    markdownConverter,
    plaintextConverter,
    markitdownAdapter,
    officeparserAdapter,
  ];

  const registry = new ConverterRegistry();
  for (const converter of inPriorityOrder) {
    registry.register(converter);
  }
  return registry;
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Converter subsystem types.
|
|
3
|
+
* PRD §8.2 - Converter interfaces
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/** Identifier a converter reports as its `id`. */
export type ConverterId = string;
|
|
7
|
+
|
|
8
|
+
/** Everything a converter receives for one source file. */
export interface ConvertInput {
  /** Absolute path of the source file */
  sourcePath: string;
  /** Path of the file relative to its collection */
  relativePath: string;
  /** Name of the collection the file belongs to */
  collection: string;
  /** Raw file contents */
  bytes: Uint8Array;
  /** MIME type detected for the file */
  mime: string;
  /** File extension including the dot (e.g., ".pdf") */
  ext: string;
  /** Limits applied to this conversion */
  limits: {
    /** Maximum file size in bytes (DEFAULT_LIMITS: 100MB) */
    maxBytes: number;
    /** Conversion timeout in milliseconds (DEFAULT_LIMITS: 60000) */
    timeoutMs: number;
    /**
     * Maximum number of output characters after conversion (zip bomb
     * protection; DEFAULT_LIMITS: 50M). A value of 0 or less disables the
     * pipeline's size check.
     */
    maxOutputChars?: number;
  };
}
|
|
31
|
+
|
|
32
|
+
/** Non-fatal note attached to a conversion result. */
export interface ConvertWarning {
  /** Machine-readable warning category */
  code: 'LOSSY' | 'TRUNCATED' | 'PARTIAL' | 'UNSUPPORTED_FEATURE' | 'LOW_CONFIDENCE';
  /** Human-readable description */
  message: string;
  /** Optional structured context for the warning */
  details?: Record<string, unknown>;
}
|
|
42
|
+
|
|
43
|
+
/**
 * What an individual converter returns.
 * The markdown here is raw: canonicalization happens later in pipeline.ts,
 * not in the converter.
 */
export interface ConvertOutput {
  /** Raw (not yet canonical) markdown */
  markdown: string;
  /** Title extracted from the document or derived for it */
  title?: string;
  /** BCP-47 language hint, or "und" */
  languageHint?: string;
  /** Details about how the conversion was produced */
  meta: {
    /** ID of the converter that produced this output */
    converterId: ConverterId;
    /** Version of that converter */
    converterVersion: string;
    /** MIME type of the source */
    sourceMime: string;
    /** Warnings raised during conversion, if any */
    warnings?: ConvertWarning[];
  };
}
|
|
62
|
+
|
|
63
|
+
/** Machine-readable categories a conversion failure can be reported under. */
export type ConvertErrorCode =
  | 'UNSUPPORTED' | 'TOO_LARGE' | 'TIMEOUT' | 'CORRUPT'
  | 'PERMISSION' | 'IO' | 'ADAPTER_FAILURE' | 'INTERNAL';
|
|
72
|
+
|
|
73
|
+
/** Structured description of a failed conversion. */
export interface ConvertError {
  /** Error category */
  code: ConvertErrorCode;
  /** Human-readable description */
  message: string;
  /** NOTE(review): presumably whether retrying may succeed; confirm against the error factories */
  retryable: boolean;
  /** NOTE(review): presumably whether processing should stop; confirm against the error factories */
  fatal: boolean;
  /** ID of the converter (or pipeline stage) that reported the error */
  converterId: string;
  /** Path of the source file */
  sourcePath: string;
  /** MIME type of the source file */
  mime: string;
  /** Extension of the source file */
  ext: string;
  /** Underlying thrown value, when there is one */
  cause?: unknown;
  /** Optional structured context */
  details?: Record<string, unknown>;
}
|
|
85
|
+
|
|
86
|
+
/** Outcome of a single converter run: raw output on success, an error otherwise. */
export type ConvertResult =
  | { ok: true; value: ConvertOutput }
  | { ok: false; error: ConvertError };
|
|
89
|
+
|
|
90
|
+
/** Contract every converter implementation fulfils. */
export interface Converter {
  /** Identifier of this converter */
  readonly id: ConverterId;
  /** Version string of this converter */
  readonly version: string;
  /** Report whether this converter accepts the given MIME type / extension */
  canHandle(mime: string, ext: string): boolean;
  /** Convert the input, resolving to raw output or a structured error */
  convert(input: ConvertInput): Promise<ConvertResult>;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Result handed to consumers of the conversion pipeline, produced after
 * canonicalization and hash computation.
 */
export interface ConversionArtifact {
  /** Markdown after the pipeline's canonicalization */
  markdown: string;
  /** SHA-256 hex of the canonical markdown; the content-addressed key */
  mirrorHash: string;
  /** Title from conversion (or derived from filename) */
  title?: string;
  /** Language hint reported by the conversion */
  languageHint?: string;
  /** Conversion metadata, same shape as ConvertOutput's `meta` */
  meta: ConvertOutput['meta'];
}
|
|
113
|
+
|
|
114
|
+
/** Outcome of a pipeline run: an artifact on success, an error otherwise. */
export type PipelineResult =
  | { ok: true; value: ConversionArtifact }
  | { ok: false; error: ConvertError };
|
|
117
|
+
|
|
118
|
+
/**
 * Default conversion limits.
 * - maxBytes: 100MB of input
 * - timeoutMs: 60 seconds
 * - maxOutputChars: 50M characters of output (zip bomb protection)
 */
export const DEFAULT_LIMITS = {
  maxBytes: 100 * 1024 * 1024,
  timeoutMs: 60_000,
  maxOutputChars: 50_000_000,
} as const;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized converter version tracking.
|
|
3
|
+
*
|
|
4
|
+
* Native converters use our own versioning.
|
|
5
|
+
* Adapter versions MUST match the wrapped npm package version.
|
|
6
|
+
*
|
|
7
|
+
* When updating npm dependencies, update these versions too.
|
|
8
|
+
* Run `bun pm ls markitdown-ts officeparser` to check current versions.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/** Versions of the built-in (native) converters; maintained by this project. */
export const NATIVE_VERSIONS = {
  markdown: '1.0.0',
  plaintext: '1.0.0',
} as const;
|
|
16
|
+
|
|
17
|
+
/**
 * Versions of the third-party packages wrapped by the adapters.
 * Each entry MUST match the installed npm package version; update them
 * whenever `bun update` changes those dependencies.
 */
export const ADAPTER_VERSIONS = {
  'markitdown-ts': '0.0.8',
  officeparser: '5.2.0',
} as const;
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* GNO CLI entry point.
|
|
4
|
+
* Thin bootstrap that delegates to CLI runner.
|
|
5
|
+
*
|
|
6
|
+
* @module src/index
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { runCli } from './cli/run';
|
|
10
|
+
|
|
11
|
+
/**
 * SIGINT (Ctrl+C) handler: print a short notice on stderr and exit with
 * status 130.
 */
function handleInterrupt(): void {
  process.stderr.write('\nInterrupted\n');
  process.exit(130);
}

process.on('SIGINT', handleInterrupt);
|
|
16
|
+
|
|
17
|
+
// Start the CLI; the process exits with the code it resolves to.
const pendingExitCode = runCli(process.argv);

void (async () => {
  try {
    process.exit(await pendingExitCode);
  } catch (err) {
    // Anything that escapes the CLI runner is reported and mapped to exit code 1.
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`Fatal error: ${reason}\n`);
    process.exit(1);
  }
})();
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown chunker implementation.
|
|
3
|
+
* Char-based chunking with line tracking.
|
|
4
|
+
*
|
|
5
|
+
* @module src/ingestion/chunker
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { defaultLanguageDetector } from './language';
|
|
9
|
+
import type { ChunkerPort, ChunkOutput, ChunkParams } from './types';
|
|
10
|
+
import { DEFAULT_CHUNK_PARAMS } from './types';
|
|
11
|
+
|
|
12
|
+
/** Approximate characters per token (conservative); converts a token budget into a character budget. */
const CHARS_PER_TOKEN = 4;

/** Smallest maxTokens accepted; lower requests are raised to this value. */
const MIN_MAX_TOKENS = 10;

/** Largest overlap fraction accepted; higher requests are lowered to this value. */
const MAX_OVERLAP_PERCENT = 0.5;

/**
 * Sentence boundary: `.`, `!` or `?`, then whitespace (captured), then an
 * ASCII capital letter. The regex is global, so it carries `lastIndex`
 * state between exec() calls and must be reset before a fresh scan.
 */
const SENTENCE_END_REGEX = /[.!?](\s+)[A-Z]/g;
|
|
23
|
+
|
|
24
|
+
/**
 * Precomputed newline offsets for a text. Lets a character position be
 * mapped to a line number with a binary search instead of rescanning the
 * text for every lookup.
 */
interface LineIndex {
  /** Offsets of every '\n' character in the text */
  newlines: number[];
}
|
|
32
|
+
|
|
33
|
+
/**
 * Collect the offsets of all newline characters in `text`.
 * One pass over the text; offsets come out in ascending order.
 *
 * @param text - Text to index.
 * @returns A LineIndex holding the offset of every '\n'.
 */
function buildLineIndex(text: string): LineIndex {
  const offsets: number[] = [];
  let at = text.indexOf('\n');
  while (at !== -1) {
    offsets.push(at);
    at = text.indexOf('\n', at + 1);
  }
  return { newlines: offsets };
}
|
|
45
|
+
|
|
46
|
+
/**
 * Map a character position to its 1-based line number.
 *
 * Binary-searches the newline offsets for how many newlines lie strictly
 * before `pos`; a newline character itself counts as part of the line it ends.
 *
 * @param index - Newline offsets of the text, in ascending order.
 * @param pos - Character offset into the text.
 * @returns 1-based line number containing `pos`.
 */
function lineAtPosition(index: LineIndex, pos: number): number {
  const offsets = index.newlines;
  let lo = 0;
  let hi = offsets.length;

  // Invariant: offsets left of `lo` are < pos; offsets from `hi` on are not.
  while (lo < hi) {
    const mid = lo + Math.floor((hi - lo) / 2);
    const offset = offsets[mid];
    const isBeforePos = offset !== undefined && offset < pos;
    if (isBeforePos) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  // `lo` newlines precede pos, so pos sits on line lo + 1.
  return lo + 1;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Normalize chunk parameters to valid ranges.
 *
 * - maxTokens: floored to an integer and raised to at least MIN_MAX_TOKENS.
 * - overlapPercent: clamped to [0, MAX_OVERLAP_PERCENT].
 * - Missing or NaN values fall back to DEFAULT_CHUNK_PARAMS. NaN would
 *   otherwise pass straight through Math.max/Math.min/Math.floor and make
 *   the chunk loop stop early, silently dropping content.
 *
 * @param params - Caller-supplied parameters; may be omitted or partial.
 * @returns Both parameters, guaranteed to be in range.
 */
function normalizeChunkParams(params?: ChunkParams): Required<ChunkParams> {
  const requestedTokens = params?.maxTokens ?? DEFAULT_CHUNK_PARAMS.maxTokens;
  const requestedOverlap =
    params?.overlapPercent ?? DEFAULT_CHUNK_PARAMS.overlapPercent;

  // Math.max / Math.min return NaN if any argument is NaN, so the clamps
  // below cannot repair it; substitute the defaults first.
  const safeTokens = Number.isNaN(requestedTokens)
    ? DEFAULT_CHUNK_PARAMS.maxTokens
    : requestedTokens;
  const safeOverlap = Number.isNaN(requestedOverlap)
    ? DEFAULT_CHUNK_PARAMS.overlapPercent
    : requestedOverlap;

  const maxTokens = Math.max(MIN_MAX_TOKENS, Math.floor(safeTokens));
  const overlapPercent = Math.max(
    0,
    Math.min(MAX_OVERLAP_PERCENT, safeOverlap)
  );
  return { maxTokens, overlapPercent };
}
|
|
88
|
+
|
|
89
|
+
/**
 * Pick a break position close to `target`.
 *
 * Only the slice [target - windowSize, target + windowSize) (clipped to the
 * text) is searched. The last occurrence of the first kind of boundary
 * found wins, in this order of preference:
 *   1. paragraph break (blank line)
 *   2. sentence end followed by whitespace and a capital letter
 *   3. single newline
 *   4. space
 * If the window has none of these, `target` itself is returned.
 *
 * @param text - Full text being chunked.
 * @param target - Desired break offset.
 * @param windowSize - How far on either side of `target` to look.
 * @returns Offset at which the next chunk should start.
 */
function findBreakPoint(
  text: string,
  target: number,
  windowSize: number
): number {
  const windowStart = Math.max(0, target - windowSize);
  const windowEnd = Math.min(text.length, target + windowSize);
  const candidate = text.slice(windowStart, windowEnd);

  // 1. Paragraph break: cut right after the blank line
  const paragraphAt = candidate.lastIndexOf('\n\n');
  if (paragraphAt >= 0) {
    return windowStart + paragraphAt + 2;
  }

  // 2. Sentence end: keep the last qualifying match in the window.
  //    The regex is global, so rewind it before scanning.
  SENTENCE_END_REGEX.lastIndex = 0;
  let sentenceBreak: number | null = null;
  for (
    let found = SENTENCE_END_REGEX.exec(candidate);
    found !== null;
    found = SENTENCE_END_REGEX.exec(candidate)
  ) {
    // Cut after the punctuation and whitespace, before the capital letter
    const gap = found[1] ?? '';
    const breakAt = windowStart + found.index + 1 + gap.length;
    if (breakAt <= target + windowSize) {
      sentenceBreak = breakAt;
    }
  }
  if (sentenceBreak !== null) {
    return sentenceBreak;
  }

  // 3. Single newline: cut right after it
  const newlineAt = candidate.lastIndexOf('\n');
  if (newlineAt >= 0) {
    return windowStart + newlineAt + 1;
  }

  // 4. Word boundary: cut right after the space
  const spaceAt = candidate.lastIndexOf(' ');
  if (spaceAt >= 0) {
    return windowStart + spaceAt + 1;
  }

  // Nothing suitable in the window
  return target;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Markdown chunker implementation.
 * Splits markdown into overlapping character-based chunks, cutting at
 * semantic break points where possible.
 *
 * Chunk text is an exact slice of the input (never trimmed), so pos/line
 * mappings stay accurate and whitespace-sensitive Markdown such as indented
 * code blocks is preserved.
 */
export class MarkdownChunker implements ChunkerPort {
  /**
   * Split `markdown` into chunks.
   *
   * @param markdown - Text to split.
   * @param params - Optional size/overlap settings; normalized before use.
   * @param documentLanguageHint - When given, used as every chunk's language
   *   instead of running per-chunk detection.
   * @returns Chunks in document order; empty when the input is empty or
   *   whitespace-only.
   */
  chunk(
    markdown: string,
    params?: ChunkParams,
    documentLanguageHint?: string
  ): ChunkOutput[] {
    if (!markdown || markdown.trim().length === 0) {
      return [];
    }

    // Normalized params keep the loop below out of degenerate territory
    const { maxTokens, overlapPercent } = normalizeChunkParams(params);
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = Math.floor(maxChars * overlapPercent);
    const windowSize = Math.floor(maxChars * 0.1); // search 10% around the target

    // Built once; each chunk then needs only two binary-search lookups
    const lineIndex = buildLineIndex(markdown);
    const total = markdown.length;

    const chunks: ChunkOutput[] = [];
    let pos = 0;

    while (pos < total) {
      const targetEnd = pos + maxChars;

      // The final chunk takes the remainder; otherwise look for a good cut
      const endPos =
        targetEnd >= total
          ? total
          : findBreakPoint(markdown, targetEnd, windowSize);

      // Exact slice, deliberately not trimmed
      const text = markdown.slice(pos, endPos);

      // Whitespace-only slices are dropped rather than emitted
      if (text.trim().length > 0) {
        const startLine = lineAtPosition(lineIndex, pos);
        const endLine = lineAtPosition(lineIndex, endPos - 1);
        const language =
          documentLanguageHint ?? defaultLanguageDetector.detect(text);

        chunks.push({
          // Sequence numbers count emitted chunks only
          seq: chunks.length,
          pos,
          text,
          startLine,
          endLine,
          language,
          tokenCount: null, // Char-based, no exact token count
        });
      }

      if (endPos >= total) {
        break;
      }

      // Step back by the overlap, but always move forward by at least one char
      pos = Math.max(pos + 1, endPos - overlapChars);
    }

    return chunks;
  }
}
|
|
234
|
+
|
|
235
|
+
/**
 * Ready-made MarkdownChunker instance for callers that do not need to
 * construct their own.
 */
export const defaultChunker = new MarkdownChunker();
|