@gmickel/gno 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +256 -0
  2. package/assets/skill/SKILL.md +112 -0
  3. package/assets/skill/cli-reference.md +327 -0
  4. package/assets/skill/examples.md +234 -0
  5. package/assets/skill/mcp-reference.md +159 -0
  6. package/package.json +90 -0
  7. package/src/app/constants.ts +313 -0
  8. package/src/cli/colors.ts +65 -0
  9. package/src/cli/commands/ask.ts +545 -0
  10. package/src/cli/commands/cleanup.ts +105 -0
  11. package/src/cli/commands/collection/add.ts +120 -0
  12. package/src/cli/commands/collection/index.ts +10 -0
  13. package/src/cli/commands/collection/list.ts +108 -0
  14. package/src/cli/commands/collection/remove.ts +64 -0
  15. package/src/cli/commands/collection/rename.ts +95 -0
  16. package/src/cli/commands/context/add.ts +67 -0
  17. package/src/cli/commands/context/check.ts +153 -0
  18. package/src/cli/commands/context/index.ts +10 -0
  19. package/src/cli/commands/context/list.ts +109 -0
  20. package/src/cli/commands/context/rm.ts +52 -0
  21. package/src/cli/commands/doctor.ts +393 -0
  22. package/src/cli/commands/embed.ts +462 -0
  23. package/src/cli/commands/get.ts +356 -0
  24. package/src/cli/commands/index-cmd.ts +119 -0
  25. package/src/cli/commands/index.ts +102 -0
  26. package/src/cli/commands/init.ts +328 -0
  27. package/src/cli/commands/ls.ts +217 -0
  28. package/src/cli/commands/mcp/config.ts +300 -0
  29. package/src/cli/commands/mcp/index.ts +24 -0
  30. package/src/cli/commands/mcp/install.ts +203 -0
  31. package/src/cli/commands/mcp/paths.ts +470 -0
  32. package/src/cli/commands/mcp/status.ts +222 -0
  33. package/src/cli/commands/mcp/uninstall.ts +158 -0
  34. package/src/cli/commands/mcp.ts +20 -0
  35. package/src/cli/commands/models/clear.ts +103 -0
  36. package/src/cli/commands/models/index.ts +32 -0
  37. package/src/cli/commands/models/list.ts +214 -0
  38. package/src/cli/commands/models/path.ts +51 -0
  39. package/src/cli/commands/models/pull.ts +199 -0
  40. package/src/cli/commands/models/use.ts +85 -0
  41. package/src/cli/commands/multi-get.ts +400 -0
  42. package/src/cli/commands/query.ts +220 -0
  43. package/src/cli/commands/ref-parser.ts +108 -0
  44. package/src/cli/commands/reset.ts +191 -0
  45. package/src/cli/commands/search.ts +136 -0
  46. package/src/cli/commands/shared.ts +156 -0
  47. package/src/cli/commands/skill/index.ts +19 -0
  48. package/src/cli/commands/skill/install.ts +197 -0
  49. package/src/cli/commands/skill/paths-cmd.ts +81 -0
  50. package/src/cli/commands/skill/paths.ts +191 -0
  51. package/src/cli/commands/skill/show.ts +73 -0
  52. package/src/cli/commands/skill/uninstall.ts +141 -0
  53. package/src/cli/commands/status.ts +205 -0
  54. package/src/cli/commands/update.ts +68 -0
  55. package/src/cli/commands/vsearch.ts +188 -0
  56. package/src/cli/context.ts +64 -0
  57. package/src/cli/errors.ts +64 -0
  58. package/src/cli/format/search-results.ts +211 -0
  59. package/src/cli/options.ts +183 -0
  60. package/src/cli/program.ts +1330 -0
  61. package/src/cli/run.ts +213 -0
  62. package/src/cli/ui.ts +92 -0
  63. package/src/config/defaults.ts +20 -0
  64. package/src/config/index.ts +55 -0
  65. package/src/config/loader.ts +161 -0
  66. package/src/config/paths.ts +87 -0
  67. package/src/config/saver.ts +153 -0
  68. package/src/config/types.ts +280 -0
  69. package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
  70. package/src/converters/adapters/officeparser/adapter.ts +126 -0
  71. package/src/converters/canonicalize.ts +89 -0
  72. package/src/converters/errors.ts +218 -0
  73. package/src/converters/index.ts +51 -0
  74. package/src/converters/mime.ts +163 -0
  75. package/src/converters/native/markdown.ts +115 -0
  76. package/src/converters/native/plaintext.ts +56 -0
  77. package/src/converters/path.ts +48 -0
  78. package/src/converters/pipeline.ts +159 -0
  79. package/src/converters/registry.ts +74 -0
  80. package/src/converters/types.ts +123 -0
  81. package/src/converters/versions.ts +24 -0
  82. package/src/index.ts +27 -0
  83. package/src/ingestion/chunker.ts +238 -0
  84. package/src/ingestion/index.ts +32 -0
  85. package/src/ingestion/language.ts +276 -0
  86. package/src/ingestion/sync.ts +671 -0
  87. package/src/ingestion/types.ts +219 -0
  88. package/src/ingestion/walker.ts +235 -0
  89. package/src/llm/cache.ts +467 -0
  90. package/src/llm/errors.ts +191 -0
  91. package/src/llm/index.ts +58 -0
  92. package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
  93. package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
  94. package/src/llm/nodeLlamaCpp/generation.ts +88 -0
  95. package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
  96. package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
  97. package/src/llm/registry.ts +86 -0
  98. package/src/llm/types.ts +129 -0
  99. package/src/mcp/resources/index.ts +151 -0
  100. package/src/mcp/server.ts +229 -0
  101. package/src/mcp/tools/get.ts +220 -0
  102. package/src/mcp/tools/index.ts +160 -0
  103. package/src/mcp/tools/multi-get.ts +263 -0
  104. package/src/mcp/tools/query.ts +226 -0
  105. package/src/mcp/tools/search.ts +119 -0
  106. package/src/mcp/tools/status.ts +81 -0
  107. package/src/mcp/tools/vsearch.ts +198 -0
  108. package/src/pipeline/chunk-lookup.ts +44 -0
  109. package/src/pipeline/expansion.ts +256 -0
  110. package/src/pipeline/explain.ts +115 -0
  111. package/src/pipeline/fusion.ts +185 -0
  112. package/src/pipeline/hybrid.ts +535 -0
  113. package/src/pipeline/index.ts +64 -0
  114. package/src/pipeline/query-language.ts +118 -0
  115. package/src/pipeline/rerank.ts +223 -0
  116. package/src/pipeline/search.ts +261 -0
  117. package/src/pipeline/types.ts +328 -0
  118. package/src/pipeline/vsearch.ts +348 -0
  119. package/src/store/index.ts +41 -0
  120. package/src/store/migrations/001-initial.ts +196 -0
  121. package/src/store/migrations/index.ts +20 -0
  122. package/src/store/migrations/runner.ts +187 -0
  123. package/src/store/sqlite/adapter.ts +1242 -0
  124. package/src/store/sqlite/index.ts +7 -0
  125. package/src/store/sqlite/setup.ts +129 -0
  126. package/src/store/sqlite/types.ts +28 -0
  127. package/src/store/types.ts +506 -0
  128. package/src/store/vector/index.ts +13 -0
  129. package/src/store/vector/sqlite-vec.ts +373 -0
  130. package/src/store/vector/stats.ts +152 -0
  131. package/src/store/vector/types.ts +115 -0
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Cross-platform path utilities for deterministic behavior.
3
+ * Uses POSIX path operations to ensure identical results across platforms.
4
+ *
5
+ * CRITICAL: All converter path operations MUST use these utilities.
6
+ * Using node:path directly will produce different results on Windows vs POSIX.
7
+ */
8
+
9
+ // OK: no Bun path utils, must use node:path/posix for cross-platform determinism
10
+ import {
11
+ basename as posixBasename,
12
+ extname as posixExtname,
13
+ normalize as posixNormalize,
14
+ } from 'node:path/posix';
15
+
/**
 * Normalize a relative path to POSIX format.
 *
 * Converts Windows separators to `/`, collapses `.`/`..` segments and repeated
 * slashes, and strips a trailing slash.
 *
 * @param p - Path using either `\` or `/` as separator.
 * @returns The normalized POSIX path. An empty input yields `"."` and the
 *          root path `"/"` is returned unchanged.
 */
export function normalizePath(p: string): string {
  // Convert Windows separators to POSIX
  const posixPath = p.replace(/\\/g, '/');

  // Normalize (resolve .., ., double slashes)
  const normalized = posixNormalize(posixPath);

  // posix.normalize() preserves a trailing separator ("a/b/" stays "a/b/"),
  // so remove it here; length > 1 keeps the root "/" intact.
  if (normalized.length > 1 && normalized.endsWith('/')) {
    return normalized.slice(0, -1);
  }
  return normalized;
}
26
+
/**
 * Return the last segment of a path.
 * The path is first run through normalizePath, so Windows-style separators
 * are handled the same way on every platform.
 */
export function basename(p: string): string {
  const normalized = normalizePath(p);
  return posixBasename(normalized);
}
33
+
/**
 * Return the extension of a path, including the leading dot, lowercased.
 * The path is first run through normalizePath so the result does not depend
 * on the host platform's separator.
 */
export function extname(p: string): string {
  const normalized = normalizePath(p);
  const extension = posixExtname(normalized);
  return extension.toLowerCase();
}
40
+
/**
 * Return the filename with its extension removed (used for title derivation).
 * The original casing of the name is kept; names without an extension are
 * returned unchanged.
 */
export function basenameWithoutExt(p: string): string {
  const fileName = basename(p);
  const suffix = posixExtname(fileName);
  if (suffix.length === 0) {
    return fileName;
  }
  return fileName.slice(0, fileName.length - suffix.length);
}
@@ -0,0 +1,159 @@
1
+ /**
2
+ * Conversion pipeline - single entry point for all document conversions.
3
+ * PRD §8.4 - Canonical Markdown conventions
4
+ *
5
+ * The pipeline:
6
+ * 1. Delegates to the registry to find and invoke the appropriate converter
7
+ * 2. Enforces pre-canonicalization output size limit (early bailout)
8
+ * 3. Canonicalizes the raw markdown output (centralized, not per-converter)
9
+ * 4. Enforces post-canonicalization output size limit (zip bomb protection)
10
+ * 5. Computes mirrorHash from canonical markdown
11
+ * 6. Returns ConversionArtifact (not ConvertOutput)
12
+ *
13
+ * CRITICAL: Canonicalization is ONLY done here, not in individual converters.
14
+ */
15
+
16
+ import { canonicalize, mirrorHash } from './canonicalize';
17
+ import { internalError, outputTooLargeError } from './errors';
18
+ import { type ConverterRegistry, createDefaultRegistry } from './registry';
19
+ import type { ConversionArtifact, ConvertInput, PipelineResult } from './types';
20
+
/**
 * Single entry point for document conversion.
 *
 * Wraps a ConverterRegistry and adds the pipeline-level steps: output size
 * limits, canonicalization and mirrorHash computation.
 */
export class ConversionPipeline {
  private registry: ConverterRegistry | null = null;
  /** In-flight default-registry initialization, shared by concurrent callers. */
  private initPromise: Promise<ConverterRegistry> | null = null;

  /**
   * Create a pipeline.
   *
   * @param registry - Registry to use. When omitted, the default registry is
   *                   created lazily on first use.
   */
  constructor(registry?: ConverterRegistry) {
    if (registry) {
      this.registry = registry;
    }
  }

  /**
   * Ensure registry is initialized.
   * Concurrent callers share one initialization; a failed initialization is
   * forgotten so the next call can retry.
   *
   * @returns The registry passed to the constructor or the default registry.
   * @throws Whatever createDefaultRegistry() rejects with.
   */
  private async ensureRegistry(): Promise<ConverterRegistry> {
    if (this.registry) {
      return this.registry;
    }

    let pending = this.initPromise;
    if (!pending) {
      pending = createDefaultRegistry();
      this.initPromise = pending;
    }

    try {
      const registry = await pending;
      this.registry = registry;
      return registry;
    } catch (err) {
      // Reset to allow retry on next call, but only if no newer attempt has
      // replaced the promise that just failed.
      if (this.initPromise === pending) {
        this.initPromise = null;
      }
      throw err;
    }
  }

  /**
   * Convert a file through the pipeline.
   * Returns ConversionArtifact with canonical markdown and mirrorHash.
   *
   * Converter errors are passed through unchanged. Output larger than
   * `input.limits.maxOutputChars` is rejected both before and after
   * canonicalization; when that limit is omitted (or not positive) the size
   * checks are skipped.
   *
   * All exceptions are caught and mapped to INTERNAL errors, so this method
   * does not reject.
   */
  async convert(input: ConvertInput): Promise<PipelineResult> {
    try {
      // 0. Initialize registry
      const registry = await this.ensureRegistry();

      // 1. Delegate to registry (finds converter + invokes)
      const result = await registry.convert(input);

      if (!result.ok) {
        return result; // Pass through error
      }

      // 0 means "no limit"
      const maxChars = input.limits.maxOutputChars ?? 0;
      const rawMarkdown = result.value.markdown;

      // 2. Pre-canonicalization size check (early bailout to avoid expensive canonicalization)
      if (maxChars > 0 && rawMarkdown.length > maxChars) {
        return {
          ok: false,
          error: outputTooLargeError(input, 'pipeline', {
            outputChars: rawMarkdown.length,
            limitChars: maxChars,
            stage: 'raw',
          }),
        };
      }

      // 3. Canonicalize the raw markdown output
      const canonical = canonicalize(rawMarkdown);

      // 4. Post-canonicalization size check (canonicalization may expand slightly)
      if (maxChars > 0 && canonical.length > maxChars) {
        return {
          ok: false,
          error: outputTooLargeError(input, 'pipeline', {
            outputChars: canonical.length,
            limitChars: maxChars,
            stage: 'canonical',
          }),
        };
      }

      // 5. Compute content-addressed hash
      const hash = mirrorHash(canonical);

      // 6. Return artifact with all pipeline-computed fields
      const artifact: ConversionArtifact = {
        markdown: canonical,
        mirrorHash: hash,
        title: result.value.title,
        languageHint: result.value.languageHint,
        meta: result.value.meta,
      };

      return { ok: true, value: artifact };
    } catch (cause) {
      // Catch any unhandled exceptions and map to INTERNAL error
      return {
        ok: false,
        error: internalError(
          input,
          'pipeline',
          cause instanceof Error ? cause.message : 'Unknown pipeline error',
          cause
        ),
      };
    }
  }

  /**
   * List available converters.
   *
   * @returns Converter IDs as reported by the registry.
   */
  async listConverters(): Promise<string[]> {
    const registry = await this.ensureRegistry();
    return registry.listConverters();
  }
}
145
+
/** Module-level pipeline instance shared by getDefaultPipeline(); null until first requested. */
let defaultPipeline: ConversionPipeline | null = null;

/**
 * Return the shared pipeline, creating it on the first call.
 * The instance is constructed without a registry argument.
 */
export function getDefaultPipeline(): ConversionPipeline {
  if (defaultPipeline === null) {
    defaultPipeline = new ConversionPipeline();
  }
  return defaultPipeline;
}
155
+
/**
 * Drop the shared pipeline instance (intended for tests).
 * The next getDefaultPipeline() call builds a fresh one.
 */
export function resetDefaultPipeline(): void {
  defaultPipeline = null;
}
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Converter registry for routing files to appropriate converters.
3
+ * PRD §8.6 - Converter registry
4
+ */
5
+
6
+ import { unsupportedError } from './errors';
7
+ import type { Converter, ConvertInput, ConvertResult } from './types';
8
+
/**
 * Ordered collection of converters.
 * Lookup walks the converters in registration order and stops at the first
 * one that accepts the file.
 */
export class ConverterRegistry {
  private readonly converters: Converter[] = [];

  /**
   * Append a converter. Earlier registrations take priority over later ones.
   */
  register(converter: Converter): void {
    this.converters.push(converter);
  }

  /**
   * Find the first registered converter whose canHandle() accepts the given
   * MIME type and extension. Both are lowercased before being offered.
   *
   * @returns The matching converter, or undefined when none accepts.
   */
  select(mime: string, ext: string): Converter | undefined {
    const normalizedMime = mime.toLowerCase();
    const normalizedExt = ext.toLowerCase();
    for (const candidate of this.converters) {
      if (candidate.canHandle(normalizedMime, normalizedExt)) {
        return candidate;
      }
    }
    return undefined;
  }

  /**
   * IDs of all registered converters, in registration order.
   */
  listConverters(): string[] {
    const ids: string[] = [];
    for (const { id } of this.converters) {
      ids.push(id);
    }
    return ids;
  }

  /**
   * Run the converter selected for the input's MIME type and extension.
   * Resolves to an unsupported-error result when no converter matches.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    const converter = this.select(input.mime, input.ext);
    if (converter) {
      return converter.convert(input);
    }
    return Promise.resolve({ ok: false, error: unsupportedError(input) });
  }
}
47
+
/**
 * Build the registry containing every MVP converter.
 *
 * Registration order is the priority order (PRD §8.6), first match wins:
 * 1. native/markdown - handles .md
 * 2. native/plaintext - handles .txt
 * 3. adapter/markitdown-ts - handles .pdf, .docx, .xlsx
 * 4. adapter/officeparser - handles .pptx
 */
export async function createDefaultRegistry(): Promise<ConverterRegistry> {
  // Converter modules are loaded dynamically, one after another, to avoid
  // circular deps.
  const markdownModule = await import('./native/markdown');
  const plaintextModule = await import('./native/plaintext');
  const markitdownModule = await import('./adapters/markitdownTs/adapter');
  const officeparserModule = await import('./adapters/officeparser/adapter');

  // Highest priority first
  const prioritized = [
    markdownModule.markdownConverter,
    plaintextModule.plaintextConverter,
    markitdownModule.markitdownAdapter,
    officeparserModule.officeparserAdapter,
  ];

  const registry = new ConverterRegistry();
  for (const converter of prioritized) {
    registry.register(converter);
  }
  return registry;
}
@@ -0,0 +1,123 @@
1
+ /**
2
+ * Converter subsystem types.
3
+ * PRD §8.2 - Converter interfaces
4
+ */
5
+
6
/** Identifier of a converter; a plain string alias (the default registry's docs use ids such as "native/markdown"). */
+ export type ConverterId = string;
7
+
8
/** Everything handed to a converter for one file: its location, raw bytes, detected type and limits. */
+ export interface ConvertInput {
9
+ /** Absolute path to source file */
10
+ sourcePath: string;
11
+ /** Relative path within collection */
12
+ relativePath: string;
13
+ /** Collection name */
14
+ collection: string;
15
+ /** File contents */
16
+ bytes: Uint8Array;
17
+ /** Detected MIME type */
18
+ mime: string;
19
+ /** File extension (e.g., ".pdf") */
20
+ ext: string;
21
+ /** Conversion limits */
22
+ limits: {
23
+ /** Max file size in bytes (default: 100MB) */
24
+ maxBytes: number;
25
+ /** Conversion timeout in ms (default: 60000) */
26
+ timeoutMs: number;
27
+ /** Max output chars after conversion (zip bomb protection; DEFAULT_LIMITS sets 50M). When omitted, ConversionPipeline.convert treats it as 0 and skips its size checks. */
28
+ maxOutputChars?: number;
29
+ };
30
+ }
31
+
32
/** A warning attached to an otherwise successful conversion (see ConvertOutput.meta.warnings). */
+ export interface ConvertWarning {
33
/** Machine-readable warning category. */
+ code:
34
+ | 'LOSSY'
35
+ | 'TRUNCATED'
36
+ | 'PARTIAL'
37
+ | 'UNSUPPORTED_FEATURE'
38
+ | 'LOW_CONFIDENCE';
39
/** Human-readable description of the warning. */
+ message: string;
40
/** Optional free-form data about the warning. */
+ details?: Record<string, unknown>;
41
+ }
42
+
43
+ /**
44
+ * Raw output from individual converters.
45
+ * Note: markdown is NOT canonical - the conversion pipeline canonicalizes it and computes the hash.
46
+ */
47
+ export interface ConvertOutput {
48
+ /** Raw markdown (pipeline canonicalizes) */
49
+ markdown: string;
50
+ /** Extracted or derived title */
51
+ title?: string;
52
+ /** BCP-47 language hint or "und" */
53
+ languageHint?: string;
54
+ /** Conversion metadata; the pipeline copies this object unchanged into ConversionArtifact.meta */
55
+ meta: {
56
+ converterId: ConverterId;
57
+ converterVersion: string;
58
+ sourceMime: string;
59
+ warnings?: ConvertWarning[];
60
+ };
61
+ }
62
+
63
/** Closed set of failure categories carried in ConvertError.code. */
+ export type ConvertErrorCode =
64
+ | 'UNSUPPORTED'
65
+ | 'TOO_LARGE'
66
+ | 'TIMEOUT'
67
+ | 'CORRUPT'
68
+ | 'PERMISSION'
69
+ | 'IO'
70
+ | 'ADAPTER_FAILURE'
71
+ | 'INTERNAL';
72
+
73
/** Structured failure returned (not thrown) by converters and the pipeline in the `ok: false` result branch. */
+ export interface ConvertError {
74
+ code: ConvertErrorCode;
75
+ message: string;
76
// NOTE(review): presumably signals that retrying the same input may succeed - confirm against the error factories.
+ retryable: boolean;
77
// NOTE(review): exact meaning of "fatal" is not visible here - confirm against the error factories.
+ fatal: boolean;
78
// NOTE(review): the pipeline passes 'pipeline' to its error factories, presumably ending up here - confirm.
+ converterId: string;
79
+ sourcePath: string;
80
+ mime: string;
81
+ ext: string;
82
/** Underlying thrown value, if any. */
+ cause?: unknown;
83
/** Optional structured context for the error. */
+ details?: Record<string, unknown>;
84
+ }
85
+
86
/** Result of a single converter call: discriminated on `ok`, carrying either the raw output or an error. */
+ export type ConvertResult =
87
+ | { ok: true; value: ConvertOutput }
88
+ | { ok: false; error: ConvertError };
89
+
90
/** Contract every converter implements so it can be registered in a ConverterRegistry. */
+ export interface Converter {
91
+ readonly id: ConverterId;
92
+ readonly version: string;
93
/** Whether this converter accepts the file type; ConverterRegistry.select passes both arguments lowercased. */
+ canHandle(mime: string, ext: string): boolean;
94
/** Convert the input, reporting failure through the result's `ok: false` branch. */
+ convert(input: ConvertInput): Promise<ConvertResult>;
95
+ }
96
+
97
+ /**
98
+ * Pipeline output after canonicalization and hash computation.
99
+ * This is what consumers receive from the conversion pipeline (in place of the raw ConvertOutput).
100
+ */
101
+ export interface ConversionArtifact {
102
+ /** Canonical markdown after pipeline normalization */
103
+ markdown: string;
104
+ /** SHA-256 hex of canonical markdown - content-addressed key */
105
+ mirrorHash: string;
106
+ /** Title from conversion (or derived from filename) */
107
+ title?: string;
108
+ /** Language hint from conversion */
109
+ languageHint?: string;
110
+ /** Conversion metadata, carried over from the converter's output */
111
+ meta: ConvertOutput['meta'];
112
+ }
113
+
114
/** Result of ConversionPipeline.convert: discriminated on `ok`, carrying either the finished artifact or an error. */
+ export type PipelineResult =
115
+ | { ok: true; value: ConversionArtifact }
116
+ | { ok: false; error: ConvertError };
117
+
118
+ /** Default conversion limits; same shape as ConvertInput.limits, with literal types preserved by `as const` */
119
+ export const DEFAULT_LIMITS = {
120
+ maxBytes: 100 * 1024 * 1024, // 100MB
121
+ timeoutMs: 60_000, // 60 seconds
122
+ maxOutputChars: 50_000_000, // 50M chars (zip bomb protection)
123
+ } as const;
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Centralized converter version tracking.
3
+ *
4
+ * Native converters use our own versioning.
5
+ * Adapter versions MUST match the wrapped npm package version.
6
+ *
7
+ * When updating npm dependencies, update these versions too.
8
+ * Run `bun pm ls markitdown-ts officeparser` to check current versions.
9
+ */
10
+
11
+ /** Versions of the native converters, keyed by converter name; these follow this project's own versioning */
12
+ export const NATIVE_VERSIONS = {
13
+ markdown: '1.0.0',
14
+ plaintext: '1.0.0',
15
+ } as const;
16
+
17
+ /**
18
+ * Adapter versions, keyed by npm package name - MUST match the installed npm package versions.
19
+ * Update these whenever the wrapped packages are upgraded (e.g. after `bun update`).
20
+ */
21
+ export const ADAPTER_VERSIONS = {
22
+ 'markitdown-ts': '0.0.8',
23
+ officeparser: '5.2.0',
24
+ } as const;
package/src/index.ts ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * GNO CLI entry point.
4
+ * Thin bootstrap that delegates to CLI runner.
5
+ *
6
+ * @module src/index
7
+ */
8
+
9
+ import { runCli } from './cli/run';
10
+
// Ctrl-C: say so on stderr and exit with status 130
const handleInterrupt = (): void => {
  process.stderr.write('\nInterrupted\n');
  process.exit(130);
};
process.on('SIGINT', handleInterrupt);

// Run the CLI; its resolved value becomes the process exit code
const cliExit = runCli(process.argv).then((code) => {
  process.exit(code);
});

// Anything that rejects is reported on stderr and exits with status 1
cliExit.catch((err) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Fatal error: ${reason}\n`);
  process.exit(1);
});
@@ -0,0 +1,238 @@
1
+ /**
2
+ * Markdown chunker implementation.
3
+ * Char-based chunking with line tracking.
4
+ *
5
+ * @module src/ingestion/chunker
6
+ */
7
+
8
+ import { defaultLanguageDetector } from './language';
9
+ import type { ChunkerPort, ChunkOutput, ChunkParams } from './types';
10
+ import { DEFAULT_CHUNK_PARAMS } from './types';
11
+
12
+ /** Approximate chars per token (conservative estimate); multiplied by maxTokens to get the per-chunk character budget */
13
+ const CHARS_PER_TOKEN = 4;
14
+
15
+ /** Minimum valid maxTokens to prevent degenerate behavior; smaller requests are raised to this value */
16
+ const MIN_MAX_TOKENS = 10;
17
+
18
+ /** Maximum valid overlap, expressed as a fraction of the chunk size (0.5 = 50%), despite the "percent" name */
19
+ const MAX_OVERLAP_PERCENT = 0.5;
20
+
21
+ /** Regex for sentence ending followed by whitespace and capital letter. The global flag makes it stateful: lastIndex must be reset before each scan. */
22
+ const SENTENCE_END_REGEX = /[.!?](\s+)[A-Z]/g;
23
+
24
+ /**
25
+ * Line index for fast line number lookups (binary search, O(log n) per lookup).
26
+ * Stores positions of all newline characters, in ascending order.
27
+ */
28
+ interface LineIndex {
29
+ /** Positions of '\n' characters in the text */
30
+ newlines: number[];
31
+ }
32
+
/**
 * Collect the offset of every '\n' in the text, in ascending order.
 * Done once per document so later line lookups do not rescan the text.
 */
function buildLineIndex(text: string): LineIndex {
  const newlines: number[] = [];
  let at = text.indexOf('\n');
  while (at !== -1) {
    newlines.push(at);
    at = text.indexOf('\n', at + 1);
  }
  return { newlines };
}
45
+
/**
 * Map a character offset to its 1-based line number.
 *
 * Binary-searches the newline offsets for how many newlines lie strictly
 * before `pos`; a newline character therefore belongs to the line it ends.
 */
function lineAtPosition(index: LineIndex, pos: number): number {
  const breaks = index.newlines;

  // Invariant: every newline left of `lo` is before pos, none at or right of `hi` is
  let lo = 0;
  let hi = breaks.length;

  while (lo < hi) {
    const probe = lo + Math.floor((hi - lo) / 2);
    const offset = breaks[probe];
    const liesBefore = offset !== undefined && offset < pos;
    if (liesBefore) {
      lo = probe + 1;
    } else {
      hi = probe;
    }
  }

  // `lo` newlines precede pos, so pos is on line lo + 1
  return lo + 1;
}
69
+
/**
 * Normalize chunk parameters to valid ranges.
 * Prevents degenerate behavior from invalid inputs.
 *
 * - Missing or NaN values fall back to DEFAULT_CHUNK_PARAMS.
 * - maxTokens is floored and raised to at least MIN_MAX_TOKENS.
 * - overlapPercent is clamped to [0, MAX_OVERLAP_PERCENT].
 *
 * @param params - Caller-supplied parameters, possibly partial or absent.
 * @returns Fully populated parameters within the valid ranges.
 */
function normalizeChunkParams(params?: ChunkParams): Required<ChunkParams> {
  const requestedTokens = params?.maxTokens ?? DEFAULT_CHUNK_PARAMS.maxTokens;
  const requestedOverlap =
    params?.overlapPercent ?? DEFAULT_CHUNK_PARAMS.overlapPercent;

  // Math.max/Math.min return NaN as soon as one argument is NaN, so the
  // clamps below would let a NaN through; replace it with the default first.
  const safeTokens = Number.isNaN(requestedTokens)
    ? DEFAULT_CHUNK_PARAMS.maxTokens
    : requestedTokens;
  const safeOverlap = Number.isNaN(requestedOverlap)
    ? DEFAULT_CHUNK_PARAMS.overlapPercent
    : requestedOverlap;

  const maxTokens = Math.max(MIN_MAX_TOKENS, Math.floor(safeTokens));
  const overlapPercent = Math.max(
    0,
    Math.min(MAX_OVERLAP_PERCENT, safeOverlap)
  );
  return { maxTokens, overlapPercent };
}
88
+
/**
 * Find a good break point near target position.
 *
 * Searches the window [target - windowSize, target + windowSize) and returns
 * the first kind of break found, in this order of preference:
 * paragraph break, sentence ending, single newline, space, raw target.
 * Within a kind, the last occurrence in the window wins.
 *
 * @param text - Full document text.
 * @param target - Desired break offset.
 * @param windowSize - How far either side of target to search.
 * @returns Offset at which the next chunk should start.
 */
function findBreakPoint(
  text: string,
  target: number,
  windowSize: number
): number {
  const start = Math.max(0, target - windowSize);
  const end = Math.min(text.length, target + windowSize);
  const windowText = text.slice(start, end);

  // Look for paragraph break (double newline) - prefer last one
  const paraBreak = windowText.lastIndexOf('\n\n');
  if (paraBreak !== -1) {
    return start + paraBreak + 2;
  }

  // Look for sentence ending - keep the LAST match inside the window.
  // The regex is global (stateful), so reset it for a fresh search.
  SENTENCE_END_REGEX.lastIndex = 0;
  let sentenceBreak = -1;
  let match = SENTENCE_END_REGEX.exec(windowText);
  while (match !== null) {
    // Break after the punctuation and whitespace, before the capital.
    // Every match lies inside the window, so no extra range check is needed.
    const whitespace = match[1] ?? '';
    sentenceBreak = start + match.index + 1 + whitespace.length;
    match = SENTENCE_END_REGEX.exec(windowText);
  }
  if (sentenceBreak !== -1) {
    return sentenceBreak;
  }

  // Look for single newline
  const lineBreak = windowText.lastIndexOf('\n');
  if (lineBreak !== -1) {
    return start + lineBreak + 1;
  }

  // Look for word boundary
  const wordBoundary = windowText.lastIndexOf(' ');
  if (wordBoundary !== -1) {
    return start + wordBoundary + 1;
  }

  // Fall back to target, but never cut between the two halves of a UTF-16
  // surrogate pair - that would leave a lone surrogate in both chunks.
  if (target > 0 && target < text.length) {
    const before = text.charCodeAt(target - 1);
    const after = text.charCodeAt(target);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      return target - 1;
    }
  }
  return target;
}
149
+
/**
 * Character-based Markdown chunker that prefers semantic break points.
 *
 * Chunk text is sliced from the document verbatim (never trimmed), so each
 * chunk's pos/line values map straight back to the source and whitespace-
 * sensitive Markdown such as indented code blocks survives intact.
 */
export class MarkdownChunker implements ChunkerPort {
  /**
   * Split markdown into overlapping chunks.
   *
   * @param markdown - Document text.
   * @param params - Optional size/overlap settings; normalized before use.
   * @param documentLanguageHint - When given, used as every chunk's language
   *                               instead of running per-chunk detection.
   * @returns Chunks in document order; empty for blank input.
   */
  chunk(
    markdown: string,
    params?: ChunkParams,
    documentLanguageHint?: string
  ): ChunkOutput[] {
    if (!markdown || markdown.trim().length === 0) {
      return [];
    }

    // Clamp caller-supplied params into the supported ranges
    const { maxTokens, overlapPercent } = normalizeChunkParams(params);

    const chunkBudget = maxTokens * CHARS_PER_TOKEN;
    const overlap = Math.floor(chunkBudget * overlapPercent);
    // Break points are searched within 10% of the budget around the target
    const searchWindow = Math.floor(chunkBudget * 0.1);

    // Newline offsets, computed once and reused for every line lookup
    const lines = buildLineIndex(markdown);
    const total = markdown.length;

    const out: ChunkOutput[] = [];
    let cursor = 0;

    while (cursor < total) {
      const desiredEnd = cursor + chunkBudget;

      // The final chunk takes whatever is left; otherwise look for a clean break
      const sliceEnd =
        desiredEnd >= total
          ? total
          : findBreakPoint(markdown, desiredEnd, searchWindow);

      // Verbatim slice - no trimming
      const text = markdown.slice(cursor, sliceEnd);

      // Whitespace-only slices are not emitted
      if (text.trim().length > 0) {
        out.push({
          // seq only advances for emitted chunks, so it equals the count so far
          seq: out.length,
          pos: cursor,
          text,
          startLine: lineAtPosition(lines, cursor),
          endLine: lineAtPosition(lines, sliceEnd - 1),
          language:
            documentLanguageHint ?? defaultLanguageDetector.detect(text),
          tokenCount: null, // Char-based, no exact token count
        });
      }

      if (sliceEnd >= total) {
        break;
      }

      // Step back by the overlap, but always move forward by at least one char
      cursor = Math.max(cursor + 1, sliceEnd - overlap);
    }

    return out;
  }
}
234
+
235
+ /**
236
+ * Default chunker instance: a MarkdownChunker created once at module load and shared by importers.
237
+ */
238
+ export const defaultChunker = new MarkdownChunker();