@gmickel/gno 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/assets/skill/SKILL.md +112 -0
- package/assets/skill/cli-reference.md +327 -0
- package/assets/skill/examples.md +234 -0
- package/assets/skill/mcp-reference.md +159 -0
- package/package.json +90 -0
- package/src/app/constants.ts +313 -0
- package/src/cli/colors.ts +65 -0
- package/src/cli/commands/ask.ts +545 -0
- package/src/cli/commands/cleanup.ts +105 -0
- package/src/cli/commands/collection/add.ts +120 -0
- package/src/cli/commands/collection/index.ts +10 -0
- package/src/cli/commands/collection/list.ts +108 -0
- package/src/cli/commands/collection/remove.ts +64 -0
- package/src/cli/commands/collection/rename.ts +95 -0
- package/src/cli/commands/context/add.ts +67 -0
- package/src/cli/commands/context/check.ts +153 -0
- package/src/cli/commands/context/index.ts +10 -0
- package/src/cli/commands/context/list.ts +109 -0
- package/src/cli/commands/context/rm.ts +52 -0
- package/src/cli/commands/doctor.ts +393 -0
- package/src/cli/commands/embed.ts +462 -0
- package/src/cli/commands/get.ts +356 -0
- package/src/cli/commands/index-cmd.ts +119 -0
- package/src/cli/commands/index.ts +102 -0
- package/src/cli/commands/init.ts +328 -0
- package/src/cli/commands/ls.ts +217 -0
- package/src/cli/commands/mcp/config.ts +300 -0
- package/src/cli/commands/mcp/index.ts +24 -0
- package/src/cli/commands/mcp/install.ts +203 -0
- package/src/cli/commands/mcp/paths.ts +470 -0
- package/src/cli/commands/mcp/status.ts +222 -0
- package/src/cli/commands/mcp/uninstall.ts +158 -0
- package/src/cli/commands/mcp.ts +20 -0
- package/src/cli/commands/models/clear.ts +103 -0
- package/src/cli/commands/models/index.ts +32 -0
- package/src/cli/commands/models/list.ts +214 -0
- package/src/cli/commands/models/path.ts +51 -0
- package/src/cli/commands/models/pull.ts +199 -0
- package/src/cli/commands/models/use.ts +85 -0
- package/src/cli/commands/multi-get.ts +400 -0
- package/src/cli/commands/query.ts +220 -0
- package/src/cli/commands/ref-parser.ts +108 -0
- package/src/cli/commands/reset.ts +191 -0
- package/src/cli/commands/search.ts +136 -0
- package/src/cli/commands/shared.ts +156 -0
- package/src/cli/commands/skill/index.ts +19 -0
- package/src/cli/commands/skill/install.ts +197 -0
- package/src/cli/commands/skill/paths-cmd.ts +81 -0
- package/src/cli/commands/skill/paths.ts +191 -0
- package/src/cli/commands/skill/show.ts +73 -0
- package/src/cli/commands/skill/uninstall.ts +141 -0
- package/src/cli/commands/status.ts +205 -0
- package/src/cli/commands/update.ts +68 -0
- package/src/cli/commands/vsearch.ts +188 -0
- package/src/cli/context.ts +64 -0
- package/src/cli/errors.ts +64 -0
- package/src/cli/format/search-results.ts +211 -0
- package/src/cli/options.ts +183 -0
- package/src/cli/program.ts +1330 -0
- package/src/cli/run.ts +213 -0
- package/src/cli/ui.ts +92 -0
- package/src/config/defaults.ts +20 -0
- package/src/config/index.ts +55 -0
- package/src/config/loader.ts +161 -0
- package/src/config/paths.ts +87 -0
- package/src/config/saver.ts +153 -0
- package/src/config/types.ts +280 -0
- package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
- package/src/converters/adapters/officeparser/adapter.ts +126 -0
- package/src/converters/canonicalize.ts +89 -0
- package/src/converters/errors.ts +218 -0
- package/src/converters/index.ts +51 -0
- package/src/converters/mime.ts +163 -0
- package/src/converters/native/markdown.ts +115 -0
- package/src/converters/native/plaintext.ts +56 -0
- package/src/converters/path.ts +48 -0
- package/src/converters/pipeline.ts +159 -0
- package/src/converters/registry.ts +74 -0
- package/src/converters/types.ts +123 -0
- package/src/converters/versions.ts +24 -0
- package/src/index.ts +27 -0
- package/src/ingestion/chunker.ts +238 -0
- package/src/ingestion/index.ts +32 -0
- package/src/ingestion/language.ts +276 -0
- package/src/ingestion/sync.ts +671 -0
- package/src/ingestion/types.ts +219 -0
- package/src/ingestion/walker.ts +235 -0
- package/src/llm/cache.ts +467 -0
- package/src/llm/errors.ts +191 -0
- package/src/llm/index.ts +58 -0
- package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
- package/src/llm/nodeLlamaCpp/generation.ts +88 -0
- package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
- package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
- package/src/llm/registry.ts +86 -0
- package/src/llm/types.ts +129 -0
- package/src/mcp/resources/index.ts +151 -0
- package/src/mcp/server.ts +229 -0
- package/src/mcp/tools/get.ts +220 -0
- package/src/mcp/tools/index.ts +160 -0
- package/src/mcp/tools/multi-get.ts +263 -0
- package/src/mcp/tools/query.ts +226 -0
- package/src/mcp/tools/search.ts +119 -0
- package/src/mcp/tools/status.ts +81 -0
- package/src/mcp/tools/vsearch.ts +198 -0
- package/src/pipeline/chunk-lookup.ts +44 -0
- package/src/pipeline/expansion.ts +256 -0
- package/src/pipeline/explain.ts +115 -0
- package/src/pipeline/fusion.ts +185 -0
- package/src/pipeline/hybrid.ts +535 -0
- package/src/pipeline/index.ts +64 -0
- package/src/pipeline/query-language.ts +118 -0
- package/src/pipeline/rerank.ts +223 -0
- package/src/pipeline/search.ts +261 -0
- package/src/pipeline/types.ts +328 -0
- package/src/pipeline/vsearch.ts +348 -0
- package/src/store/index.ts +41 -0
- package/src/store/migrations/001-initial.ts +196 -0
- package/src/store/migrations/index.ts +20 -0
- package/src/store/migrations/runner.ts +187 -0
- package/src/store/sqlite/adapter.ts +1242 -0
- package/src/store/sqlite/index.ts +7 -0
- package/src/store/sqlite/setup.ts +129 -0
- package/src/store/sqlite/types.ts +28 -0
- package/src/store/types.ts +506 -0
- package/src/store/vector/index.ts +13 -0
- package/src/store/vector/sqlite-vec.ts +373 -0
- package/src/store/vector/stats.ts +152 -0
- package/src/store/vector/types.ts +115 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ingestion subsystem - public exports.
|
|
3
|
+
*
|
|
4
|
+
* @module src/ingestion
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Chunker
|
|
8
|
+
export { defaultChunker, MarkdownChunker } from './chunker';
|
|
9
|
+
// Language detection
|
|
10
|
+
export { defaultLanguageDetector, SimpleLanguageDetector } from './language';
|
|
11
|
+
// Sync service
|
|
12
|
+
export { defaultSyncService, SyncService } from './sync';
|
|
13
|
+
// Types
|
|
14
|
+
export type {
|
|
15
|
+
ChunkerPort,
|
|
16
|
+
ChunkOutput,
|
|
17
|
+
ChunkParams,
|
|
18
|
+
CollectionSyncResult,
|
|
19
|
+
FileSyncResult,
|
|
20
|
+
FileSyncStatus,
|
|
21
|
+
LanguageDetectorPort,
|
|
22
|
+
ProcessDecision,
|
|
23
|
+
SkippedEntry,
|
|
24
|
+
SyncOptions,
|
|
25
|
+
SyncResult,
|
|
26
|
+
WalkConfig,
|
|
27
|
+
WalkEntry,
|
|
28
|
+
WalkerPort,
|
|
29
|
+
} from './types';
|
|
30
|
+
export { collectionToWalkConfig, DEFAULT_CHUNK_PARAMS } from './types';
|
|
31
|
+
// Walker
|
|
32
|
+
export { defaultWalker, FileWalker } from './walker';
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic language detection for chunks.
|
|
3
|
+
* Uses simple heuristics - no external dependencies.
|
|
4
|
+
*
|
|
5
|
+
* @module src/ingestion/language
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { LanguageDetectorPort } from './types';
|
|
9
|
+
|
|
10
|
+
/** Separator pattern: one or more whitespace or Unicode punctuation characters. */
const WORD_SPLIT_REGEX = /[\s\p{P}]+/u;

/** Inclusive codepoint bounds of the Hiragana block. */
const HIRAGANA_MIN = 0x30_40;
const HIRAGANA_MAX = 0x30_9f;

/** Inclusive codepoint bounds of the Katakana block. */
const KATAKANA_MIN = 0x30_a0;
const KATAKANA_MAX = 0x30_ff;

/** Inclusive codepoint bounds of the Hangul range used for Korean detection. */
const HANGUL_MIN = 0xac_00;
const HANGUL_MAX = 0xd7_af;

/**
 * Inclusive [min, max] codepoint pairs that count as CJK.
 * The kana and Hangul entries reuse the named bounds above so the two
 * definitions cannot drift apart.
 */
const CJK_RANGES = [
  [0x4e_00, 0x9f_ff], // CJK Unified Ideographs
  [0x34_00, 0x4d_bf], // CJK Unified Ideographs Extension A
  [HIRAGANA_MIN, HIRAGANA_MAX], // Hiragana
  [KATAKANA_MIN, KATAKANA_MAX], // Katakana
  [HANGUL_MIN, HANGUL_MAX], // Hangul
] as const;

/**
 * Minimum share of non-whitespace characters that must be CJK
 * before text is classified as a CJK language (0.1 = 10%).
 */
const CJK_THRESHOLD = 0.1;
|
|
38
|
+
|
|
39
|
+
/**
 * Stop-word markers per European language, keyed by language code.
 * Each list is stored as a Set so membership checks are O(1).
 * Key order matters: on equal scores the language listed first wins.
 *
 * NOTE(review): extractWords() discards tokens shorter than 2 characters,
 * so the single-letter Italian markers 'e' and 'è' can never match words
 * produced by it.
 */
const LANGUAGE_MARKER_SETS: Record<string, Set<string>> = {
  // English
  en: new Set([
    'the', 'and', 'is', 'are', 'was', 'were', 'be',
    'have', 'has', 'this', 'that', 'with', 'for', 'not',
  ]),
  // German
  de: new Set([
    'der', 'die', 'das', 'und', 'ist', 'sind', 'ein',
    'eine', 'für', 'mit', 'auf', 'den', 'dem', 'nicht',
  ]),
  // French
  fr: new Set([
    'le', 'la', 'les', 'et', 'est', 'sont', 'un',
    'une', 'pour', 'avec', 'sur', 'des', 'dans', 'pas',
  ]),
  // Italian
  it: new Set([
    'il', 'la', 'le', 'e', 'è', 'sono', 'un',
    'una', 'per', 'con', 'su', 'dei', 'nel', 'non',
  ]),
};
|
|
110
|
+
|
|
111
|
+
/**
 * Report whether a codepoint lies inside any of the CJK_RANGES.
 *
 * @param cp - Unicode codepoint to test.
 * @returns `true` when `cp` falls within one of the inclusive ranges.
 */
function isCjkCodepoint(cp: number): boolean {
  return CJK_RANGES.some(([lower, upper]) => lower <= cp && cp <= upper);
}
|
|
122
|
+
|
|
123
|
+
/**
 * Check if a codepoint is whitespace.
 *
 * Covers the codepoints matched by the ECMAScript `\s` class (WhiteSpace plus
 * LineTerminator). This includes U+3000 IDEOGRAPHIC SPACE, which is common in
 * CJK text and would otherwise be counted as a non-CJK character.
 *
 * @param cp - Unicode codepoint to classify.
 * @returns `true` when `cp` is a whitespace codepoint, otherwise `false`.
 */
function isWhitespace(cp: number): boolean {
  switch (cp) {
    case 0x09: // tab
    case 0x0a: // newline
    case 0x0b: // vertical tab
    case 0x0c: // form feed
    case 0x0d: // carriage return
    case 0x20: // space
    case 0xa0: // non-breaking space
    case 0x16_80: // ogham space mark
    case 0x20_28: // line separator
    case 0x20_29: // paragraph separator
    case 0x20_2f: // narrow no-break space
    case 0x20_5f: // medium mathematical space
    case 0x30_00: // ideographic space
    case 0xfe_ff: // zero width no-break space / BOM
      return true;
    default:
      // En quad through hair space (various fixed-width spaces).
      return cp >= 0x20_00 && cp <= 0x20_0a;
  }
}
|
|
138
|
+
|
|
139
|
+
/**
 * Tokenise text into lowercase words for language analysis.
 *
 * The text is lowercased, split on WORD_SPLIT_REGEX, and only tokens of
 * 2 to 15 UTF-16 code units are kept (this also drops empty tokens).
 *
 * @param text - Raw text to tokenise.
 * @returns Lowercased tokens that pass the length filter, in source order.
 */
function extractWords(text: string): string[] {
  const tokens = text.toLowerCase().split(WORD_SPLIT_REGEX);
  return tokens.filter((token) => {
    const size = token.length;
    return size >= 2 && size <= 15;
  });
}
|
|
148
|
+
|
|
149
|
+
/**
 * Classify text as Chinese, Japanese or Korean from its scripts.
 *
 * Walks the text once by codepoint, ignoring whitespace. The text counts as
 * CJK when the share of CJK codepoints reaches CJK_THRESHOLD. Any kana then
 * yields 'ja', otherwise any Hangul yields 'ko', otherwise 'zh'.
 *
 * @param text - Text to inspect.
 * @returns 'ja', 'ko' or 'zh', or `null` when the text has no non-whitespace
 *   characters or its CJK share is below the threshold.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: Unicode range detection with multiple language heuristics
function detectCjk(text: string): 'zh' | 'ja' | 'ko' | null {
  let visibleCount = 0;
  let cjkHits = 0;
  let sawKana = false;
  let sawHangul = false;

  for (const glyph of text) {
    const cp = glyph.codePointAt(0);

    // Whitespace is excluded from the denominator.
    if (cp === undefined || isWhitespace(cp)) {
      continue;
    }
    visibleCount += 1;

    if (!isCjkCodepoint(cp)) {
      continue;
    }
    cjkHits += 1;

    // Track script-specific markers used to tell the languages apart.
    const isHiragana = cp >= HIRAGANA_MIN && cp <= HIRAGANA_MAX;
    const isKatakana = cp >= KATAKANA_MIN && cp <= KATAKANA_MAX;
    if (isHiragana || isKatakana) {
      sawKana = true;
    } else if (cp >= HANGUL_MIN && cp <= HANGUL_MAX) {
      sawHangul = true;
    }
  }

  // Nothing to measure, or too little CJK content.
  if (visibleCount === 0 || cjkHits / visibleCount < CJK_THRESHOLD) {
    return null;
  }

  // Kana takes precedence over Hangul; pure Han text defaults to Chinese.
  if (sawKana) {
    return 'ja';
  }
  return sawHangul ? 'ko' : 'zh';
}
|
|
209
|
+
|
|
210
|
+
/**
 * Pick the European language whose stop words are most frequent.
 *
 * For every language in LANGUAGE_MARKER_SETS the score is the fraction of
 * `words` found in that language's marker set. The highest score wins, but
 * only if it is strictly greater than 2%. On ties the language listed first
 * is kept.
 *
 * @param words - Lowercased tokens to score.
 * @returns The winning language code, or `null` when fewer than 10 words
 *   were supplied or no language exceeds the threshold.
 */
function detectEuropean(words: string[]): string | null {
  const wordCount = words.length;
  if (wordCount < 10) {
    return null;
  }

  // A language must beat this ratio (2% of words) to be selected.
  const minimumRatio = 0.02;
  let winner: string | null = null;
  let winningRatio = minimumRatio;

  for (const [code, markers] of Object.entries(LANGUAGE_MARKER_SETS)) {
    const hits = words.reduce(
      (count, word) => (markers.has(word) ? count + 1 : count),
      0
    );
    const ratio = hits / wordCount;

    // Strict comparison keeps the earlier language on equal scores.
    if (ratio > winningRatio) {
      winningRatio = ratio;
      winner = code;
    }
  }

  return winner;
}
|
|
247
|
+
|
|
248
|
+
/**
 * Heuristic, deterministic language detector.
 *
 * Detection order:
 *   1. Script-based CJK detection.
 *   2. Stop-word frequency for European languages.
 *   3. `null` when neither step yields a result.
 */
export class SimpleLanguageDetector implements LanguageDetectorPort {
  /**
   * Detect the language of a piece of text.
   *
   * @param text - Text to classify.
   * @returns A language code, or `null` when the text is empty, shorter than
   *   50 UTF-16 code units, or cannot be classified.
   */
  detect(text: string): string | null {
    const hasEnoughText = Boolean(text) && text.length >= 50;
    if (!hasEnoughText) {
      return null;
    }

    // Script detection runs first; fall back to word frequency otherwise.
    return detectCjk(text) ?? detectEuropean(extractWords(text));
  }
}

/**
 * Module-level SimpleLanguageDetector instance exported as the default detector.
 */
export const defaultLanguageDetector = new SimpleLanguageDetector();
|