@luckydraw/cumulus 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/cli/cumulus.d.ts +3 -0
- package/dist/cli/cumulus.d.ts.map +1 -0
- package/dist/cli/cumulus.js +233 -0
- package/dist/cli/cumulus.js.map +1 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +43 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/config.d.ts +86 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +241 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/content-detector.d.ts +46 -0
- package/dist/lib/content-detector.d.ts.map +1 -0
- package/dist/lib/content-detector.js +359 -0
- package/dist/lib/content-detector.js.map +1 -0
- package/dist/lib/content-store.d.ts +255 -0
- package/dist/lib/content-store.d.ts.map +1 -0
- package/dist/lib/content-store.js +955 -0
- package/dist/lib/content-store.js.map +1 -0
- package/dist/lib/context-budget.d.ts +83 -0
- package/dist/lib/context-budget.d.ts.map +1 -0
- package/dist/lib/context-budget.js +101 -0
- package/dist/lib/context-budget.js.map +1 -0
- package/dist/lib/embeddings.d.ts +64 -0
- package/dist/lib/embeddings.d.ts.map +1 -0
- package/dist/lib/embeddings.js +176 -0
- package/dist/lib/embeddings.js.map +1 -0
- package/dist/lib/history.d.ts +120 -0
- package/dist/lib/history.d.ts.map +1 -0
- package/dist/lib/history.js +205 -0
- package/dist/lib/history.js.map +1 -0
- package/dist/lib/image-utils.d.ts +41 -0
- package/dist/lib/image-utils.d.ts.map +1 -0
- package/dist/lib/image-utils.js +288 -0
- package/dist/lib/image-utils.js.map +1 -0
- package/dist/lib/migrate.d.ts +35 -0
- package/dist/lib/migrate.d.ts.map +1 -0
- package/dist/lib/migrate.js +196 -0
- package/dist/lib/migrate.js.map +1 -0
- package/dist/lib/retriever.d.ts +56 -0
- package/dist/lib/retriever.d.ts.map +1 -0
- package/dist/lib/retriever.js +644 -0
- package/dist/lib/retriever.js.map +1 -0
- package/dist/lib/revert.d.ts +23 -0
- package/dist/lib/revert.d.ts.map +1 -0
- package/dist/lib/revert.js +75 -0
- package/dist/lib/revert.js.map +1 -0
- package/dist/lib/session.d.ts +65 -0
- package/dist/lib/session.d.ts.map +1 -0
- package/dist/lib/session.js +289 -0
- package/dist/lib/session.js.map +1 -0
- package/dist/lib/snapshots.d.ts +39 -0
- package/dist/lib/snapshots.d.ts.map +1 -0
- package/dist/lib/snapshots.js +99 -0
- package/dist/lib/snapshots.js.map +1 -0
- package/dist/lib/stream-processor.d.ts +149 -0
- package/dist/lib/stream-processor.d.ts.map +1 -0
- package/dist/lib/stream-processor.js +389 -0
- package/dist/lib/stream-processor.js.map +1 -0
- package/dist/lib/summarizer.d.ts +67 -0
- package/dist/lib/summarizer.d.ts.map +1 -0
- package/dist/lib/summarizer.js +213 -0
- package/dist/lib/summarizer.js.map +1 -0
- package/dist/mcp/index.d.ts +3 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +16 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/proxy.d.ts +19 -0
- package/dist/mcp/proxy.d.ts.map +1 -0
- package/dist/mcp/proxy.js +120 -0
- package/dist/mcp/proxy.js.map +1 -0
- package/dist/mcp/server.d.ts +6 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +29 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/mcp/shared-server.d.ts +21 -0
- package/dist/mcp/shared-server.d.ts.map +1 -0
- package/dist/mcp/shared-server.js +210 -0
- package/dist/mcp/shared-server.js.map +1 -0
- package/dist/mcp/tool-handler.d.ts +20 -0
- package/dist/mcp/tool-handler.d.ts.map +1 -0
- package/dist/mcp/tool-handler.js +1405 -0
- package/dist/mcp/tool-handler.js.map +1 -0
- package/dist/tui/components/App.d.ts +11 -0
- package/dist/tui/components/App.d.ts.map +1 -0
- package/dist/tui/components/App.js +607 -0
- package/dist/tui/components/App.js.map +1 -0
- package/dist/tui/components/DebugContextView.d.ts +13 -0
- package/dist/tui/components/DebugContextView.d.ts.map +1 -0
- package/dist/tui/components/DebugContextView.js +78 -0
- package/dist/tui/components/DebugContextView.js.map +1 -0
- package/dist/tui/components/IncludeMenu.d.ts +12 -0
- package/dist/tui/components/IncludeMenu.d.ts.map +1 -0
- package/dist/tui/components/IncludeMenu.js +127 -0
- package/dist/tui/components/IncludeMenu.js.map +1 -0
- package/dist/tui/components/InputArea.d.ts +27 -0
- package/dist/tui/components/InputArea.d.ts.map +1 -0
- package/dist/tui/components/InputArea.js +366 -0
- package/dist/tui/components/InputArea.js.map +1 -0
- package/dist/tui/components/MarkdownText.d.ts +38 -0
- package/dist/tui/components/MarkdownText.d.ts.map +1 -0
- package/dist/tui/components/MarkdownText.js +234 -0
- package/dist/tui/components/MarkdownText.js.map +1 -0
- package/dist/tui/components/MessageBubble.d.ts +11 -0
- package/dist/tui/components/MessageBubble.d.ts.map +1 -0
- package/dist/tui/components/MessageBubble.js +16 -0
- package/dist/tui/components/MessageBubble.js.map +1 -0
- package/dist/tui/components/MessageHistory.d.ts +11 -0
- package/dist/tui/components/MessageHistory.d.ts.map +1 -0
- package/dist/tui/components/MessageHistory.js +12 -0
- package/dist/tui/components/MessageHistory.js.map +1 -0
- package/dist/tui/components/RevertMenu.d.ts +17 -0
- package/dist/tui/components/RevertMenu.d.ts.map +1 -0
- package/dist/tui/components/RevertMenu.js +144 -0
- package/dist/tui/components/RevertMenu.js.map +1 -0
- package/dist/tui/components/StatusBar.d.ts +14 -0
- package/dist/tui/components/StatusBar.d.ts.map +1 -0
- package/dist/tui/components/StatusBar.js +13 -0
- package/dist/tui/components/StatusBar.js.map +1 -0
- package/dist/tui/components/StreamingResponse.d.ts +15 -0
- package/dist/tui/components/StreamingResponse.d.ts.map +1 -0
- package/dist/tui/components/StreamingResponse.js +52 -0
- package/dist/tui/components/StreamingResponse.js.map +1 -0
- package/dist/tui/hooks/useAppState.d.ts +147 -0
- package/dist/tui/hooks/useAppState.d.ts.map +1 -0
- package/dist/tui/hooks/useAppState.js +110 -0
- package/dist/tui/hooks/useAppState.js.map +1 -0
- package/dist/tui/hooks/useClaudeProcess.d.ts +19 -0
- package/dist/tui/hooks/useClaudeProcess.d.ts.map +1 -0
- package/dist/tui/hooks/useClaudeProcess.js +185 -0
- package/dist/tui/hooks/useClaudeProcess.js.map +1 -0
- package/dist/tui/index.d.ts +10 -0
- package/dist/tui/index.d.ts.map +1 -0
- package/dist/tui/index.js +11 -0
- package/dist/tui/index.js.map +1 -0
- package/dist/tui/utils/streamParser.d.ts +31 -0
- package/dist/tui/utils/streamParser.d.ts.map +1 -0
- package/dist/tui/utils/streamParser.js +63 -0
- package/dist/tui/utils/streamParser.js.map +1 -0
- package/package.json +94 -0
|
@@ -0,0 +1,955 @@
|
|
|
1
|
+
import * as crypto from 'crypto';
|
|
2
|
+
import * as fs from 'fs/promises';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
import { nanoid } from 'nanoid';
|
|
5
|
+
import { generateHeuristicSummary } from './content-detector.js';
|
|
6
|
+
import { estimateTokens } from './context-budget.js';
|
|
7
|
+
import { cosineSimilarity, embeddingsAvailable, getEmbeddingProvider } from './embeddings.js';
|
|
8
|
+
/** Default chunk size in tokens */
|
|
9
|
+
const DEFAULT_CHUNK_SIZE = 800;
|
|
10
|
+
/** Minimum structural unit size before merging with sibling */
|
|
11
|
+
const MIN_UNIT_TOKENS = 100;
|
|
12
|
+
/** Maximum snippet length for search results */
|
|
13
|
+
const SNIPPET_LENGTH = 300;
|
|
14
|
+
/** Minimum similarity threshold for semantic search */
|
|
15
|
+
const SEMANTIC_THRESHOLD = 0.3;
|
|
16
|
+
/**
|
|
17
|
+
* ContentStore manages externalized content storage and retrieval.
|
|
18
|
+
*
|
|
19
|
+
* Content is stored in a thread-specific directory:
|
|
20
|
+
* - index.jsonl: Metadata for all stored content
|
|
21
|
+
* - chunks/: Directory containing actual content chunks
|
|
22
|
+
*/
|
|
23
|
+
export class ContentStore {
|
|
24
|
+
// Root directory for this thread's externalized content.
basePath;
// Path to the JSONL metadata index file (one JSON object per line).
indexPath;
// Directory containing the individual chunk files (`<id>_<n>.txt`).
chunksDir;
/**
 * @param {string} basePath - Thread-specific storage root; index and chunks
 *   live directly beneath it. Directories are created lazily, not here.
 */
constructor(basePath) {
    this.basePath = basePath;
    this.indexPath = path.join(basePath, 'index.jsonl');
    this.chunksDir = path.join(basePath, 'chunks');
}
|
|
32
|
+
/**
 * Ensure storage directories exist.
 * Creates the chunks directory (and, via `recursive: true`, basePath itself)
 * if missing; safe to call repeatedly.
 */
async ensureDirectories() {
    await fs.mkdir(this.chunksDir, { recursive: true });
}
|
|
38
|
+
/**
|
|
39
|
+
* Get path to the images directory for cached image files.
|
|
40
|
+
*/
|
|
41
|
+
getImagesDir() {
|
|
42
|
+
return path.join(this.basePath, 'images');
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Ensure the images directory exists.
|
|
46
|
+
*/
|
|
47
|
+
async ensureImagesDir() {
|
|
48
|
+
const imagesDir = this.getImagesDir();
|
|
49
|
+
await fs.mkdir(imagesDir, { recursive: true });
|
|
50
|
+
return imagesDir;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Compute SHA-256 hash of content for deduplication.
|
|
54
|
+
*/
|
|
55
|
+
computeHash(content) {
|
|
56
|
+
return crypto.createHash('sha256').update(content).digest('hex');
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Split content into chunks of approximately chunkSize tokens.
|
|
60
|
+
*/
|
|
61
|
+
createChunks(content, chunkSize = DEFAULT_CHUNK_SIZE) {
|
|
62
|
+
const lines = content.split('\n');
|
|
63
|
+
const chunks = [];
|
|
64
|
+
let currentChunk = [];
|
|
65
|
+
let currentTokens = 0;
|
|
66
|
+
for (const line of lines) {
|
|
67
|
+
const lineTokens = estimateTokens(line);
|
|
68
|
+
// If adding this line would exceed chunk size, finalize current chunk
|
|
69
|
+
if (currentTokens + lineTokens > chunkSize && currentChunk.length > 0) {
|
|
70
|
+
const chunkContent = currentChunk.join('\n');
|
|
71
|
+
chunks.push({
|
|
72
|
+
index: chunks.length,
|
|
73
|
+
content: chunkContent,
|
|
74
|
+
tokenEstimate: currentTokens,
|
|
75
|
+
});
|
|
76
|
+
currentChunk = [];
|
|
77
|
+
currentTokens = 0;
|
|
78
|
+
}
|
|
79
|
+
currentChunk.push(line);
|
|
80
|
+
currentTokens += lineTokens;
|
|
81
|
+
}
|
|
82
|
+
// Don't forget the last chunk
|
|
83
|
+
if (currentChunk.length > 0) {
|
|
84
|
+
const chunkContent = currentChunk.join('\n');
|
|
85
|
+
chunks.push({
|
|
86
|
+
index: chunks.length,
|
|
87
|
+
content: chunkContent,
|
|
88
|
+
tokenEstimate: currentTokens,
|
|
89
|
+
});
|
|
90
|
+
}
|
|
91
|
+
return chunks;
|
|
92
|
+
}
|
|
93
|
+
/**
 * Detect structural boundaries in non-code content and split into structural units.
 * Each unit is a coherent block: a headed section, table, code fence, paragraph, etc.
 *
 * State machine over the input lines:
 *  - ``` / ~~~ fences toggle "code fence" mode; fenced content is atomic.
 *  - Consecutive `|…|` lines form one atomic table unit.
 *  - Headings and horizontal rules force boundaries.
 *  - Blank lines end a paragraph and are dropped (not included in any unit).
 *
 * @param {string[]} lines - Content already split on '\n'.
 * @returns {string[][]} Units, each an array of original lines.
 */
detectStructuralUnits(lines) {
    const units = [];
    let current = [];
    let inCodeFence = false;
    let inTable = false;
    // Push the in-progress unit (if non-empty) and reset the accumulator.
    const flushCurrent = () => {
        if (current.length > 0) {
            units.push(current);
            current = [];
        }
    };
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const trimmed = line.trimStart();
        // Code fence toggle (``` or ~~~; info strings after the fence are fine)
        if (trimmed.startsWith('```') || trimmed.startsWith('~~~')) {
            if (!inCodeFence) {
                // Starting a code fence — flush anything before it
                flushCurrent();
                inCodeFence = true;
                current.push(line);
            }
            else {
                // Closing a code fence — include closing line, flush as atomic unit
                inCodeFence = false;
                current.push(line);
                flushCurrent();
            }
            continue;
        }
        // Inside a code fence — accumulate without any splitting
        if (inCodeFence) {
            current.push(line);
            continue;
        }
        // Table detection: lines starting with | and containing a second |
        const isTableLine = trimmed.startsWith('|') && trimmed.includes('|', 1);
        if (isTableLine) {
            if (!inTable) {
                // Starting a table — flush anything before it
                flushCurrent();
                inTable = true;
            }
            current.push(line);
            continue;
        }
        else if (inTable) {
            // Leaving a table — flush the table as atomic unit
            inTable = false;
            flushCurrent();
        }
        // Markdown heading — starts a new section
        if (/^#{1,6}\s/.test(trimmed)) {
            flushCurrent();
            current.push(line);
            continue;
        }
        // Horizontal rule — a single-line unit of its own
        if (/^(?:---+|___+|\*\*\*+)\s*$/.test(trimmed)) {
            flushCurrent();
            current.push(line);
            flushCurrent();
            continue;
        }
        // Blank line — paragraph boundary
        if (trimmed === '') {
            if (current.length > 0) {
                // End of a paragraph/section
                flushCurrent();
            }
            // Skip blank lines (don't include in any unit)
            continue;
        }
        // Regular line — accumulate into current unit
        current.push(line);
    }
    // Flush remaining (including unclosed code fences)
    flushCurrent();
    return units;
}
|
|
177
|
+
/**
 * Split non-code content into structure-aware chunks.
 * Respects headers, tables, code fences, and paragraph boundaries
 * (via detectStructuralUnits). Two passes:
 *  1. Merge units smaller than MIN_UNIT_TOKENS with the following sibling
 *     (or the previous one at end-of-input) so chunks aren't fragmentary.
 *  2. Pack merged units into chunks of ~chunkSize tokens; a unit larger
 *     than chunkSize falls back to line-based splitting via createChunks.
 *
 * NOTE(review): blank lines between units were dropped during detection, so
 * rejoining chunks does not reproduce the original byte-for-byte.
 *
 * @param {string} content - Non-code text to chunk.
 * @param {number} [chunkSize] - Target tokens per chunk.
 * @returns {{index: number, content: string, tokenEstimate: number}[]}
 */
createStructuralChunks(content, chunkSize = DEFAULT_CHUNK_SIZE) {
    const lines = content.split('\n');
    const units = this.detectStructuralUnits(lines);
    // Merge tiny units with their next sibling
    const mergedUnits = [];
    let pendingUnit = null;
    let pendingTokens = 0;
    for (const unit of units) {
        const unitContent = unit.join('\n');
        const unitTokens = estimateTokens(unitContent);
        if (pendingUnit) {
            // Merge pending tiny unit with this one
            pendingUnit.push('', ...unit); // blank line separator
            pendingTokens += unitTokens;
            if (pendingTokens >= MIN_UNIT_TOKENS) {
                mergedUnits.push(pendingUnit);
                pendingUnit = null;
                pendingTokens = 0;
            }
        }
        else if (unitTokens < MIN_UNIT_TOKENS) {
            // Too small to stand alone — hold and merge with what follows
            pendingUnit = [...unit];
            pendingTokens = unitTokens;
        }
        else {
            mergedUnits.push(unit);
        }
    }
    // Flush any remaining pending unit
    if (pendingUnit) {
        if (mergedUnits.length > 0) {
            // Append to previous unit
            const last = mergedUnits[mergedUnits.length - 1];
            last.push('', ...pendingUnit);
        }
        else {
            mergedUnits.push(pendingUnit);
        }
    }
    // Pack units into chunks
    const chunks = [];
    let chunkLines = [];
    let chunkTokens = 0;
    for (const unit of mergedUnits) {
        const unitContent = unit.join('\n');
        const unitTokens = estimateTokens(unitContent);
        // If a single unit exceeds chunk size, split it with the fallback line-based splitter
        if (unitTokens > chunkSize) {
            // Flush accumulated chunk first
            if (chunkLines.length > 0) {
                const cc = chunkLines.join('\n');
                chunks.push({ index: chunks.length, content: cc, tokenEstimate: chunkTokens });
                chunkLines = [];
                chunkTokens = 0;
            }
            // Split the oversized unit at line boundaries
            const subChunks = this.createChunks(unitContent, chunkSize);
            for (const sub of subChunks) {
                // Re-index so chunk indices stay contiguous across both paths
                chunks.push({
                    index: chunks.length,
                    content: sub.content,
                    tokenEstimate: sub.tokenEstimate,
                });
            }
            continue;
        }
        // If adding this unit exceeds chunk size, finalize current chunk
        if (chunkTokens + unitTokens > chunkSize && chunkLines.length > 0) {
            const cc = chunkLines.join('\n');
            chunks.push({ index: chunks.length, content: cc, tokenEstimate: chunkTokens });
            chunkLines = [];
            chunkTokens = 0;
        }
        // Add blank line separator between units within a chunk
        if (chunkLines.length > 0) {
            chunkLines.push('');
        }
        chunkLines.push(...unit);
        chunkTokens += unitTokens;
    }
    // Final chunk
    if (chunkLines.length > 0) {
        const cc = chunkLines.join('\n');
        chunks.push({ index: chunks.length, content: cc, tokenEstimate: chunkTokens });
    }
    return chunks;
}
|
|
269
|
+
/**
 * Regex patterns that identify top-level code block boundaries.
 * Consumed by isBlockBoundary() to decide where createCodeChunks()
 * starts a new block. Each pattern is anchored at line start, so
 * indented (nested) declarations do not match.
 */
static CODE_BLOCK_PATTERNS = [
    // JavaScript/TypeScript
    /^(?:export\s+)?(?:async\s+)?function\s+\w+/,
    /^(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\(/, // arrow-fn / fn-expression assignment
    /^(?:export\s+)?class\s+\w+/,
    /^(?:export\s+)?(?:interface|type|enum)\s+\w+/,
    // Python
    /^(?:async\s+)?def\s+\w+/,
    // Rust
    /^(?:pub\s+)?(?:async\s+)?fn\s+\w+/,
    /^(?:pub\s+)?(?:struct|enum|trait)\s+/,
    /^impl\s+/,
    // Go
    /^func\s+/,
    /^type\s+\w+\s+(?:struct|interface)/,
];
|
|
288
|
+
/**
 * Pattern matching import/use/include lines across common languages
 * (JS/TS `import`/`require(`, Python `from`/`import`, Rust `use`,
 * C/C++ `#include`). Anchored at line start; used by createCodeChunks()
 * to group consecutive imports into one block.
 */
static IMPORT_PATTERN = /^(?:import\s|from\s|require\(|use\s|#include\s|include\s)/;
|
|
292
|
+
/**
|
|
293
|
+
* Check if a line is a code block boundary (top-level declaration).
|
|
294
|
+
*/
|
|
295
|
+
isBlockBoundary(line) {
|
|
296
|
+
return ContentStore.CODE_BLOCK_PATTERNS.some(p => p.test(line));
|
|
297
|
+
}
|
|
298
|
+
/**
 * Split code content into chunks at function/class/type boundaries.
 * Falls back to line-based splitting for oversized single blocks.
 *
 * Pass 1 groups lines into blocks: consecutive import lines form one block,
 * each top-level declaration (per CODE_BLOCK_PATTERNS) starts a new block,
 * and the first non-blank non-import line after an import run also starts
 * a new implicit block. Pass 2 packs blocks into ~chunkSize-token chunks.
 *
 * @param {string} content - Source code to chunk.
 * @param {number} [chunkSize] - Target tokens per chunk.
 * @returns {{index: number, content: string, tokenEstimate: number}[]}
 */
createCodeChunks(content, chunkSize = DEFAULT_CHUNK_SIZE) {
    const lines = content.split('\n');
    const blocks = [];
    let currentBlock = [];
    let inImportBlock = false;
    for (const line of lines) {
        const isImport = ContentStore.IMPORT_PATTERN.test(line);
        const isBoundary = this.isBlockBoundary(line);
        if (isImport) {
            // Group consecutive imports together
            if (!inImportBlock && currentBlock.length > 0) {
                blocks.push(currentBlock);
                currentBlock = [];
            }
            inImportBlock = true;
            currentBlock.push(line);
        }
        else if (isBoundary) {
            // New top-level declaration starts a new block
            if (currentBlock.length > 0) {
                blocks.push(currentBlock);
            }
            inImportBlock = false;
            currentBlock = [line];
        }
        else {
            // Continue current block (or start a new implicit block after imports)
            if (inImportBlock && line.trim() !== '') {
                blocks.push(currentBlock);
                currentBlock = [line];
                inImportBlock = false;
            }
            else {
                currentBlock.push(line);
            }
        }
    }
    if (currentBlock.length > 0) {
        blocks.push(currentBlock);
    }
    // Now accumulate blocks into chunks, respecting size limits
    const chunks = [];
    let chunkLines = [];
    let chunkTokens = 0;
    for (const block of blocks) {
        const blockContent = block.join('\n');
        const blockTokens = estimateTokens(blockContent);
        // If a single block exceeds chunk size, fall back to line-based splitting
        if (blockTokens > chunkSize) {
            // First, flush any accumulated lines
            if (chunkLines.length > 0) {
                // NOTE: `content` here intentionally shadows the parameter,
                // which is no longer needed at this point.
                const content = chunkLines.join('\n');
                chunks.push({ index: chunks.length, content, tokenEstimate: chunkTokens });
                chunkLines = [];
                chunkTokens = 0;
            }
            // Split the oversized block by lines
            const subChunks = this.createChunks(blockContent, chunkSize);
            for (const sub of subChunks) {
                // Re-index so chunk indices stay contiguous
                chunks.push({
                    index: chunks.length,
                    content: sub.content,
                    tokenEstimate: sub.tokenEstimate,
                });
            }
            continue;
        }
        // If adding this block would exceed chunk size, finalize current chunk
        if (chunkTokens + blockTokens > chunkSize && chunkLines.length > 0) {
            const content = chunkLines.join('\n');
            chunks.push({ index: chunks.length, content, tokenEstimate: chunkTokens });
            chunkLines = [];
            chunkTokens = 0;
        }
        chunkLines.push(...block);
        chunkTokens += blockTokens;
    }
    // Final chunk
    if (chunkLines.length > 0) {
        const content = chunkLines.join('\n');
        chunks.push({ index: chunks.length, content, tokenEstimate: chunkTokens });
    }
    return chunks;
}
|
|
386
|
+
/**
|
|
387
|
+
* Generate a brief summary of content.
|
|
388
|
+
* Delegates to the shared heuristic summary generator in content-detector.
|
|
389
|
+
*/
|
|
390
|
+
generateSimpleSummary(content, contentType) {
|
|
391
|
+
return generateHeuristicSummary(content, contentType, 500);
|
|
392
|
+
}
|
|
393
|
+
/**
 * Store content externally and return metadata.
 * Returns existing content ID if content hash matches (deduplication).
 *
 * Pipeline: hash → dedup lookup → chunk (code-aware or structure-aware)
 * → summarize → prepend a context header to chunks 1..N → write chunk
 * files → append metadata to index.jsonl.
 *
 * @param {string} content - Raw content to externalize.
 * @param {object} options - { sourceType, sourceTool, contentType?, summary?, metadata? }.
 * @returns {Promise<object>} The stored (or pre-existing) metadata entry.
 */
async store(content, options) {
    await this.ensureDirectories();
    const contentHash = this.computeHash(content);
    // Check for existing content with same hash
    const existing = await this.findByHash(contentHash);
    if (existing) {
        return existing;
    }
    const id = `cnt_${nanoid(10)}`;
    // Caller-provided type wins; otherwise detect heuristically
    const contentType = options.contentType ?? this.detectContentType(content);
    const chunks = contentType === 'code'
        ? this.createCodeChunks(content)
        : this.createStructuralChunks(content);
    const summary = options.summary ?? this.generateSimpleSummary(content, contentType);
    // Context propagation: prepend document-level context to chunks 1..N
    // (chunk 0 is skipped — it already starts the document)
    if (chunks.length > 1) {
        const sourceName = options.metadata?.filePath ??
            options.metadata?.command ??
            options.sourceTool ??
            options.sourceType;
        const contextHeader = `[Source: ${sourceName} | ${contentType} | ${summary.slice(0, 150)}]`;
        const headerTokens = estimateTokens(contextHeader);
        for (let i = 1; i < chunks.length; i++) {
            chunks[i].content = contextHeader + '\n\n' + chunks[i].content;
            chunks[i].tokenEstimate += headerTokens;
        }
    }
    const meta = {
        id,
        timestamp: Date.now(),
        sourceType: options.sourceType,
        sourceTool: options.sourceTool,
        originalSize: content.length,
        tokenEstimate: estimateTokens(content),
        contentType,
        summary,
        chunkCount: chunks.length,
        contentHash,
        metadata: options.metadata ?? {},
    };
    // Write chunks to files before the index entry, so a crash mid-store
    // leaves orphan chunk files rather than an index entry with no chunks
    for (const chunk of chunks) {
        const chunkPath = path.join(this.chunksDir, `${id}_${chunk.index}.txt`);
        await fs.writeFile(chunkPath, chunk.content, 'utf-8');
    }
    // Append metadata to index
    const indexLine = JSON.stringify(meta) + '\n';
    await fs.appendFile(this.indexPath, indexLine, 'utf-8');
    return meta;
}
|
|
447
|
+
/**
 * Simple content type detection based on heuristics.
 * Checked in priority order: valid JSON → code (≥2 code-pattern hits in the
 * first 2000 chars) → logs (≥2 log-pattern hits) → prose (long lines plus
 * sentence punctuation) → 'mixed' fallback.
 * @param {string} content - Content to classify.
 * @returns {string} One of 'json' | 'code' | 'logs' | 'prose' | 'mixed'.
 */
detectContentType(content) {
    const lines = content.split('\n');
    // Only the first 2000 chars are sampled for pattern matching
    const sample = content.slice(0, 2000);
    // Check for JSON (full parse — the whole content must be valid)
    if (sample.trim().startsWith('{') || sample.trim().startsWith('[')) {
        try {
            JSON.parse(content);
            return 'json';
        }
        catch {
            // Not valid JSON
        }
    }
    // Check for code patterns
    const codePatterns = [
        /^import\s+/m,
        /^export\s+/m,
        /^(?:const|let|var)\s+\w+\s*=/m,
        /^(?:function|def|fn|func)\s+\w+/m,
        /^(?:class|interface|struct)\s+\w+/m,
        /^(?:public|private|protected)\s+/m,
        /^\s*(?:if|for|while|switch)\s*\(/m,
    ];
    const codeScore = codePatterns.filter(p => p.test(sample)).length;
    if (codeScore >= 2) {
        return 'code';
    }
    // Check for log patterns
    const logPatterns = [
        /^\d{4}-\d{2}-\d{2}/m, // Date stamps
        /^\[\w+\]/m, // [INFO], [ERROR], etc.
        /^(?:DEBUG|INFO|WARN|ERROR|FATAL):/m,
        /^\d+:\d+:\d+/m, // Time stamps
    ];
    const logScore = logPatterns.filter(p => p.test(sample)).length;
    if (logScore >= 2) {
        return 'logs';
    }
    // Check for prose (sentences, paragraphs)
    const avgLineLength = content.length / Math.max(lines.length, 1);
    const hasLongLines = avgLineLength > 60;
    const hasPunctuation = /[.!?]\s+[A-Z]/.test(sample);
    if (hasLongLines && hasPunctuation) {
        return 'prose';
    }
    return 'mixed';
}
|
|
497
|
+
/**
|
|
498
|
+
* Find content by hash (for deduplication).
|
|
499
|
+
*/
|
|
500
|
+
async findByHash(hash) {
|
|
501
|
+
const index = await this.loadIndex();
|
|
502
|
+
return index.find(meta => meta.contentHash === hash) ?? null;
|
|
503
|
+
}
|
|
504
|
+
/**
|
|
505
|
+
* Load the content index.
|
|
506
|
+
*/
|
|
507
|
+
async loadIndex() {
|
|
508
|
+
try {
|
|
509
|
+
const content = await fs.readFile(this.indexPath, 'utf-8');
|
|
510
|
+
if (!content.trim()) {
|
|
511
|
+
return [];
|
|
512
|
+
}
|
|
513
|
+
return content
|
|
514
|
+
.trim()
|
|
515
|
+
.split('\n')
|
|
516
|
+
.filter(line => line.trim())
|
|
517
|
+
.map(line => JSON.parse(line));
|
|
518
|
+
}
|
|
519
|
+
catch (error) {
|
|
520
|
+
if (error.code === 'ENOENT') {
|
|
521
|
+
return [];
|
|
522
|
+
}
|
|
523
|
+
throw error;
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
/**
 * Retrieve content by ID.
 * @param id - Content ID
 * @param chunkIndex - Optional specific chunk index (0-based; out-of-range
 *   returns null)
 * @returns Full content or specific chunk; null when the ID is unknown or
 *   the requested chunk file is unreadable
 *
 * NOTE(review): when reassembling all chunks, chunks 1..N include the
 * "[Source: …]" context header prepended by store(), and missing chunk
 * files are silently skipped — so the joined result is not guaranteed to
 * be byte-identical to the originally stored content.
 */
async retrieve(id, chunkIndex) {
    const index = await this.loadIndex();
    const meta = index.find(m => m.id === id);
    if (!meta) {
        return null;
    }
    if (chunkIndex !== undefined) {
        if (chunkIndex < 0 || chunkIndex >= meta.chunkCount) {
            return null;
        }
        const chunkPath = path.join(this.chunksDir, `${id}_${chunkIndex}.txt`);
        try {
            return await fs.readFile(chunkPath, 'utf-8');
        }
        catch {
            // Chunk file missing/unreadable — treat as not found
            return null;
        }
    }
    // Retrieve all chunks
    const chunks = [];
    for (let i = 0; i < meta.chunkCount; i++) {
        const chunkPath = path.join(this.chunksDir, `${id}_${i}.txt`);
        try {
            const chunk = await fs.readFile(chunkPath, 'utf-8');
            chunks.push(chunk);
        }
        catch {
            // Skip missing chunks
        }
    }
    return chunks.join('\n');
}
|
|
564
|
+
/**
|
|
565
|
+
* Get metadata for stored content.
|
|
566
|
+
*/
|
|
567
|
+
async getMeta(id) {
|
|
568
|
+
const index = await this.loadIndex();
|
|
569
|
+
return index.find(m => m.id === id) ?? null;
|
|
570
|
+
}
|
|
571
|
+
/**
|
|
572
|
+
* Get previews of all chunks for a stored content item.
|
|
573
|
+
* Returns a brief preview of each chunk for navigation (table of contents).
|
|
574
|
+
*/
|
|
575
|
+
async getChunkPreviews(id) {
|
|
576
|
+
const meta = await this.getMeta(id);
|
|
577
|
+
if (!meta)
|
|
578
|
+
return null;
|
|
579
|
+
const previews = [];
|
|
580
|
+
for (let i = 0; i < meta.chunkCount; i++) {
|
|
581
|
+
const chunkPath = path.join(this.chunksDir, `${id}_${i}.txt`);
|
|
582
|
+
try {
|
|
583
|
+
const content = await fs.readFile(chunkPath, 'utf-8');
|
|
584
|
+
const lines = content.split('\n');
|
|
585
|
+
// Skip context propagation header (starts with [Source:)
|
|
586
|
+
let previewLine = '';
|
|
587
|
+
for (const line of lines) {
|
|
588
|
+
const trimmed = line.trim();
|
|
589
|
+
if (trimmed === '' || trimmed.startsWith('[Source:'))
|
|
590
|
+
continue;
|
|
591
|
+
previewLine = trimmed;
|
|
592
|
+
break;
|
|
593
|
+
}
|
|
594
|
+
// Truncate to 120 chars
|
|
595
|
+
const preview = previewLine.length > 120 ? previewLine.slice(0, 120) + '...' : previewLine;
|
|
596
|
+
const tokens = estimateTokens(content);
|
|
597
|
+
previews.push({ index: i, tokens, preview });
|
|
598
|
+
}
|
|
599
|
+
catch {
|
|
600
|
+
previews.push({ index: i, tokens: 0, preview: '(chunk not found)' });
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
return previews;
|
|
604
|
+
}
|
|
605
|
+
/**
|
|
606
|
+
* List all stored content.
|
|
607
|
+
*/
|
|
608
|
+
async list(options = {}) {
|
|
609
|
+
let index = await this.loadIndex();
|
|
610
|
+
if (options.sourceTypes && options.sourceTypes.length > 0) {
|
|
611
|
+
index = index.filter(m => options.sourceTypes.includes(m.sourceType));
|
|
612
|
+
}
|
|
613
|
+
// Sort by timestamp descending (most recent first)
|
|
614
|
+
index.sort((a, b) => b.timestamp - a.timestamp);
|
|
615
|
+
if (options.limit) {
|
|
616
|
+
index = index.slice(0, options.limit);
|
|
617
|
+
}
|
|
618
|
+
return index;
|
|
619
|
+
}
|
|
620
|
+
/**
 * Delete stored content by ID.
 * Removes from index, chunks, and embeddings — in that order: chunk files
 * first (missing files are ignored), then the index entry is persisted,
 * then embeddings are purged best-effort.
 * @returns true if content was found and deleted, false if not found
 */
async delete(id) {
    const index = await this.loadIndex();
    const metaIndex = index.findIndex(m => m.id === id);
    if (metaIndex === -1) {
        return false;
    }
    const meta = index[metaIndex];
    // Remove chunk files
    for (let i = 0; i < meta.chunkCount; i++) {
        const chunkPath = path.join(this.chunksDir, `${id}_${i}.txt`);
        try {
            await fs.unlink(chunkPath);
        }
        catch {
            // Ignore if chunk file doesn't exist
        }
    }
    // Remove from index (in-memory splice, then rewrite the whole file)
    index.splice(metaIndex, 1);
    await this.saveIndex(index);
    // Remove from embeddings
    await this.deleteEmbeddings(id);
    return true;
}
|
|
649
|
+
/**
|
|
650
|
+
* Delete embeddings for a content ID.
|
|
651
|
+
*/
|
|
652
|
+
async deleteEmbeddings(contentId) {
|
|
653
|
+
const embeddingsPath = this.getEmbeddingsPath();
|
|
654
|
+
try {
|
|
655
|
+
const content = await fs.readFile(embeddingsPath, 'utf-8');
|
|
656
|
+
const data = JSON.parse(content);
|
|
657
|
+
// Filter out entries for this content ID
|
|
658
|
+
data.entries = data.entries.filter(e => e.contentId !== contentId);
|
|
659
|
+
await fs.writeFile(embeddingsPath, JSON.stringify(data), 'utf-8');
|
|
660
|
+
}
|
|
661
|
+
catch {
|
|
662
|
+
// No embeddings file or parse error - nothing to delete
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
/**
|
|
666
|
+
* Save the index file.
|
|
667
|
+
*/
|
|
668
|
+
async saveIndex(index) {
|
|
669
|
+
await this.ensureDirectories();
|
|
670
|
+
const content = index.map(m => JSON.stringify(m)).join('\n');
|
|
671
|
+
await fs.writeFile(this.indexPath, content + (content ? '\n' : ''), 'utf-8');
|
|
672
|
+
}
|
|
673
|
+
/**
|
|
674
|
+
* Get path to embeddings file.
|
|
675
|
+
*/
|
|
676
|
+
getEmbeddingsPath() {
|
|
677
|
+
return path.join(this.basePath, 'embeddings.json');
|
|
678
|
+
}
|
|
679
|
+
/**
|
|
680
|
+
* Get embeddings for all chunks of a specific content item.
|
|
681
|
+
* Generates missing embeddings first if needed.
|
|
682
|
+
* Returns a Map keyed by chunk index.
|
|
683
|
+
*/
|
|
684
|
+
async getEmbeddingsForContent(contentId) {
|
|
685
|
+
const meta = await this.getMeta(contentId);
|
|
686
|
+
if (!meta)
|
|
687
|
+
return null;
|
|
688
|
+
await this.generateMissingEmbeddings();
|
|
689
|
+
const allEmbeddings = await this.loadContentEmbeddings();
|
|
690
|
+
const result = new Map();
|
|
691
|
+
for (let i = 0; i < meta.chunkCount; i++) {
|
|
692
|
+
const embedding = allEmbeddings.get(`${contentId}:${i}`);
|
|
693
|
+
if (embedding) {
|
|
694
|
+
result.set(i, embedding);
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
return result;
|
|
698
|
+
}
|
|
699
|
+
/**
|
|
700
|
+
* Load content embeddings.
|
|
701
|
+
*/
|
|
702
|
+
async loadContentEmbeddings() {
|
|
703
|
+
const embeddingsPath = this.getEmbeddingsPath();
|
|
704
|
+
try {
|
|
705
|
+
const content = await fs.readFile(embeddingsPath, 'utf-8');
|
|
706
|
+
const data = JSON.parse(content);
|
|
707
|
+
const map = new Map();
|
|
708
|
+
for (const entry of data.entries) {
|
|
709
|
+
// Key is contentId:chunkIndex
|
|
710
|
+
map.set(`${entry.contentId}:${entry.chunkIndex}`, entry.embedding);
|
|
711
|
+
}
|
|
712
|
+
return map;
|
|
713
|
+
}
|
|
714
|
+
catch (error) {
|
|
715
|
+
if (error.code === 'ENOENT') {
|
|
716
|
+
return new Map();
|
|
717
|
+
}
|
|
718
|
+
throw error;
|
|
719
|
+
}
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Save content embeddings.
|
|
723
|
+
*/
|
|
724
|
+
async saveContentEmbeddings(entries) {
|
|
725
|
+
const embeddingsPath = this.getEmbeddingsPath();
|
|
726
|
+
const provider = getEmbeddingProvider();
|
|
727
|
+
let existingData;
|
|
728
|
+
try {
|
|
729
|
+
const content = await fs.readFile(embeddingsPath, 'utf-8');
|
|
730
|
+
existingData = JSON.parse(content);
|
|
731
|
+
}
|
|
732
|
+
catch {
|
|
733
|
+
existingData = {
|
|
734
|
+
version: 1,
|
|
735
|
+
model: provider.name,
|
|
736
|
+
dimensions: provider.dimensions,
|
|
737
|
+
entries: [],
|
|
738
|
+
};
|
|
739
|
+
}
|
|
740
|
+
existingData.entries.push(...entries);
|
|
741
|
+
await fs.mkdir(this.basePath, { recursive: true });
|
|
742
|
+
await fs.writeFile(embeddingsPath, JSON.stringify(existingData), 'utf-8');
|
|
743
|
+
}
|
|
744
|
+
/**
|
|
745
|
+
* Generate embeddings for content that doesn't have them yet.
|
|
746
|
+
*/
|
|
747
|
+
async generateMissingEmbeddings() {
|
|
748
|
+
if (!(await embeddingsAvailable())) {
|
|
749
|
+
return 0;
|
|
750
|
+
}
|
|
751
|
+
const index = await this.loadIndex();
|
|
752
|
+
const existingEmbeddings = await this.loadContentEmbeddings();
|
|
753
|
+
const provider = getEmbeddingProvider();
|
|
754
|
+
const newEntries = [];
|
|
755
|
+
for (const meta of index) {
|
|
756
|
+
for (let i = 0; i < meta.chunkCount; i++) {
|
|
757
|
+
const key = `${meta.id}:${i}`;
|
|
758
|
+
if (existingEmbeddings.has(key)) {
|
|
759
|
+
continue;
|
|
760
|
+
}
|
|
761
|
+
const chunkContent = await this.retrieve(meta.id, i);
|
|
762
|
+
if (!chunkContent)
|
|
763
|
+
continue;
|
|
764
|
+
const [embedding] = await provider.embed([chunkContent]);
|
|
765
|
+
if (embedding) {
|
|
766
|
+
newEntries.push({
|
|
767
|
+
contentId: meta.id,
|
|
768
|
+
chunkIndex: i,
|
|
769
|
+
embedding,
|
|
770
|
+
timestamp: new Date().toISOString(),
|
|
771
|
+
});
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
if (newEntries.length > 0) {
|
|
776
|
+
await this.saveContentEmbeddings(newEntries);
|
|
777
|
+
}
|
|
778
|
+
return newEntries.length;
|
|
779
|
+
}
|
|
780
|
+
/**
|
|
781
|
+
* Search stored content with keyword, semantic, or hybrid search.
|
|
782
|
+
*/
|
|
783
|
+
async search(query, options = {}) {
|
|
784
|
+
const index = await this.loadIndex();
|
|
785
|
+
const queryLower = query.toLowerCase();
|
|
786
|
+
// Use pre-expanded terms if provided, otherwise split query into terms
|
|
787
|
+
const queryTerms = options.expandedTerms ?? queryLower.split(/\s+/).filter(t => t.length > 2);
|
|
788
|
+
const limit = options.limit ?? 10;
|
|
789
|
+
const mode = options.mode ?? 'hybrid';
|
|
790
|
+
// Keyword scores
|
|
791
|
+
const keywordResults = new Map();
|
|
792
|
+
for (const meta of index) {
|
|
793
|
+
// Apply filters
|
|
794
|
+
if (options.sourceTypes && !options.sourceTypes.includes(meta.sourceType)) {
|
|
795
|
+
continue;
|
|
796
|
+
}
|
|
797
|
+
if (options.contentTypes && !options.contentTypes.includes(meta.contentType)) {
|
|
798
|
+
continue;
|
|
799
|
+
}
|
|
800
|
+
const summaryLower = meta.summary.toLowerCase();
|
|
801
|
+
let summaryScore = 0;
|
|
802
|
+
for (const term of queryTerms) {
|
|
803
|
+
if (summaryLower.includes(term)) {
|
|
804
|
+
summaryScore++;
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
// Check summary for keyword match (any term matches)
|
|
808
|
+
if (summaryScore > 0) {
|
|
809
|
+
keywordResults.set(meta.id, {
|
|
810
|
+
meta,
|
|
811
|
+
score: Math.min(0.5 * (summaryScore / queryTerms.length), 0.5),
|
|
812
|
+
snippet: meta.summary.slice(0, SNIPPET_LENGTH),
|
|
813
|
+
});
|
|
814
|
+
continue;
|
|
815
|
+
}
|
|
816
|
+
// Check content chunks for keyword match
|
|
817
|
+
const content = await this.retrieve(meta.id);
|
|
818
|
+
if (content) {
|
|
819
|
+
const contentLower = content.toLowerCase();
|
|
820
|
+
let contentScore = 0;
|
|
821
|
+
let bestMatchIndex = -1;
|
|
822
|
+
for (const term of queryTerms) {
|
|
823
|
+
if (contentLower.includes(term)) {
|
|
824
|
+
contentScore++;
|
|
825
|
+
// Track first significant term match for snippet
|
|
826
|
+
if (bestMatchIndex < 0 && term.length > 3) {
|
|
827
|
+
bestMatchIndex = contentLower.indexOf(term);
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
if (contentScore > 0) {
|
|
832
|
+
// Generate snippet around the best match
|
|
833
|
+
const matchIndex = bestMatchIndex >= 0 ? bestMatchIndex : 0;
|
|
834
|
+
const start = Math.max(0, matchIndex - 50);
|
|
835
|
+
const end = Math.min(content.length, matchIndex + 250);
|
|
836
|
+
const snippet = (start > 0 ? '...' : '') +
|
|
837
|
+
content.slice(start, end) +
|
|
838
|
+
(end < content.length ? '...' : '');
|
|
839
|
+
keywordResults.set(meta.id, {
|
|
840
|
+
meta,
|
|
841
|
+
score: Math.min(0.8 * (contentScore / queryTerms.length), 0.8),
|
|
842
|
+
snippet,
|
|
843
|
+
});
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
// If keyword-only mode, return keyword results
|
|
848
|
+
if (mode === 'keyword') {
|
|
849
|
+
const results = Array.from(keywordResults.values());
|
|
850
|
+
results.sort((a, b) => b.score - a.score);
|
|
851
|
+
return results.slice(0, limit);
|
|
852
|
+
}
|
|
853
|
+
// Semantic search
|
|
854
|
+
const semanticResults = new Map();
|
|
855
|
+
if (await embeddingsAvailable()) {
|
|
856
|
+
try {
|
|
857
|
+
// Generate embeddings for any content that doesn't have them
|
|
858
|
+
await this.generateMissingEmbeddings();
|
|
859
|
+
const embeddings = await this.loadContentEmbeddings();
|
|
860
|
+
const provider = getEmbeddingProvider();
|
|
861
|
+
const [queryEmbedding] = await provider.embed([query]);
|
|
862
|
+
if (queryEmbedding) {
|
|
863
|
+
for (const meta of index) {
|
|
864
|
+
// Apply filters
|
|
865
|
+
if (options.sourceTypes && !options.sourceTypes.includes(meta.sourceType)) {
|
|
866
|
+
continue;
|
|
867
|
+
}
|
|
868
|
+
if (options.contentTypes && !options.contentTypes.includes(meta.contentType)) {
|
|
869
|
+
continue;
|
|
870
|
+
}
|
|
871
|
+
// Find best matching chunk
|
|
872
|
+
let bestScore = 0;
|
|
873
|
+
let bestChunkIndex = 0;
|
|
874
|
+
for (let i = 0; i < meta.chunkCount; i++) {
|
|
875
|
+
const embedding = embeddings.get(`${meta.id}:${i}`);
|
|
876
|
+
if (!embedding)
|
|
877
|
+
continue;
|
|
878
|
+
const similarity = cosineSimilarity(queryEmbedding, embedding);
|
|
879
|
+
if (similarity > bestScore) {
|
|
880
|
+
bestScore = similarity;
|
|
881
|
+
bestChunkIndex = i;
|
|
882
|
+
}
|
|
883
|
+
}
|
|
884
|
+
if (bestScore > SEMANTIC_THRESHOLD) {
|
|
885
|
+
const chunkContent = await this.retrieve(meta.id, bestChunkIndex);
|
|
886
|
+
const snippet = chunkContent?.slice(0, SNIPPET_LENGTH) + '...' || meta.summary;
|
|
887
|
+
semanticResults.set(meta.id, {
|
|
888
|
+
meta,
|
|
889
|
+
score: bestScore,
|
|
890
|
+
snippet,
|
|
891
|
+
});
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
catch {
|
|
897
|
+
// Fall back to keyword-only if semantic search fails
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
// If semantic-only mode, return semantic results
|
|
901
|
+
if (mode === 'semantic') {
|
|
902
|
+
const results = Array.from(semanticResults.values());
|
|
903
|
+
results.sort((a, b) => b.score - a.score);
|
|
904
|
+
return results.slice(0, limit);
|
|
905
|
+
}
|
|
906
|
+
// Hybrid mode: combine scores
|
|
907
|
+
const combinedResults = new Map();
|
|
908
|
+
for (const [id, result] of keywordResults) {
|
|
909
|
+
combinedResults.set(id, result);
|
|
910
|
+
}
|
|
911
|
+
for (const [id, result] of semanticResults) {
|
|
912
|
+
const existing = combinedResults.get(id);
|
|
913
|
+
if (existing) {
|
|
914
|
+
// Combine scores: weight keyword 0.3, semantic 0.7, boost if both match
|
|
915
|
+
const combinedScore = (existing.score * 0.3 + result.score * 0.7) * 1.2;
|
|
916
|
+
combinedResults.set(id, {
|
|
917
|
+
meta: result.meta,
|
|
918
|
+
score: Math.min(combinedScore, 1),
|
|
919
|
+
snippet: existing.snippet, // Keep keyword snippet as it's more targeted
|
|
920
|
+
});
|
|
921
|
+
}
|
|
922
|
+
else {
|
|
923
|
+
combinedResults.set(id, {
|
|
924
|
+
meta: result.meta,
|
|
925
|
+
score: result.score * 0.7, // Semantic-only gets weighted down
|
|
926
|
+
snippet: result.snippet,
|
|
927
|
+
});
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
const results = Array.from(combinedResults.values());
|
|
931
|
+
results.sort((a, b) => b.score - a.score);
|
|
932
|
+
return results.slice(0, limit);
|
|
933
|
+
}
|
|
934
|
+
/**
|
|
935
|
+
* Get statistics about stored content.
|
|
936
|
+
*/
|
|
937
|
+
async getStats() {
|
|
938
|
+
const index = await this.loadIndex();
|
|
939
|
+
const stats = {
|
|
940
|
+
totalItems: index.length,
|
|
941
|
+
totalSize: 0,
|
|
942
|
+
totalTokens: 0,
|
|
943
|
+
bySourceType: {},
|
|
944
|
+
byContentType: {},
|
|
945
|
+
};
|
|
946
|
+
for (const meta of index) {
|
|
947
|
+
stats.totalSize += meta.originalSize;
|
|
948
|
+
stats.totalTokens += meta.tokenEstimate;
|
|
949
|
+
stats.bySourceType[meta.sourceType] = (stats.bySourceType[meta.sourceType] ?? 0) + 1;
|
|
950
|
+
stats.byContentType[meta.contentType] = (stats.byContentType[meta.contentType] ?? 0) + 1;
|
|
951
|
+
}
|
|
952
|
+
return stats;
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
//# sourceMappingURL=content-store.js.map
|