openprompt-lang 1.2.7 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -8
- package/bin/cli.js +2 -0
- package/docs/00-ARCHITECTURE/OPL-BOOST-MULTI-AGENT.md +406 -0
- package/docs/02-STANDARDS/AGENTS.template.md +89 -0
- package/docs/02-STANDARDS/ticket-driven-development.md +99 -0
- package/docs/04-TICKETS/BOOST-001-profile-registry.md +66 -0
- package/docs/04-TICKETS/BOOST-002-context-compression.md +58 -0
- package/docs/04-TICKETS/BOOST-003-template-hydration.md +69 -0
- package/docs/04-TICKETS/BOOST-004-fewshot-engine.md +58 -0
- package/docs/04-TICKETS/BOOST-005-agent-pool.md +69 -0
- package/docs/04-TICKETS/BOOST-006-specialized-agents.md +53 -0
- package/docs/04-TICKETS/BOOST-007-validation-loop.md +56 -0
- package/docs/04-TICKETS/BOOST-008-orchestrator.md +71 -0
- package/docs/04-TICKETS/BOOST-009-cache-system.md +56 -0
- package/docs/04-TICKETS/BOOST-010-cli-mcp.md +67 -0
- package/docs/04-TICKETS/BOOST-011-self-learning.md +50 -0
- package/docs/04-TICKETS/BOOST-012-prompt-preamble.md +109 -0
- package/docs/04-TICKETS/BOOST-013-hydrator-duplicate-code.md +132 -0
- package/docs/04-TICKETS/BOOST-014-multiagent-missing-parts.md +87 -0
- package/docs/04-TICKETS/BOOST-015-skeleton-type-missing.md +76 -0
- package/docs/04-TICKETS/BOOST-016-output-path-duplicate.md +68 -0
- package/docs/04-TICKETS/INDEX.md +89 -0
- package/docs/04-TICKETS/_archive/BOOST-005-micro-tasking.md +67 -0
- package/docs/04-TICKETS/_archive/BOOST-006-validation-loop.md +66 -0
- package/docs/04-TICKETS/_archive/BOOST-007-progressive-pipeline.md +69 -0
- package/docs/04-TICKETS/_archive/BOOST-008-cli-mcp-integration.md +74 -0
- package/docs/AI_CONTEXT.md +16 -0
- package/docs/EMBEDDINGS.md +214 -0
- package/docs/ONBOARDING_WORKFLOW.md +151 -0
- package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
- package/docs/WEB_SCRAPER_PLAN.md +454 -0
- package/package.json +9 -2
- package/scripts/postinstall.js +37 -0
- package/src/boost/agent-pool.js +442 -0
- package/src/boost/agents/index.js +79 -0
- package/src/boost/cache.js +241 -0
- package/src/boost/context-compressor.js +354 -0
- package/src/boost/fewshot-retriever.js +332 -0
- package/src/boost/hardware-detector.js +486 -0
- package/src/boost/hydrator.js +398 -0
- package/src/boost/index.js +60 -0
- package/src/boost/orchestrator.js +615 -0
- package/src/boost/preamble.js +217 -0
- package/src/boost/profile-registry.js +264 -0
- package/src/boost/self-learn.js +247 -0
- package/src/boost/skeletons/component.skeleton.js +24 -0
- package/src/boost/skeletons/hook.skeleton.js +27 -0
- package/src/boost/skeletons/index.js +67 -0
- package/src/boost/skeletons/page.skeleton.js +22 -0
- package/src/boost/skeletons/service.skeleton.js +20 -0
- package/src/boost/skeletons/store.skeleton.js +18 -0
- package/src/boost/skeletons/type.skeleton.js +11 -0
- package/src/boost/task-dispatcher.js +142 -0
- package/src/boost/validation-loop.js +495 -0
- package/src/cli/commands-boost.js +394 -0
- package/src/cli/commands-knowledge.js +1 -0
- package/src/cli/commands-opl.js +79 -1
- package/src/cli/commands-workflow.js +125 -6
- package/src/commands/init-core.js +169 -5
- package/src/commands/knowledge-ops.js +52 -0
- package/src/commands/opl-embeddings.js +556 -0
- package/src/commands/opl-help.js +26 -2
- package/src/commands/opl-search.js +106 -2
- package/src/commands/opl-webscrape.js +390 -0
- package/src/commands/workflow/epic-cli.js +192 -0
- package/src/commands/workflow/select.js +146 -0
- package/src/commands/workflow/sprint-cli.js +174 -0
- package/src/core/webscrape/analyzer.js +481 -0
- package/src/core/webscrape/deep-scraper.js +1027 -0
- package/src/core/workflow/epic-manager.js +845 -0
- package/src/core/workflow/gates.js +180 -1
- package/src/core/workflow/selector.js +707 -0
- package/src/embeddings/chunker.js +450 -0
- package/src/embeddings/embedder.js +431 -0
- package/src/embeddings/index-pipeline.js +320 -0
- package/src/embeddings/vector-store.js +505 -0
- package/src/mcp-refactor/handlers/boost.js +295 -0
- package/src/mcp-refactor/router.js +19 -0
- package/src/mcp-refactor/tools.js +113 -0
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
// @use(kind, contract, limit, deps, pattern)
|
|
2
|
+
// @kind(module)
|
|
3
|
+
// @contract(in: ChunkWithVector -> out: void, sideEffect: SQLite writes)
|
|
4
|
+
// @limit(lines: 380)
|
|
5
|
+
// @deps(better-sqlite3, ../persistence/sqlite/connection, ../persistence/sqlite/schema)
|
|
6
|
+
// @pattern(repository)
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Almacenamiento vectorial en SQLite con búsqueda por similitud de coseno.
|
|
10
|
+
*
|
|
11
|
+
* Para ~5000 chunks, la búsqueda lineal por coseno es suficiente.
|
|
12
|
+
* Si el volumen supera 100k chunks, considerar índice HNSW.
|
|
13
|
+
*
|
|
14
|
+
* Serialización de vectores:
|
|
15
|
+
* - Almacenamiento: Float32Array → Buffer (4 bytes por float)
|
|
16
|
+
* - Recuperación: Buffer → Float32Array
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { open } from "../persistence/sqlite/connection.js"
|
|
20
|
+
import { ensureSchema } from "../persistence/sqlite/schema.js"
|
|
21
|
+
|
|
22
|
+
// ─── Constantes ────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
// Database path used when the caller supplies no (or a falsy) `dbPath`.
const DEFAULT_DB_PATH = "./.opencode/opl.db"
// Number of results returned when `options.topK` is missing or falsy.
const DEFAULT_TOP_K = 10
// Minimum cosine score applied when `options.minScore` is null/undefined.
const DEFAULT_MIN_SCORE = 0.0
|
|
27
|
+
|
|
28
|
+
// ─── Schema ────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
const VECTOR_STORE_SQL = `
|
|
31
|
+
CREATE TABLE IF NOT EXISTS embeddings (
|
|
32
|
+
id TEXT PRIMARY KEY,
|
|
33
|
+
doc_id TEXT NOT NULL,
|
|
34
|
+
doc_title TEXT NOT NULL DEFAULT '',
|
|
35
|
+
chapter_idx INTEGER NOT NULL DEFAULT 0,
|
|
36
|
+
chapter_title TEXT NOT NULL DEFAULT '',
|
|
37
|
+
chunk_idx INTEGER NOT NULL DEFAULT 0,
|
|
38
|
+
content TEXT NOT NULL,
|
|
39
|
+
vector BLOB NOT NULL,
|
|
40
|
+
dimension INTEGER NOT NULL DEFAULT 768,
|
|
41
|
+
tokens INTEGER NOT NULL DEFAULT 0,
|
|
42
|
+
model TEXT NOT NULL DEFAULT '',
|
|
43
|
+
strategy TEXT NOT NULL DEFAULT 'section',
|
|
44
|
+
metadata TEXT NOT NULL DEFAULT '{}',
|
|
45
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
46
|
+
);
|
|
47
|
+
|
|
48
|
+
CREATE INDEX IF NOT EXISTS idx_embeddings_doc_id ON embeddings(doc_id);
|
|
49
|
+
CREATE INDEX IF NOT EXISTS idx_embeddings_created ON embeddings(created_at);
|
|
50
|
+
|
|
51
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS embeddings_fts USING fts5(
|
|
52
|
+
content,
|
|
53
|
+
content=embeddings,
|
|
54
|
+
content_rowid=rowid
|
|
55
|
+
);
|
|
56
|
+
`
|
|
57
|
+
|
|
58
|
+
// ─── Serialización de vectores ─────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/**
 * Serializes an embedding vector into a Buffer suitable for a SQLite BLOB.
 * Every component is narrowed to a 32-bit float (4 bytes each).
 *
 * @param {number[]} vector - Embedding components.
 * @returns {Buffer} Buffer of `vector.length * 4` bytes.
 */
function vectorToBuffer(vector) {
  // The freshly created typed array owns its whole ArrayBuffer, so wrapping
  // `.buffer` directly exposes exactly the encoded floats.
  return Buffer.from(new Float32Array(vector).buffer)
}
|
|
71
|
+
|
|
72
|
+
/**
 * Decodes a Buffer of packed 32-bit floats (as produced by vectorToBuffer)
 * back into a plain array of numbers.
 *
 * Trailing bytes that do not form a complete float are ignored.
 *
 * @param {Buffer} buffer - Raw BLOB contents.
 * @returns {number[]} Decoded vector components.
 */
function bufferToVector(buffer) {
  const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT
  const count = Math.floor(buffer.byteLength / bytesPerFloat)

  // A Float32Array view requires a byteOffset that is a multiple of 4. Buffers
  // sliced from a larger allocation can start at any offset, which would make
  // the zero-copy view throw a RangeError.
  if (buffer.byteOffset % bytesPerFloat === 0) {
    return Array.from(new Float32Array(buffer.buffer, buffer.byteOffset, count))
  }

  // Unaligned: copy the payload into a fresh (aligned) allocation first.
  const aligned = new Uint8Array(count * bytesPerFloat)
  aligned.set(buffer.subarray(0, count * bytesPerFloat))
  return Array.from(new Float32Array(aligned.buffer))
}
|
|
82
|
+
|
|
83
|
+
// ─── Similitud de coseno (re-exportada aquí para conveniencia) ──────────
|
|
84
|
+
|
|
85
|
+
/**
 * Computes the cosine similarity of two equally sized vectors.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the vectors are empty or
 *   either one has zero magnitude.
 * @throws {Error} When the vectors have different lengths.
 */
export function cosineSimilarity(a, b) {
  const size = a.length
  if (size !== b.length) {
    throw new Error(`Dimensión incorrecta: vector A tiene ${a.length}, vector B tiene ${b.length}.`)
  }
  if (size === 0) return 0

  let dot = 0
  let sumSqA = 0
  let sumSqB = 0
  for (let idx = 0; idx < size; idx += 1) {
    const x = a[idx]
    const y = b[idx]
    dot += x * y
    sumSqA += x * x
    sumSqB += y * y
  }

  const denominator = Math.sqrt(sumSqA) * Math.sqrt(sumSqB)
  // A zero-magnitude vector has no direction; report "no similarity".
  return denominator === 0 ? 0 : dot / denominator
}
|
|
112
|
+
|
|
113
|
+
// ─── Helpers de BD ─────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
/**
 * Opens the vector database and makes sure its schema exists.
 *
 * Runs the shared `ensureSchema` step and then this module's own DDL
 * (`VECTOR_STORE_SQL`) on the opened connection.
 *
 * @param {string} dbPath - Database path; falls back to DEFAULT_DB_PATH when falsy.
 * @returns {{ db: import('better-sqlite3').Database }}
 */
function getDb(dbPath) {
  const target = dbPath || DEFAULT_DB_PATH
  const connection = open(target)
  ensureSchema(connection.db)
  connection.db.exec(VECTOR_STORE_SQL)
  return { db: connection.db }
}
|
|
127
|
+
|
|
128
|
+
/**
 * Produces the JSON text stored in the `metadata` column for a chunk.
 *
 * @param {Object} chunk - Chunk whose `metadata` property is serialized.
 * @returns {string} JSON of `chunk.metadata`, or "{}" when it is missing/falsy.
 */
function serializeMetadata(chunk) {
  const { metadata } = chunk
  return JSON.stringify(metadata ? metadata : {})
}
|
|
137
|
+
|
|
138
|
+
// ─── API Pública ───────────────────────────────────────────────────
|
|
139
|
+
|
|
140
|
+
/**
 * Stores (or replaces) one chunk together with its embedding vector.
 *
 * The row is written with INSERT OR REPLACE keyed on `chunk.id`, and the
 * external-content FTS5 index is updated in the same transaction.
 *
 * @param {Object} chunk - Document chunk; must carry a truthy `id`.
 * @param {number[]} vector - Embedding vector (non-empty).
 * @param {string} model - Name of the model that produced the embedding.
 * @param {Object} [options]
 * @param {string} [options.dbPath] - Database path override.
 * @returns {{ id: string, success: boolean }}
 * @throws {Error} When `chunk` has no id or `vector` is empty.
 */
export function storeEmbedding(chunk, vector, model, options = {}) {
  // Validate first so invalid input never opens a database connection.
  if (!chunk || !chunk.id) {
    throw new Error("Chunk inválido: se requiere un objeto con id")
  }
  if (!vector || vector.length === 0) {
    throw new Error("Vector inválido: se requiere un array no vacío")
  }

  const { db } = getDb(options.dbPath)
  const vectorBuffer = vectorToBuffer(vector)
  const metadata = serializeMetadata(chunk)
  const content = chunk.content || ""

  const insert = db.transaction(() => {
    // INSERT OR REPLACE deletes the old row and inserts a new one, which gets
    // a new rowid. Remember the old rowid/content so its FTS entry can be
    // removed; otherwise the index keeps postings for a row that is gone.
    const previous = db
      .prepare(`SELECT rowid, content FROM embeddings WHERE id = ?`)
      .get(chunk.id)

    db.prepare(`
      INSERT OR REPLACE INTO embeddings
      (id, doc_id, doc_title, chapter_idx, chapter_title, chunk_idx,
      content, vector, dimension, tokens, model, strategy, metadata)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `).run(
      chunk.id,
      chunk.docId || "",
      chunk.docTitle || "",
      chunk.chapterIdx || 0,
      chunk.chapterTitle || "",
      chunk.chunkIndex || 0,
      content,
      vectorBuffer,
      vector.length,
      chunk.tokens || 0,
      model || "",
      chunk.strategy || "section",
      metadata
    )

    // Keep the FTS5 index in sync with the row just written.
    try {
      if (previous) {
        // FTS5 'delete' command: removes the index entry for the old row.
        db.prepare(
          `INSERT INTO embeddings_fts(embeddings_fts, rowid, content) VALUES('delete', ?, ?)`
        ).run(previous.rowid, previous.content)
      }

      const row = db.prepare(`SELECT rowid FROM embeddings WHERE id = ?`).get(chunk.id)
      if (row) {
        db.prepare(`INSERT INTO embeddings_fts(rowid, content) VALUES (?, ?)`).run(
          row.rowid,
          content
        )
      }
    } catch {
      // Incremental sync failed: rebuild the whole index from the content table.
      db.exec(`INSERT INTO embeddings_fts(embeddings_fts) VALUES('rebuild')`)
    }
  })

  insert()

  return { id: chunk.id, success: true }
}
|
|
205
|
+
|
|
206
|
+
/**
 * Stores many embeddings inside a single SQLite transaction.
 *
 * Rows are written with INSERT OR REPLACE and the FTS5 index is rebuilt once
 * at the end of the same transaction.
 *
 * @param {Array<{ chunk: Object, vector: number[], model: string }>} entries
 * @param {Object} [options]
 * @param {string} [options.dbPath] - Database path override.
 * @returns {{ count: number, durationMs: number }} Number of entries written
 *   and elapsed wall-clock time in milliseconds.
 * @throws {Error} When an entry lacks a chunk id or has an empty vector.
 */
export function storeEmbeddingsBatch(entries, options = {}) {
  // Nothing to do: return without opening a database connection.
  if (!Array.isArray(entries) || entries.length === 0) {
    return { count: 0, durationMs: 0 }
  }

  // Apply the same input rules as storeEmbedding, up front, so a bad entry is
  // reported clearly before anything is written.
  entries.forEach((entry, position) => {
    if (!entry || !entry.chunk || !entry.chunk.id) {
      throw new Error(`Entrada inválida en posición ${position}: se requiere un chunk con id`)
    }
    if (!entry.vector || entry.vector.length === 0) {
      throw new Error(`Entrada inválida en posición ${position}: se requiere un vector no vacío`)
    }
  })

  const { db } = getDb(options.dbPath)
  const start = Date.now()

  const stmt = db.prepare(`
    INSERT OR REPLACE INTO embeddings
    (id, doc_id, doc_title, chapter_idx, chapter_title, chunk_idx,
    content, vector, dimension, tokens, model, strategy, metadata)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `)

  const insertMany = db.transaction((items) => {
    for (const { chunk, vector, model } of items) {
      stmt.run(
        chunk.id,
        chunk.docId || "",
        chunk.docTitle || "",
        chunk.chapterIdx || 0,
        chunk.chapterTitle || "",
        chunk.chunkIndex || 0,
        chunk.content || "",
        vectorToBuffer(vector),
        vector.length,
        chunk.tokens || 0,
        model || "",
        chunk.strategy || "section",
        serializeMetadata(chunk)
      )
    }

    // Rebuild the FTS5 index in the same transaction so rows and index are
    // committed together. Best-effort: a rebuild failure must not discard
    // the stored embeddings.
    try {
      db.exec(`INSERT INTO embeddings_fts(embeddings_fts) VALUES('rebuild')`)
    } catch {
      // Full-text index unavailable; vector search still works.
    }
  })

  insertMany(entries)

  return { count: entries.length, durationMs: Date.now() - start }
}
|
|
265
|
+
|
|
266
|
+
/**
 * Returns the top-K stored chunks most similar to a query vector.
 *
 * Loads every candidate row (optionally restricted to one document), scores
 * each with cosine similarity, and returns the best matches. Stored vectors
 * whose length differs from the query (e.g. produced by another model) are
 * skipped instead of aborting the search.
 *
 * @param {number[]} queryVector - Query embedding.
 * @param {Object} [options]
 * @param {number} [options.topK=10] - Maximum number of results.
 * @param {number} [options.minScore=0.0] - Minimum score for a row to be included.
 * @param {string} [options.docFilter] - Restrict the search to this doc_id.
 * @param {string} [options.dbPath] - Database path override.
 * @returns {Array<{
 *   id: string, docId: string, docTitle: string,
 *   chapterIdx: number, chapterTitle: string,
 *   content: string, score: number,
 *   tokens: number, model: string, metadata: string
 * }>} Matches sorted by descending score; `metadata` is the stored JSON text.
 */
export function searchSimilar(queryVector, options = {}) {
  // No query: answer without opening a database connection.
  if (!queryVector || queryVector.length === 0) {
    return []
  }

  const { db } = getDb(options.dbPath)
  const topK = options.topK || DEFAULT_TOP_K
  const minScore = options.minScore ?? DEFAULT_MIN_SCORE
  const docFilter = options.docFilter || null

  let sql = `SELECT id, doc_id, doc_title, chapter_idx, chapter_title,
    content, vector, dimension, tokens, model, metadata
    FROM embeddings`
  const params = []

  if (docFilter) {
    sql += ` WHERE doc_id = ?`
    params.push(docFilter)
  }

  // Newest first, so equal scores keep the most recent row ahead (stable sort).
  sql += ` ORDER BY created_at DESC`

  const rows = db.prepare(sql).all(...params)

  const scored = []
  for (const row of rows) {
    const vector = bufferToVector(row.vector)

    // Vectors of a different dimension are not comparable with this query.
    if (vector.length !== queryVector.length) continue

    const score = cosineSimilarity(queryVector, vector)
    if (score >= minScore) {
      scored.push({
        id: row.id,
        docId: row.doc_id,
        docTitle: row.doc_title,
        chapterIdx: row.chapter_idx,
        chapterTitle: row.chapter_title,
        content: row.content,
        score,
        tokens: row.tokens,
        model: row.model,
        metadata: row.metadata,
      })
    }
  }

  scored.sort((a, b) => b.score - a.score)
  return scored.slice(0, topK)
}
|
|
338
|
+
|
|
339
|
+
/**
 * Builds an FTS5 MATCH expression from free text: every word longer than two
 * characters becomes a quoted prefix term, and the terms are OR-ed together.
 * Quoting keeps punctuation, embedded quotes and the words AND/OR/NOT from
 * being parsed as FTS5 query syntax.
 *
 * @param {string} queryText
 * @returns {string} MATCH expression, or "" when no usable word remains.
 */
function buildFtsPrefixQuery(queryText) {
  return queryText
    .split(/\s+/)
    .filter((word) => word.length > 2)
    .map((word) => `"${word.replace(/"/g, '""')}"*`)
    .join(" OR ")
}

/**
 * Keyword search (FTS5) followed by semantic re-ranking.
 *
 * 1. Find candidate rows through the FTS5 index (up to topK * 3).
 * 2. Score each candidate with cosine similarity against `queryVector`.
 * 3. Return the topK candidates by score.
 *
 * Falls back to pure cosine search (searchSimilar) when the text is empty,
 * yields no usable terms, matches nothing, or the FTS query fails.
 *
 * @param {string} queryText - Text for the FTS5 lookup.
 * @param {number[]} queryVector - Query embedding used for re-ranking.
 * @param {Object} [options]
 * @param {number} [options.topK=10] - Maximum number of results.
 * @param {string} [options.dbPath] - Database path override.
 * @returns {Array<Object>} Matches sorted by descending score (synchronous;
 *   same element shape as searchSimilar).
 */
export function hybridSearch(queryText, queryVector, options = {}) {
  // Without a query vector nothing can be ranked; every path would end in [].
  if (!queryVector || queryVector.length === 0) {
    return []
  }

  if (!queryText || queryText.trim().length === 0) {
    return searchSimilar(queryVector, options)
  }

  const ftsQuery = buildFtsPrefixQuery(queryText)
  if (!ftsQuery) return searchSimilar(queryVector, options)

  const { db } = getDb(options.dbPath)
  const topK = options.topK || DEFAULT_TOP_K

  try {
    // Over-fetch keyword candidates so re-ranking has room to reorder.
    const ftsResults = db
      .prepare(
        `SELECT rank, rowid FROM embeddings_fts WHERE embeddings_fts MATCH ? ORDER BY rank LIMIT ?`
      )
      .all(ftsQuery, topK * 3)

    if (ftsResults.length === 0) {
      return searchSimilar(queryVector, options)
    }

    // Load the full rows (including vectors) for the FTS candidates.
    const rowids = ftsResults.map((r) => r.rowid)
    const placeholders = rowids.map(() => "?").join(",")

    const rows = db
      .prepare(
        `SELECT id, doc_id, doc_title, chapter_idx, chapter_title,
        content, vector, dimension, tokens, model, metadata
        FROM embeddings WHERE rowid IN (${placeholders})`
      )
      .all(...rowids)

    if (rows.length === 0) return []

    const scored = []
    for (const row of rows) {
      const vector = bufferToVector(row.vector)

      // Vectors of a different dimension are not comparable with this query.
      if (vector.length !== queryVector.length) continue

      scored.push({
        id: row.id,
        docId: row.doc_id,
        docTitle: row.doc_title,
        chapterIdx: row.chapter_idx,
        chapterTitle: row.chapter_title,
        content: row.content,
        score: cosineSimilarity(queryVector, vector),
        tokens: row.tokens,
        model: row.model,
        metadata: row.metadata,
      })
    }

    scored.sort((a, b) => b.score - a.score)
    return scored.slice(0, topK)
  } catch {
    // FTS5 lookup failed: degrade to pure cosine search.
    return searchSimilar(queryVector, options)
  }
}
|
|
420
|
+
|
|
421
|
+
/**
 * Deletes every embedding that belongs to one document.
 *
 * The FTS5 entries and the rows themselves are removed in a single
 * transaction so the index and the table cannot drift apart.
 *
 * @param {string} docId - ID of the document whose embeddings are removed.
 * @param {Object} [options]
 * @param {string} [options.dbPath] - Database path override.
 * @returns {{ deleted: number }} Number of rows removed from `embeddings`.
 * @throws {Error} When `docId` is missing.
 */
export function deleteDocumentEmbeddings(docId, options = {}) {
  // Validate first so invalid input never opens a database connection.
  if (!docId) {
    throw new Error("docId es requerido para eliminar embeddings")
  }

  const { db } = getDb(options.dbPath)

  const removeAll = db.transaction(() => {
    // The FTS entries must go first: the subquery needs the rows to still
    // exist in `embeddings` to resolve their rowids.
    try {
      db.prepare(
        `DELETE FROM embeddings_fts WHERE rowid IN (SELECT rowid FROM embeddings WHERE doc_id = ?)`
      ).run(docId)
    } catch {
      // Best-effort: the FTS table may be unavailable; still delete the rows.
    }

    return db.prepare(`DELETE FROM embeddings WHERE doc_id = ?`).run(docId)
  })

  const result = removeAll()

  return { deleted: result.changes }
}
|
|
449
|
+
|
|
450
|
+
/**
 * Summarizes the contents of the embeddings table.
 *
 * @param {Object} [options]
 * @param {string} [options.dbPath] - Database path override.
 * @returns {{
 *   totalEmbeddings: number,
 *   totalDocs: number,
 *   dimension: number,
 *   model: string,
 *   lastIndexed: string|null,
 *   storageBytes: number
 * }} `dimension` and `model` are the most frequent values among stored rows;
 *   `storageBytes` is the summed vector size plus a rough 512-byte per-row
 *   allowance for metadata.
 */
export function getEmbeddingStats(options = {}) {
  const { db } = getDb(options.dbPath)

  // Small helper: run a parameterless query and return its single row.
  const fetchOne = (sql) => db.prepare(sql).get()

  const totalEmbeddings = fetchOne(`SELECT COUNT(*) as count FROM embeddings`).count

  // Empty index: report zeroed stats without running the remaining queries.
  if (totalEmbeddings === 0) {
    return {
      totalEmbeddings: 0,
      totalDocs: 0,
      dimension: 0,
      model: "",
      lastIndexed: null,
      storageBytes: 0,
    }
  }

  const docs = fetchOne(`SELECT COUNT(DISTINCT doc_id) as count FROM embeddings`)
  const newest = fetchOne(`SELECT MAX(created_at) as last FROM embeddings`)
  const topModel = fetchOne(
    `SELECT model, COUNT(*) as count FROM embeddings GROUP BY model ORDER BY count DESC LIMIT 1`
  )
  const topDimension = fetchOne(
    `SELECT dimension, COUNT(*) as count FROM embeddings GROUP BY dimension ORDER BY count DESC LIMIT 1`
  )
  const vectorBytes = fetchOne(`SELECT SUM(LENGTH(vector)) as total FROM embeddings`)

  let storageBytes = 0
  if (vectorBytes) {
    // Vector payload plus ~512 bytes per row for metadata (approximation).
    storageBytes = vectorBytes.total + totalEmbeddings * 512
  }

  return {
    totalEmbeddings,
    totalDocs: docs.count,
    dimension: topDimension ? topDimension.dimension : 0,
    model: topModel ? topModel.model : "",
    lastIndexed: newest ? newest.last : null,
    storageBytes,
  }
}
|