openprompt-lang 1.2.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -8
- package/docs/EMBEDDINGS.md +214 -0
- package/docs/ONBOARDING_WORKFLOW.md +151 -0
- package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
- package/docs/WEB_SCRAPER_PLAN.md +454 -0
- package/package.json +7 -1
- package/scripts/postinstall.js +37 -0
- package/src/cli/commands-knowledge.js +1 -0
- package/src/cli/commands-opl.js +79 -1
- package/src/cli/commands-workflow.js +125 -6
- package/src/commands/init-core.js +169 -5
- package/src/commands/knowledge-ops.js +52 -0
- package/src/commands/opl-embeddings.js +556 -0
- package/src/commands/opl-help.js +26 -2
- package/src/commands/opl-search.js +106 -2
- package/src/commands/opl-webscrape.js +390 -0
- package/src/commands/workflow/epic-cli.js +192 -0
- package/src/commands/workflow/select.js +146 -0
- package/src/commands/workflow/sprint-cli.js +174 -0
- package/src/core/webscrape/analyzer.js +481 -0
- package/src/core/webscrape/deep-scraper.js +1027 -0
- package/src/core/workflow/epic-manager.js +845 -0
- package/src/core/workflow/gates.js +180 -1
- package/src/core/workflow/selector.js +707 -0
- package/src/embeddings/chunker.js +450 -0
- package/src/embeddings/embedder.js +431 -0
- package/src/embeddings/index-pipeline.js +320 -0
- package/src/embeddings/vector-store.js +505 -0
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
// @use(kind, contract, limit, deps)
|
|
2
|
+
// @kind(module)
|
|
3
|
+
// @contract(in: Document -> out: IndexResult, async: true, sideEffect: SQLite writes + Ollama requests)
|
|
4
|
+
// @limit(lines: 350)
|
|
5
|
+
// @deps(./chunker, ./embedder, ./vector-store)
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Index Pipeline: orquesta el proceso completo de chunking + embedding + almacenamiento.
|
|
9
|
+
*
|
|
10
|
+
* Flujo:
|
|
11
|
+
* 1. chunkDocument(document, options) → chunks[]
|
|
12
|
+
* 2. embedBatch(chunks[].content, options) → vectors[][]
|
|
13
|
+
* 3. storeEmbeddingsBatch(chunks + vectors + model) → resultado
|
|
14
|
+
*
|
|
15
|
+
* Modos:
|
|
16
|
+
* - Dry-run: procesa pero no persiste (útil para benchmarks)
|
|
17
|
+
* - Strict: falla en el primer error
|
|
18
|
+
* - Resume: skips chunks that are already indexed (the option is declared, but indexDocument does not consult it yet)
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { chunkDocument } from "./chunker.js"
|
|
22
|
+
import { embed, embedBatch, checkProvider, getActiveProvider } from "./embedder.js"
|
|
23
|
+
import { storeEmbedding, storeEmbeddingsBatch, deleteDocumentEmbeddings } from "./vector-store.js"
|
|
24
|
+
|
|
25
|
+
// ─── Constantes ────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
const DEFAULT_CHUNK_STRATEGY = "section"
|
|
28
|
+
const DEFAULT_MAX_TOKENS = 512
|
|
29
|
+
const DEFAULT_OVERLAP_TOKENS = 32
|
|
30
|
+
const BATCH_SIZE_EMBED = 10 // chunks por lote de embedding
|
|
31
|
+
|
|
32
|
+
// ─── Tipos (JSDoc) ─────────────────────────────────────────────────
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @typedef {object} IndexResult
|
|
36
|
+
* @property {boolean} success - true si al menos un chunk se indexó
|
|
37
|
+
* @property {string} docId - ID del documento indexado
|
|
38
|
+
* @property {string} docTitle - Título del documento
|
|
39
|
+
* @property {number} totalChunks - Total de chunks generados
|
|
40
|
+
* @property {number} indexedChunks - Chunks efectivamente indexados
|
|
41
|
+
* @property {number} skippedChunks - Chunks saltados (modo resume)
|
|
42
|
+
* @property {number} failedChunks - Chunks con error
|
|
43
|
+
* @property {string} model - Modelo usado para embeddings
|
|
44
|
+
* @property {string} strategy - Estrategia de chunking
|
|
45
|
+
* @property {number} durationMs - Duración total en ms
|
|
46
|
+
* @property {number} totalTokens - Tokens totales indexados
|
|
47
|
+
* @property {Array<{ id: string, error: string }>} errors - Errores por chunk
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* @typedef {object} IndexOptions
|
|
52
|
+
* @property {string} [strategy='section'] - Estrategia de chunking
|
|
53
|
+
* @property {number} [maxTokens=512] - Tokens máximos por chunk
|
|
54
|
+
* @property {number} [overlapTokens=32] - Tokens de solapamiento
|
|
55
|
+
* @property {'ollama'|'transformers'} [provider] - Proveedor de embedding
|
|
56
|
+
* @property {string} [model] - Modelo de embedding
|
|
57
|
+
* @property {boolean} [dryRun=false] - No persiste, solo simula
|
|
58
|
+
* @property {boolean} [strict=false] - Falla en el primer error
|
|
59
|
+
* @property {boolean} [resume=false] - Salta chunks ya indexados
|
|
60
|
+
* @property {string} [dbPath] - Ruta a la BD SQLite
|
|
61
|
+
* @property {(progress: IndexProgress) => void} [onProgress] - Callback de progreso
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* @typedef {object} IndexProgress
|
|
66
|
+
* @property {string} phase - 'chunking' | 'embedding' | 'storing' | 'done'
|
|
67
|
+
* @property {number} total - Total de items en la fase
|
|
68
|
+
* @property {number} current - Item actual procesado
|
|
69
|
+
* @property {string} [currentChunk] - ID del chunk actual
|
|
70
|
+
* @property {number} [percent] - Porcentaje completado (0-100)
|
|
71
|
+
* @property {number} [elapsedMs] - Milisegundos transcurridos
|
|
72
|
+
*/
|
|
73
|
+
|
|
74
|
+
// ─── Pipeline principal ────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
 * Indexes one document end to end: chunking → embedding → storage.
 *
 * Only an invalid `document` makes the returned promise reject. Every failure
 * after that point is captured in the returned result (`success: false`, with
 * details in `errors`).
 *
 * Mode notes:
 * - `strict`: the first failing embedding batch aborts the run; the result
 *   carries one document-level error and nothing is stored.
 * - non-strict (default): a failing batch marks each of its chunks as failed
 *   and the run continues with the next batch.
 * - `dryRun`: chunks are embedded but `storeEmbeddingsBatch` is never called;
 *   `indexedChunks` then counts the vectors that were produced.
 * - NOTE(review): `resume` is declared in IndexOptions but nothing in this
 *   function consults it, so `skippedChunks` is always 0.
 *
 * @param {Object} document - Document in opl format; must have a truthy `id`.
 * @param {IndexOptions} [options]
 * @returns {Promise<IndexResult>}
 * @throws {Error} If `document` is missing or has no `id`.
 */
export async function indexDocument(document, options = {}) {
  const startTime = Date.now()
  const {
    strategy = DEFAULT_CHUNK_STRATEGY,
    maxTokens = DEFAULT_MAX_TOKENS,
    overlapTokens = DEFAULT_OVERLAP_TOKENS,
    provider,
    model,
    dryRun = false,
    strict = false,
    dbPath,
    onProgress,
  } = options

  if (!document || !document.id) {
    throw new Error("Documento inválido: se requiere objeto con id")
  }

  const docId = document.id
  const docTitle = document.title || docId
  const result = {
    success: false,
    docId,
    docTitle,
    totalChunks: 0,
    indexedChunks: 0,
    skippedChunks: 0,
    failedChunks: 0,
    model: "unknown",
    strategy,
    durationMs: 0,
    totalTokens: 0,
    errors: [],
  }

  try {
    // ── Phase 1: chunking ─────────────────────────────────────────
    emitProgress(onProgress, { phase: "chunking", total: 0, current: 0, currentChunk: docId })

    const chunks = chunkDocument(document, { strategy, maxTokens, overlapTokens })

    if (chunks.length === 0) {
      return { ...result, durationMs: Date.now() - startTime }
    }

    result.totalChunks = chunks.length
    result.totalTokens = chunks.reduce((sum, c) => sum + (c.tokens || 0), 0)
    // Placeholder (a provider name, not a model) reported only if the run
    // aborts before the embedding phase finishes; overwritten afterwards.
    result.model = provider || "ollama"

    // ── Phase 2: embedding ────────────────────────────────────────
    emitProgress(onProgress, {
      phase: "embedding",
      total: chunks.length,
      current: 0,
      percent: 0,
      elapsedMs: Date.now() - startTime,
    })

    const texts = chunks.map((c) => c.content)
    const embedOptions = {}
    if (provider) embedOptions.provider = provider
    if (model) embedOptions.model = model

    // allVectors[i] always belongs to chunks[i]; failed chunks hold null.
    const allVectors = []
    let resolvedModel = model || ""

    for (let i = 0; i < texts.length; i += BATCH_SIZE_EMBED) {
      const batch = texts.slice(i, i + BATCH_SIZE_EMBED)
      const batchChunks = chunks.slice(i, i + BATCH_SIZE_EMBED)

      emitProgress(onProgress, {
        phase: "embedding",
        total: chunks.length,
        current: i + batch.length,
        currentChunk: batchChunks[0]?.id,
        percent: Math.round(((i + batch.length) / chunks.length) * 100),
        elapsedMs: Date.now() - startTime,
      })

      try {
        const vectors = await embedBatch(batch, embedOptions)

        // A short or long reply would shift every later vector onto the wrong
        // chunk, so treat it as a failure of this batch instead.
        if (!Array.isArray(vectors) || vectors.length !== batch.length) {
          const received = Array.isArray(vectors) ? vectors.length : 0
          throw new Error(
            `embedBatch devolvió ${received} vectores para ${batch.length} chunks`,
          )
        }

        allVectors.push(...vectors)

        // No explicit model requested: infer a name from the active provider
        // once the first batch has succeeded.
        if (!resolvedModel) {
          try {
            const activeProvider = getActiveProvider()
            resolvedModel =
              activeProvider === "transformers" ? "all-MiniLM-L6-v2" : "nomic-embed-text"
          } catch {
            resolvedModel = provider || "ollama"
          }
        }
      } catch (batchError) {
        if (strict) {
          throw batchError
        }
        // Non-strict: charge the failure to every chunk of the batch…
        const message = toErrorMessage(batchError)
        for (const chunk of batchChunks) {
          result.errors.push({ id: chunk.id, error: message })
          result.failedChunks++
        }
        // …and keep allVectors index-aligned with chunks.
        for (let j = 0; j < batch.length; j++) {
          allVectors.push(null)
        }
      }
    }

    result.model = resolvedModel || "unknown"

    // ── Phase 3: storage ──────────────────────────────────────────
    if (dryRun) {
      result.indexedChunks = allVectors.filter((v) => v != null).length
      // Same rule as a real run: success means at least one chunk made it.
      result.success = result.indexedChunks > 0
      result.durationMs = Date.now() - startTime
      emitProgress(onProgress, {
        phase: "done",
        total: result.totalChunks,
        current: result.totalChunks,
        percent: 100,
        elapsedMs: result.durationMs,
      })
      return result
    }

    emitProgress(onProgress, {
      phase: "storing",
      total: allVectors.length,
      current: 0,
      percent: 0,
      elapsedMs: Date.now() - startTime,
    })

    // Pair each chunk with its vector, dropping chunks that have none.
    const entriesToStore = []
    for (let i = 0; i < chunks.length; i++) {
      const vector = allVectors[i]
      if (vector == null) continue
      entriesToStore.push({ chunk: chunks[i], vector, model: resolvedModel })
    }

    if (entriesToStore.length > 0) {
      const storeResult = storeEmbeddingsBatch(entriesToStore, { dbPath })
      result.indexedChunks = storeResult.count
    }

    result.success = result.indexedChunks > 0
    result.durationMs = Date.now() - startTime

    emitProgress(onProgress, {
      phase: "done",
      total: result.totalChunks,
      current: result.indexedChunks,
      percent: 100,
      elapsedMs: result.durationMs,
    })

    return result
  } catch (error) {
    result.durationMs = Date.now() - startTime
    // Always record the fatal error. Earlier per-chunk errors do not explain
    // why the run stopped (e.g. a storage failure after a tolerated batch error).
    result.errors.push({ id: docId, error: toErrorMessage(error) })
    return result
  }
}

/**
 * Produces a printable message for any thrown value, not only Error instances.
 *
 * @param {unknown} err
 * @returns {string}
 */
function toErrorMessage(err) {
  return typeof err?.message === "string" ? err.message : String(err)
}
|
|
261
|
+
|
|
262
|
+
/**
 * Re-indexes a document: drops its existing embeddings, then indexes it again.
 *
 * `deleteDocumentEmbeddings` is called before the new embeddings are computed,
 * so if the subsequent indexing fails the document is left without the
 * embeddings that were deleted.
 *
 * In dry-run mode the delete is skipped. `indexDocument` persists nothing in
 * that mode, so deleting first would leave the document with no embeddings.
 *
 * @param {Object} document - Document in opl format; must have a truthy `id`.
 * @param {IndexOptions} [options] - Forwarded unchanged to `indexDocument`.
 * @returns {Promise<IndexResult>}
 * @throws {Error} If `document` is missing or has no `id`.
 */
export async function reindexDocument(document, options = {}) {
  if (!document || !document.id) {
    throw new Error("Documento inválido: se requiere objeto con id")
  }

  if (!options.dryRun) {
    try {
      deleteDocumentEmbeddings(document.id, { dbPath: options.dbPath })
    } catch {
      // Best-effort by design: the original intent is to carry on when the
      // embeddings table does not exist yet.
      // NOTE(review): this also hides every other delete failure — confirm
      // whether vector-store exposes a way to tell them apart.
    }
  }

  return indexDocument(document, options)
}
|
|
283
|
+
|
|
284
|
+
/**
 * Indexes a list of documents one at a time, in the order given.
 *
 * Each document is passed to `indexDocument` with the same `options`, and the
 * next one starts only after the previous promise has settled. A rejection
 * from `indexDocument` propagates and ends the run.
 *
 * @param {Array<Object>} documents - Documents in opl format; anything that is
 *   not a non-empty array yields an empty result list.
 * @param {IndexOptions} [options]
 * @returns {Promise<Array<IndexResult>>} One result per document, in input order.
 */
export async function indexDocuments(documents, options = {}) {
  const queue = Array.isArray(documents) ? documents : []
  const collected = []

  let cursor = 0
  while (cursor < queue.length) {
    collected.push(await indexDocument(queue[cursor], options))
    cursor += 1
  }

  return collected
}
|
|
303
|
+
|
|
304
|
+
// ─── Helpers ───────────────────────────────────────────────────────
|
|
305
|
+
|
|
306
|
+
/**
 * Invokes the progress callback, if one was supplied, without letting it
 * disturb the pipeline.
 *
 * Progress reporting is best-effort. A synchronous throw from the callback is
 * discarded. If the callback returns a thenable (for example because it is an
 * async function), its rejection is discarded too, so it cannot surface as an
 * unhandled promise rejection.
 *
 * @param {Function|null} [cb] - Progress listener; any non-function is ignored.
 * @param {IndexProgress} progress - Snapshot handed to the listener unchanged.
 * @returns {void}
 */
function emitProgress(cb, progress) {
  if (typeof cb !== "function") return

  try {
    const outcome = cb(progress)
    // Async listeners fail through the returned promise rather than by
    // throwing, so that rejection has to be absorbed separately.
    if (outcome != null && typeof outcome.then === "function") {
      outcome.then(undefined, () => {})
    }
  } catch {
    // Deliberately ignored: a faulty listener must not abort indexing.
  }
}
|