openprompt-lang 1.2.7 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,320 @@
1
+ // @use(kind, contract, limit, deps)
2
+ // @kind(module)
3
+ // @contract(in: Document -> out: IndexResult, async: true, sideEffect: SQLite writes + Ollama requests)
4
+ // @limit(lines: 350)
5
+ // @deps(./chunker, ./embedder, ./vector-store)
6
+
7
+ /**
8
+ * Index Pipeline: orquesta el proceso completo de chunking + embedding + almacenamiento.
9
+ *
10
+ * Flujo:
11
+ * 1. chunkDocument(document, options) → chunks[]
12
+ * 2. embedBatch(chunks[].content, options) → vectors[][]
13
+ * 3. storeEmbeddingsBatch(chunks + vectors + model) → resultado
14
+ *
15
+ * Modos:
16
+ * - Dry-run: procesa pero no persiste (útil para benchmarks)
17
+ * - Strict: falla en el primer error
18
+ * - Resume: intended to skip already-indexed chunks (NOTE: indexDocument currently ignores this flag)
19
+ */
20
+
21
+ import { chunkDocument } from "./chunker.js"
22
+ import { embed, embedBatch, checkProvider, getActiveProvider } from "./embedder.js"
23
+ import { storeEmbedding, storeEmbeddingsBatch, deleteDocumentEmbeddings } from "./vector-store.js"
24
+
25
+ // ─── Constantes ────────────────────────────────────────────────────
26
+
27
// Chunking defaults used by indexDocument when the caller's options omit them.
const DEFAULT_CHUNK_STRATEGY = "section"
const DEFAULT_MAX_TOKENS = 512
const DEFAULT_OVERLAP_TOKENS = 32

// Number of chunk texts handed to embedBatch() per call.
const BATCH_SIZE_EMBED = 10
31
+
32
+ // ─── Tipos (JSDoc) ─────────────────────────────────────────────────
33
+
34
+ /**
35
+ * @typedef {object} IndexResult
36
+ * @property {boolean} success - true si al menos un chunk se indexó
37
+ * @property {string} docId - ID del documento indexado
38
+ * @property {string} docTitle - Título del documento
39
+ * @property {number} totalChunks - Total de chunks generados
40
+ * @property {number} indexedChunks - Chunks efectivamente indexados
41
+ * @property {number} skippedChunks - Chunks saltados (modo resume)
42
+ * @property {number} failedChunks - Chunks con error
43
+ * @property {string} model - Modelo usado para embeddings
44
+ * @property {string} strategy - Estrategia de chunking
45
+ * @property {number} durationMs - Duración total en ms
46
+ * @property {number} totalTokens - Tokens totales indexados
47
+ * @property {Array<{ id: string, error: string }>} errors - Errores por chunk
48
+ */
49
+
50
+ /**
51
+ * @typedef {object} IndexOptions
52
+ * @property {string} [strategy='section'] - Estrategia de chunking
53
+ * @property {number} [maxTokens=512] - Tokens máximos por chunk
54
+ * @property {number} [overlapTokens=32] - Tokens de solapamiento
55
+ * @property {'ollama'|'transformers'} [provider] - Proveedor de embedding
56
+ * @property {string} [model] - Modelo de embedding
57
+ * @property {boolean} [dryRun=false] - No persiste, solo simula
58
+ * @property {boolean} [strict=false] - Falla en el primer error
59
+ * @property {boolean} [resume=false] - Salta chunks ya indexados
60
+ * @property {string} [dbPath] - Ruta a la BD SQLite
61
+ * @property {(progress: IndexProgress) => void} [onProgress] - Callback de progreso
62
+ */
63
+
64
+ /**
65
+ * @typedef {object} IndexProgress
66
+ * @property {string} phase - 'chunking' | 'embedding' | 'storing' | 'done'
67
+ * @property {number} total - Total de items en la fase
68
+ * @property {number} current - Item actual procesado
69
+ * @property {string} [currentChunk] - ID del chunk actual
70
+ * @property {number} [percent] - Porcentaje completado (0-100)
71
+ * @property {number} [elapsedMs] - Milisegundos transcurridos
72
+ */
73
+
74
+ // ─── Pipeline principal ────────────────────────────────────────────
75
+
76
/**
 * Indexes a single document end to end: chunking → embedding → storage.
 *
 * After input validation the returned promise never rejects: every failure is
 * reported through `errors` / `success` on the result.
 *  - Non-strict (default): when an embedding batch fails, each of its chunks is
 *    added to `errors`, counted in `failedChunks`, and the remaining batches
 *    are still processed.
 *  - Strict: the first failing batch aborts the run; the result carries a
 *    single document-level error and `success: false`.
 *  - Dry-run: chunks and embeds but never calls the vector store.
 *
 * NOTE: `options.resume` is accepted but not acted on by this function, so
 * `skippedChunks` is always 0.
 *
 * @param {Object} document - Document in opl format; must have a truthy `id`.
 * @param {IndexOptions} [options]
 * @returns {Promise<IndexResult>} Rejects with Error only when `document` is
 *   missing or has no `id`.
 */
export async function indexDocument(document, options = {}) {
  const startTime = Date.now()
  const {
    strategy = DEFAULT_CHUNK_STRATEGY,
    maxTokens = DEFAULT_MAX_TOKENS,
    overlapTokens = DEFAULT_OVERLAP_TOKENS,
    provider,
    model,
    dryRun = false,
    strict = false,
    dbPath,
    onProgress,
  } = options

  if (!document || !document.id) {
    throw new Error("Documento inválido: se requiere objeto con id")
  }

  // Anything can be thrown in JS; fall back to String() when there is no .message.
  const messageOf = (err) => err?.message ?? String(err)

  const docId = document.id
  const docTitle = document.title || docId
  const result = {
    success: false,
    docId,
    docTitle,
    totalChunks: 0,
    indexedChunks: 0,
    skippedChunks: 0,
    failedChunks: 0,
    model: "unknown",
    strategy,
    durationMs: 0,
    totalTokens: 0,
    errors: [],
  }

  try {
    // ── Phase 1: chunking ─────────────────────────────────────────
    emitProgress(onProgress, { phase: "chunking", total: 0, current: 0, currentChunk: docId })

    const chunks = chunkDocument(document, { strategy, maxTokens, overlapTokens })

    // Nothing to index: report an unsuccessful, empty result (no "done" event).
    if (chunks.length === 0) {
      return { ...result, durationMs: Date.now() - startTime }
    }

    result.totalChunks = chunks.length
    result.totalTokens = chunks.reduce((sum, c) => sum + (c.tokens || 0), 0)
    // Placeholder; replaced below once the embedding loop has finished.
    result.model = provider || "ollama"

    // ── Phase 2: embedding ────────────────────────────────────────
    emitProgress(onProgress, {
      phase: "embedding",
      total: chunks.length,
      current: 0,
      percent: 0,
      elapsedMs: Date.now() - startTime,
    })

    const texts = chunks.map((c) => c.content)
    const embedOptions = {}
    if (provider) embedOptions.provider = provider
    if (model) embedOptions.model = model

    // allVectors[i] always corresponds to chunks[i]; failed chunks hold null.
    const allVectors = []
    let resolvedModel = model || ""

    for (let i = 0; i < texts.length; i += BATCH_SIZE_EMBED) {
      const batch = texts.slice(i, i + BATCH_SIZE_EMBED)
      const batchChunks = chunks.slice(i, i + BATCH_SIZE_EMBED)

      // Emitted before the batch is embedded; `current` counts chunks through
      // the end of this batch.
      emitProgress(onProgress, {
        phase: "embedding",
        total: chunks.length,
        current: i + batch.length,
        currentChunk: batchChunks[0]?.id,
        percent: Math.round(((i + batch.length) / chunks.length) * 100),
        elapsedMs: Date.now() - startTime,
      })

      try {
        const vectors = await embedBatch(batch, embedOptions)

        // A short or malformed result would shift every later vector onto the
        // wrong chunk, so treat it as a failure of the whole batch.
        if (!Array.isArray(vectors) || vectors.length !== batch.length) {
          const got = Array.isArray(vectors) ? vectors.length : 0
          throw new Error(
            `embedBatch returned ${got} vectors for ${batch.length} texts`,
          )
        }
        allVectors.push(...vectors)

        // No explicit model given: derive a label from the active provider
        // after the first successful batch.
        if (!resolvedModel) {
          try {
            const activeProvider = getActiveProvider()
            resolvedModel =
              activeProvider === "transformers" ? "all-MiniLM-L6-v2" : "nomic-embed-text"
          } catch {
            resolvedModel = provider || "ollama"
          }
        }
      } catch (batchError) {
        if (strict) {
          throw batchError
        }
        // Non-strict: attribute the batch failure to each of its chunks and
        // keep index alignment with null placeholders.
        const reason = messageOf(batchError)
        for (const chunk of batchChunks) {
          result.errors.push({ id: chunk.id, error: reason })
          result.failedChunks++
          allVectors.push(null)
        }
      }
    }

    result.model = resolvedModel || "unknown"

    // ── Phase 3: storage ──────────────────────────────────────────
    if (dryRun) {
      result.indexedChunks = allVectors.filter((v) => v !== null).length
      // Same meaning as the persisted path: success requires at least one chunk.
      result.success = result.indexedChunks > 0
      result.durationMs = Date.now() - startTime
      emitProgress(onProgress, {
        phase: "done",
        total: result.totalChunks,
        current: result.totalChunks,
        percent: 100,
        elapsedMs: result.durationMs,
      })
      return result
    }

    emitProgress(onProgress, {
      phase: "storing",
      total: allVectors.length,
      current: 0,
      percent: 0,
      elapsedMs: Date.now() - startTime,
    })

    // Only chunks that actually received a vector are persisted.
    const entriesToStore = []
    for (let i = 0; i < chunks.length; i++) {
      const vector = allVectors[i]
      if (vector === null) continue // embedding failed for this chunk

      entriesToStore.push({
        chunk: chunks[i],
        vector,
        model: resolvedModel,
      })
    }

    if (entriesToStore.length > 0) {
      const storeResult = storeEmbeddingsBatch(entriesToStore, { dbPath })
      result.indexedChunks = storeResult.count
    }

    result.success = result.indexedChunks > 0
    result.durationMs = Date.now() - startTime

    emitProgress(onProgress, {
      phase: "done",
      total: result.totalChunks,
      current: result.indexedChunks,
      percent: 100,
      elapsedMs: result.durationMs,
    })

    return result
  } catch (error) {
    result.durationMs = Date.now() - startTime
    // Always record the fatal error. Per-chunk errors pushed earlier are never
    // the one that reached this handler, so this cannot duplicate an entry.
    result.errors.push({ id: docId, error: messageOf(error) })
    return result
  }
}
261
+
262
/**
 * Re-indexes a document: removes its stored embeddings, then indexes it again.
 *
 * In dry-run mode (`options.dryRun`) the existing embeddings are left in
 * place; otherwise a dry run would delete data and write nothing back.
 *
 * @param {Object} document - Document in opl format; must have a truthy `id`.
 * @param {IndexOptions} [options]
 * @returns {Promise<IndexResult>} Rejects with Error when `document` is
 *   missing or has no `id`.
 */
export async function reindexDocument(document, options = {}) {
  if (!document || !document.id) {
    throw new Error("Documento inválido: se requiere objeto con id")
  }

  if (!options?.dryRun) {
    try {
      deleteDocumentEmbeddings(document.id, { dbPath: options.dbPath })
    } catch {
      // Best effort: deletion failures (e.g. the table not existing yet) must
      // not prevent indexing.
      // NOTE(review): this also hides other deletion errors — consider
      // narrowing once the vector store's error shape is known.
    }
  }

  return indexDocument(document, options)
}
283
+
284
/**
 * Indexes several documents one after another, in array order.
 *
 * Each document is passed to indexDocument with the same options, and the next
 * one starts only after the previous call has settled.
 *
 * @param {Array<Object>} documents - Documents in opl format.
 * @param {IndexOptions} [options]
 * @returns {Promise<Array<IndexResult>>} One result per document; an empty
 *   array when `documents` is not an array or is empty.
 */
export async function indexDocuments(documents, options = {}) {
  const collected = []
  if (!Array.isArray(documents)) {
    return collected
  }

  for (let idx = 0; idx < documents.length; idx += 1) {
    collected.push(await indexDocument(documents[idx], options))
  }
  return collected
}
303
+
304
+ // ─── Helpers ───────────────────────────────────────────────────────
305
+
306
/**
 * Invokes the progress callback, if one was supplied, and never lets a
 * callback failure affect the pipeline.
 *
 * Synchronous exceptions are swallowed. If the callback returns a thenable
 * (e.g. it is an `async` function), its rejection is swallowed as well;
 * otherwise it would surface as an unhandled promise rejection.
 *
 * @param {Function|null} cb - Progress listener; anything that is not a function is ignored.
 * @param {IndexProgress} progress - Payload passed to the listener unchanged.
 */
function emitProgress(cb, progress) {
  if (typeof cb !== "function") return

  try {
    const outcome = cb(progress)
    if (outcome && typeof outcome.then === "function") {
      // Fire and forget: the rejection handler only silences the failure.
      outcome.then(undefined, () => {})
    }
  } catch {
    // Progress reporting is best effort; listener errors are deliberately ignored.
  }
}