npm - openprompt-lang - Versions diffs - 1.2.7 → 1.3.0 - Mend

openprompt-lang 1.2.7 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/README.md +62 -8
package/docs/EMBEDDINGS.md +214 -0
package/docs/ONBOARDING_WORKFLOW.md +151 -0
package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
package/docs/WEB_SCRAPER_PLAN.md +454 -0
package/package.json +7 -1
package/scripts/postinstall.js +37 -0
package/src/cli/commands-knowledge.js +1 -0
package/src/cli/commands-opl.js +79 -1
package/src/cli/commands-workflow.js +125 -6
package/src/commands/init-core.js +169 -5
package/src/commands/knowledge-ops.js +52 -0
package/src/commands/opl-embeddings.js +556 -0
package/src/commands/opl-help.js +26 -2
package/src/commands/opl-search.js +106 -2
package/src/commands/opl-webscrape.js +390 -0
package/src/commands/workflow/epic-cli.js +192 -0
package/src/commands/workflow/select.js +146 -0
package/src/commands/workflow/sprint-cli.js +174 -0
package/src/core/webscrape/analyzer.js +481 -0
package/src/core/webscrape/deep-scraper.js +1027 -0
package/src/core/workflow/epic-manager.js +845 -0
package/src/core/workflow/gates.js +180 -1
package/src/core/workflow/selector.js +707 -0
package/src/embeddings/chunker.js +450 -0
package/src/embeddings/embedder.js +431 -0
package/src/embeddings/index-pipeline.js +320 -0
package/src/embeddings/vector-store.js +505 -0

package/src/embeddings/chunker.js ADDED Viewed

@@ -0,0 +1,450 @@
+// @use(kind, contract, limit, deps)
+// @kind(module)
+// @contract(in: Document -> out: Chunk[], pure: true)
+// @limit(lines: 460)
+// @deps(none)
+/**
+ * Sistema de chunking para dividir documentos de conocimiento en fragmentos
+ * semánticamente coherentes, aptos para generar embeddings vectoriales.
+ *
+ * Estrategias disponibles:
+ * - 'paragraph': Respeta límites de párrafo, combina pequeños, parte grandes
+ * - 'section':   Respeta límites de sección (##, ###), sub-divide secciones grandes
+ * - 'fixed':     Divide por tamaño fijo de tokens con solapamiento
+ */
+// ─── Token Estimation ──────────────────────────────────────────────
+/**
+ * Approximates the token count of a string without running a real tokenizer.
+ *
+ * The estimate is word based: whitespace-separated words are counted and the
+ * count is divided by 0.7 (one token is assumed to cover roughly 0.7 words,
+ * a middle ground between ~0.75 for English and ~0.6 for Spanish), then
+ * rounded up.
+ *
+ * @param {string} text - Text to measure
+ * @returns {number} Estimated tokens; 0 for non-strings and blank strings
+ */
+export function estimateTokens(text) {
+  if (typeof text !== "string") return 0
+  const trimmed = text.trim()
+  if (trimmed === "") return 0
+  // Once trimmed, each run of non-whitespace characters is exactly one word
+  const wordCount = trimmed.match(/\S+/g).length
+  return Math.ceil(wordCount / 0.7)
+}
+// ─── Estrategias de Chunking ────────────────────────────────────────
+/**
+ * Splits text on blank-line paragraph boundaries while honoring maxTokens.
+ * - Consecutive paragraphs are packed together while they fit in the budget
+ * - A paragraph that exceeds the budget on its own is handed to
+ *   splitTextGuaranteed and its pieces are emitted as separate chunks
+ * - When overlapTokens > 0 and more than one chunk was produced, the result
+ *   is passed through applyOverlap
+ *
+ * @param {string} text
+ * @param {number} maxTokens
+ * @param {number} overlapTokens
+ * @returns {string[]}
+ */
+function chunkByParagraph(text, maxTokens, overlapTokens) {
+  const result = []
+  let pending = []
+  let pendingTokens = 0
+  // Emits the buffered paragraphs (if any) as one chunk and resets the buffer
+  const flush = () => {
+    if (pending.length > 0) {
+      result.push(pending.join("\n\n"))
+    }
+    pending = []
+    pendingTokens = 0
+  }
+  for (const block of text.split(/\n\s*\n/)) {
+    if (block.trim() === "") continue
+    const size = estimateTokens(block)
+    if (pendingTokens + size <= maxTokens) {
+      pending.push(block)
+      pendingTokens += size
+      continue
+    }
+    // The paragraph does not fit next to what is buffered: close that chunk first
+    flush()
+    if (size > maxTokens) {
+      // Too large even alone: emit its sub-pieces directly, buffer stays empty
+      for (const piece of splitTextGuaranteed(block, maxTokens)) {
+        result.push(piece)
+      }
+    } else {
+      pending.push(block)
+      pendingTokens = size
+    }
+  }
+  // Trailing chunk
+  flush()
+  const needsOverlap = overlapTokens > 0 && result.length > 1
+  return needsOverlap ? applyOverlap(result, overlapTokens) : result
+}
+/**
+ * Splits text on markdown headings (#, ##, ###, ####) while honoring maxTokens.
+ * - A section that fits is emitted as "<level> <title>" plus its body
+ * - A section that does not fit is sub-divided with chunkByParagraph; the
+ *   first piece carries the original heading and the following pieces carry
+ *   a "### <title> (cont.)" heading
+ * - Text before the first heading becomes an untitled section
+ *
+ * @param {string} text
+ * @param {number} maxTokens
+ * @param {number} overlapTokens
+ * @returns {string[]}
+ */
+function chunkBySection(text, maxTokens, overlapTokens) {
+  // Pass 1: locate the headings and cut the text into titled sections
+  const headingPattern = /^(#{1,4})\s+(.+)$/gm
+  const sections = []
+  let cursor = 0 // offset right after the most recent heading line
+  let currentTitle = ""
+  let currentLevel = ""
+  let sawHeading = false
+  for (let hit = headingPattern.exec(text); hit !== null; hit = headingPattern.exec(text)) {
+    if (sawHeading) {
+      // Close the section opened by the previous heading
+      const body = text.slice(cursor, hit.index).trim()
+      if (body || sections.length === 0) {
+        sections.push({ title: currentTitle, level: currentLevel, content: body })
+      }
+    } else {
+      // Anything before the very first heading is kept as an untitled section
+      const preamble = text.slice(0, hit.index).trim()
+      if (preamble) {
+        sections.push({ title: "", level: "", content: preamble })
+      }
+    }
+    sawHeading = true
+    currentTitle = hit[2]
+    currentLevel = hit[1]
+    cursor = hit.index + hit[0].length
+  }
+  // Whatever follows the last heading (or the whole text when there is none)
+  const tail = text.slice(cursor).trim()
+  if (tail || sections.length === 0) {
+    sections.push({ title: currentTitle, level: currentLevel, content: tail })
+  }
+  // Pass 2: turn every section into one or more chunks
+  const chunks = []
+  for (const { title, level, content } of sections) {
+    const header = title ? `${level} ${title}\n\n` : ""
+    const headerTokens = estimateTokens(header)
+    if (estimateTokens(content) + headerTokens <= maxTokens) {
+      chunks.push(header + content)
+      continue
+    }
+    // Oversized section: split the body by paragraph, reserving room for the header
+    const continuationHeader = title ? `### ${title} (cont.)\n\n` : ""
+    chunkByParagraph(content, maxTokens - headerTokens, 0).forEach((piece, position) => {
+      chunks.push((position === 0 ? header : continuationHeader) + piece)
+    })
+  }
+  // Overlap is applied once, over the final list of chunks
+  const needsOverlap = overlapTokens > 0 && chunks.length > 1
+  return needsOverlap ? applyOverlap(chunks, overlapTokens) : chunks
+}
+/**
+ * Splits text into fixed-size windows of words with overlap between
+ * consecutive windows.
+ *
+ * Sizes are derived from the same model estimateTokens uses
+ * (tokens = ceil(words / 0.7)), so a window never measures more than
+ * maxTokens unless a single word already does. The overlap is clamped to
+ * [0, window size - 1] so the window always advances by at least one word
+ * and the loop is guaranteed to terminate.
+ *
+ * @param {string} text
+ * @param {number} maxTokens - Token budget per chunk; non-positive yields []
+ * @param {number} overlapTokens - Approximate tokens shared by neighbours
+ * @returns {string[]}
+ */
+function chunkByFixed(text, maxTokens, overlapTokens) {
+  const WORDS_PER_TOKEN = 0.7
+  if (!(maxTokens > 0)) return []
+  const words = text.split(/\s+/).filter(Boolean)
+  // Largest word count whose token estimate still fits in maxTokens
+  let wordsPerChunk = Math.floor(maxTokens * WORDS_PER_TOKEN)
+  while (wordsPerChunk > 1 && Math.ceil(wordsPerChunk / WORDS_PER_TOKEN) > maxTokens) {
+    wordsPerChunk--
+  }
+  // A word cannot be split, so a window holds at least one
+  wordsPerChunk = Math.max(1, wordsPerChunk)
+  // `|| 0` turns a NaN overlap into "no overlap"
+  const requestedOverlap = Math.floor(overlapTokens * WORDS_PER_TOKEN) || 0
+  const overlapWords = Math.min(wordsPerChunk - 1, Math.max(0, requestedOverlap))
+  const step = wordsPerChunk - overlapWords
+  const chunks = []
+  for (let start = 0; start < words.length; start += step) {
+    const end = Math.min(start + wordsPerChunk, words.length)
+    chunks.push(words.slice(start, end).join(" "))
+    // Stop once the window reaches the end, so no redundant tail chunk is emitted
+    if (end >= words.length) break
+  }
+  return chunks
+}
+// ─── Utilities ──────────────────────────────────────────────────────
+/**
+ * Splits text into pieces that are meant to stay within maxTokens.
+ * Cascade: splitBySentences groups sentences (or comma-separated clauses)
+ * first; any resulting piece whose estimate is still above maxTokens is
+ * then cut by word count with splitByWordCount.
+ *
+ * @param {string} text - Text to split
+ * @param {number} maxTokens - Maximum tokens per piece
+ * @returns {string[]}
+ */
+function splitTextGuaranteed(text, maxTokens) {
+  const pieces = []
+  for (const candidate of splitBySentences(text, maxTokens)) {
+    if (estimateTokens(candidate) <= maxTokens) {
+      // Already within budget: keep the sentence group untouched
+      pieces.push(candidate)
+    } else {
+      // Still too large (e.g. one very long sentence): fall back to word count
+      for (const part of splitByWordCount(candidate, maxTokens)) {
+        pieces.push(part)
+      }
+    }
+  }
+  return pieces
+}
+/**
+ * Splits text strictly by word count so that every piece measures at most
+ * maxTokens under the estimateTokens model (tokens = ceil(words / 0.7)).
+ *
+ * The piece size is the largest word count whose estimate fits in
+ * maxTokens. When not even a single word fits (or maxTokens is not a usable
+ * number) the text is returned unchanged as the only piece, because a word
+ * cannot be divided further.
+ *
+ * @param {string} text - Text to split
+ * @param {number} maxTokens - Maximum tokens per piece
+ * @returns {string[]}
+ */
+function splitByWordCount(text, maxTokens) {
+  const WORDS_PER_TOKEN = 0.7
+  let wordsPerChunk = Math.floor(maxTokens * WORDS_PER_TOKEN)
+  // Floating-point rounding can leave the estimate one token over; step back until it fits
+  while (wordsPerChunk > 0 && Math.ceil(wordsPerChunk / WORDS_PER_TOKEN) > maxTokens) {
+    wordsPerChunk--
+  }
+  // Negated >= also catches NaN
+  if (!(wordsPerChunk >= 1)) return [text]
+  const words = text.split(/\s+/).filter(Boolean)
+  const chunks = []
+  for (let i = 0; i < words.length; i += wordsPerChunk) {
+    chunks.push(words.slice(i, i + wordsPerChunk).join(" "))
+  }
+  return chunks
+}
+/**
+ * Breaks a large paragraph into groups of sentences that fit in maxTokens.
+ *
+ * Sentences are runs of text ending at '.', '!', '?' or a newline. When at
+ * most one sentence is found, the text is split on ", " style commas
+ * instead. Units are then packed greedily and joined with a single space.
+ * A unit that is larger than maxTokens on its own is emitted as-is; the
+ * caller (splitTextGuaranteed) is the one that cuts it further.
+ *
+ * @param {string} text - Text to split
+ * @param {number} maxTokens - Maximum tokens per group
+ * @returns {string[]}
+ */
+function splitBySentences(text, maxTokens) {
+  const trimmedNonEmpty = (parts) => parts.map((part) => part.trim()).filter((part) => part.length > 0)
+  let units = trimmedNonEmpty(text.match(/[^.!?\n]+[.!?\n]*/g) || [])
+  // No usable sentence boundaries: fall back to comma-separated clauses
+  if (units.length <= 1) {
+    units = trimmedNonEmpty(text.split(/,\s+/))
+  }
+  const packed = []
+  let group = []
+  let groupTokens = 0
+  for (const unit of units) {
+    const cost = estimateTokens(unit)
+    const fits = groupTokens + cost <= maxTokens
+    if (!fits) {
+      // Close the current group (if any) and start a new one with this unit
+      if (group.length > 0) {
+        packed.push(group.join(" "))
+      }
+      group = []
+      groupTokens = 0
+    }
+    group.push(unit)
+    groupTokens += cost
+  }
+  if (group.length > 0) {
+    packed.push(group.join(" "))
+  }
+  return packed
+}
+/**
+ * Applies overlap between consecutive chunks: the trailing words of chunk
+ * i-1 are prepended to chunk i, separated by a blank line. The first chunk
+ * is returned unchanged, and overlap is always taken from the ORIGINAL
+ * previous chunk, never from an already-overlapped one.
+ *
+ * overlapTokens is converted to words with the estimateTokens model
+ * (words = floor(tokens * 0.7)). When that rounds to zero words, or is
+ * negative or not a number, the chunks are returned untouched.
+ *
+ * @param {string[]} chunks
+ * @param {number} overlapTokens
+ * @returns {string[]}
+ */
+function applyOverlap(chunks, overlapTokens) {
+  if (chunks.length <= 1) return chunks
+  const WORDS_PER_TOKEN = 0.7
+  const overlapWords = Math.floor(overlapTokens * WORDS_PER_TOKEN)
+  // slice(-0) is slice(0) and would copy the WHOLE previous chunk, so a
+  // zero-word overlap has to be handled before slicing
+  if (!(overlapWords >= 1)) return chunks
+  return chunks.map((chunk, i) => {
+    if (i === 0) return chunk
+    // filter(Boolean) keeps leading/trailing whitespace from counting as words
+    const previousWords = chunks[i - 1].split(/\s+/).filter(Boolean)
+    const tail = previousWords.slice(-overlapWords).join(" ")
+    return tail ? `${tail}\n\n${chunk}` : chunk
+  })
+}
+// ─── Main API ───────────────────────────────────────────────────────
+/**
+ * Splits a document into chunks suitable for embedding.
+ *
+ * @param {Object} document - Document to chunk
+ * @param {string} document.id - Unique document id
+ * @param {string} document.title - Document title
+ * @param {Array<{index: number, title: string, content: string}>} document.chapters
+ *   Chapters of the document. Entries that are null or whose content is not
+ *   a non-blank string are skipped. When a chapter has no `index`, its
+ *   position in the array is used so that chunk ids stay unique.
+ * @param {Object} [options] - Chunking options
+ * @param {number} [options.maxTokens=512] - Maximum tokens per chunk
+ * @param {number} [options.overlapTokens=32] - Overlap tokens between chunks
+ * @param {'paragraph'|'section'|'fixed'} [options.strategy='section']
+ *   Splitting strategy:
+ *   - 'paragraph': honors paragraph boundaries
+ *   - 'section': honors section boundaries (##)
+ *   - 'fixed': splits by a fixed token size
+ *   Any other value falls back to 'section', and 'section' is what gets
+ *   recorded on the produced chunks.
+ * @returns {Array<{
+ *   id: string,
+ *   docId: string,
+ *   docTitle: string,
+ *   chapterIdx: number,
+ *   chapterTitle: string,
+ *   chunkIndex: number,
+ *   totalChunks: number,
+ *   content: string,
+ *   tokens: number,
+ *   strategy: string,
+ *   metadata: Object
+ * }>} Chunks in chapter order; `chunkIndex` restarts at 0 for every chapter
+ *   while `totalChunks` is the count across the whole document. Returns []
+ *   when the document is missing, has no id, or has no chapters.
+ * @throws {Error} If maxTokens < 50 (or is not a number)
+ */
+export function chunkDocument(document, options = {}) {
+  // `|| {}` covers an explicit null, which the default parameter does not
+  const { maxTokens = 512, overlapTokens = 32, strategy = "section" } = options || {}
+  if (!document) return []
+  // Written as a negated >= so that NaN is rejected too
+  if (!(maxTokens >= 50)) {
+    throw new Error(`maxTokens debe ser >= 50, recibido: ${maxTokens}`)
+  }
+  const { id: docId, title: docTitle, chapters } = document
+  if (!docId || !Array.isArray(chapters) || chapters.length === 0) return []
+  // Strategy dispatch; unknown names use the section splitter
+  const splitters = {
+    paragraph: chunkByParagraph,
+    section: chunkBySection,
+    fixed: chunkByFixed,
+  }
+  const effectiveStrategy = Object.prototype.hasOwnProperty.call(splitters, strategy)
+    ? strategy
+    : "section"
+  const split = splitters[effectiveStrategy]
+  const allChunks = []
+  chapters.forEach((chapter, position) => {
+    if (!chapter || typeof chapter.content !== "string") return
+    if (chapter.content.trim().length === 0) return
+    const chapterIdx = chapter.index != null ? chapter.index : position
+    const chapterTitle = chapter.title || ""
+    const pieces = split(chapter.content, maxTokens, overlapTokens).filter(
+      (piece) => piece.trim().length > 0,
+    )
+    pieces.forEach((content, chunkIdx) => {
+      allChunks.push({
+        id: `${docId}-ch${chapterIdx}-${chunkIdx}`,
+        docId,
+        docTitle: docTitle || "",
+        chapterIdx,
+        chapterTitle,
+        chunkIndex: chunkIdx,
+        totalChunks: 0, // filled in below, once the global count is known
+        content,
+        tokens: estimateTokens(content),
+        strategy: effectiveStrategy,
+        metadata: {
+          docTitle: docTitle || "",
+          chapterTitle,
+          chunkIndex: chunkIdx,
+          totalChunks: 0, // filled in below, once the global count is known
+          strategy: effectiveStrategy,
+        },
+      })
+    })
+  })
+  // Back-fill the document-wide total on every chunk
+  const total = allChunks.length
+  for (const chunk of allChunks) {
+    chunk.totalChunks = total
+    chunk.metadata.totalChunks = total
+  }
+  return allChunks
+}