openprompt-lang 1.2.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +62 -8
  2. package/docs/EMBEDDINGS.md +214 -0
  3. package/docs/FRAMEWORK.md +52 -0
  4. package/docs/ONBOARDING_WORKFLOW.md +151 -0
  5. package/docs/OPL-ERRORES.md +504 -0
  6. package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
  7. package/docs/WEB_SCRAPER_PLAN.md +454 -0
  8. package/package.json +7 -1
  9. package/scripts/postinstall.js +37 -0
  10. package/src/cli/commands-knowledge.js +1 -0
  11. package/src/cli/commands-opl.js +79 -1
  12. package/src/cli/commands-work.js +3 -1
  13. package/src/cli/commands-workflow.js +125 -6
  14. package/src/commands/init-core.js +188 -12
  15. package/src/commands/init-existing.js +13 -6
  16. package/src/commands/init-helpers.js +20 -14
  17. package/src/commands/knowledge-ops.js +52 -0
  18. package/src/commands/opl-embeddings.js +556 -0
  19. package/src/commands/opl-help.js +26 -2
  20. package/src/commands/opl-search.js +106 -2
  21. package/src/commands/opl-webscrape.js +390 -0
  22. package/src/commands/work-context.js +17 -0
  23. package/src/commands/workflow/close/index.js +2 -1
  24. package/src/commands/workflow/delivery/index.js +4 -0
  25. package/src/commands/workflow/discovery/index.js +4 -0
  26. package/src/commands/workflow/epic-cli.js +192 -0
  27. package/src/commands/workflow/select.js +146 -0
  28. package/src/commands/workflow/specification/index.js +4 -0
  29. package/src/commands/workflow/sprint-cli.js +174 -0
  30. package/src/core/engine/sandbox.js +7 -3
  31. package/src/core/webscrape/analyzer.js +481 -0
  32. package/src/core/webscrape/deep-scraper.js +1027 -0
  33. package/src/core/workflow/epic-manager.js +845 -0
  34. package/src/core/workflow/gates.js +180 -1
  35. package/src/core/workflow/selector.js +707 -0
  36. package/src/embeddings/chunker.js +450 -0
  37. package/src/embeddings/embedder.js +431 -0
  38. package/src/embeddings/index-pipeline.js +320 -0
  39. package/src/embeddings/vector-store.js +505 -0
  40. package/src/mcp-plan-server.js +12 -5
  41. package/src/mcp-shared-state.js +25 -0
  42. package/src/mcp-refactor/mcp-server.js +0 -171
  43. package/src/mcp-server-backup.js +0 -1913
@@ -0,0 +1,481 @@
1
+ // @use(kind, contract, limit, deps)
2
+ // @kind(module)
3
+ // @contract(in: html+url -> out: extraction plan, sideEffect: llama3.2 API call)
4
+ // @limit(lines: 350)
5
+ // @deps(node-fetch)
6
+
7
+ /**
8
+ * Analyzer OPL — Analiza la estructura de una página web usando Ollama (llama3.2)
9
+ * y genera un plan de extracción personalizado.
10
+ *
11
+ * En lugar de reglas fijas (Readability.js solo para artículos), el analyzer
12
+ * entiende el tipo de página y dice QUÉ extraer y CÓMO.
13
+ *
14
+ * Flujo:
15
+ * 1. Recibe HTML + URL
16
+ * 2. Genera un resumen estructural compacto (~1500 chars)
17
+ * 3. Envía a llama3.2 con un prompt que pide un plan JSON
18
+ * 4. Parsea y valida la respuesta
19
+ * 5. Devuelve el plan de extracción
20
+ *
21
+ * Uso:
22
+ * import { analyzePage } from "./analyzer.js"
23
+ * const plan = await analyzePage(html, url)
24
+ * // plan.pageType → "catalog-home" | "category-index" | "item-list" | "article"
25
+ * // plan.contentBlocks → [{type, selector, importance, extractMode}]
26
+ * // plan.links → [{url, type, text}]
27
+ */
28
+
29
import { pathToFileURL } from "node:url"

import chalk from "chalk"
30
+
31
// ─── Constants ─────────────────────────────────────────────────────

// Ollama text-generation endpoint (localhost, port 11434).
const OLLAMA_URL = "http://localhost:11434/api/generate"
// Model used when the caller does not pass options.model.
const MODEL = "llama3.2"
// Default request timeout, in milliseconds.
const TIMEOUT_MS = 30000
// Sent to Ollama as the num_predict option (cap on generated tokens).
const MAX_TOKENS = 1024

// ─── Page types the analyzer can report ─────────────────────────────

/**
 * Canonical page-type identifiers used in extraction plans.
 *
 * The object is frozen because it is exported: a shared lookup table must not
 * be mutable by whichever module happens to import it.
 */
export const PAGE_TYPES = Object.freeze({
  CATALOG_HOME: "catalog-home",
  CATEGORY_INDEX: "category-index",
  ITEM_LIST: "item-list",
  ITEM_DETAIL: "item-detail",
  ARTICLE: "article",
  PRODUCT: "product",
  DOCS: "docs",
  PORTFOLIO: "portfolio",
  LANDING: "landing",
  UNKNOWN: "unknown",
})
52
+
53
+ // ─── Handler principal ─────────────────────────────────────────────
54
+
55
/**
 * Analyzes a web page and produces an extraction plan.
 *
 * Pipeline: build a structural summary of the HTML, turn it into a prompt,
 * query Ollama, then validate/enrich the plan that comes back.
 *
 * @param {string} html - Full HTML of the page
 * @param {string} url - URL of the page
 * @param {Object} [options]
 * @param {string} [options.model='llama3.2'] - Ollama model name
 * @param {number} [options.timeout=30000] - Request timeout in ms
 * @returns {Promise<Object>} Extraction plan
 */
export async function analyzePage(html, url, options = {}) {
  // `options = {}` only covers undefined; an explicit null must not crash.
  const settings = options ?? {}
  // `||` is deliberate: an empty model name or a 0 ms timeout is not usable.
  const model = settings.model || MODEL
  const timeout = settings.timeout || TIMEOUT_MS

  const summary = buildStructuralSummary(html, url)
  const prompt = buildPrompt(summary, url)
  const rawPlan = await queryOllama(prompt, model, timeout)
  const enrichedPlan = enrichPlan(rawPlan, summary, url)

  // enrichPlan stamps the module default into _model; record the model that
  // was actually queried so the metadata is truthful when options.model is set.
  return { ...enrichedPlan, _model: model }
}
83
+
84
+ // ─── Resumen estructural ───────────────────────────────────────────
85
+
86
/**
 * Builds a compact structural summary of an HTML document.
 *
 * The full HTML is never forwarded to the model: only tag counts, headings,
 * link samples, meta information, a few boolean indicators and a short excerpt
 * of the visible text are kept.
 *
 * @param {string} html - Raw HTML of the page
 * @param {string} url - URL of the page; used to tell internal links from external ones
 * @returns {Object} summary - Plain object of metrics (fields are listed in the returned literal)
 */
function buildStructuralSummary(html, url) {
  const source = typeof html === "string" ? html : String(html ?? "")

  // Comments and <script>/<style> bodies are not rendered. Dropping them first
  // keeps JS/CSS source out of the "visible text" excerpt and out of headings.
  const renderable = source
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<(script|style)(?=[\s>])[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
  const textContent = stripTags(renderable)

  // Element counts (key order is part of the summary shape).
  const countedTags = [
    "div",
    "article",
    "section",
    "nav",
    "main",
    "aside",
    "footer",
    "header",
    "table",
    "form",
    "img",
    "video",
    "a",
    "button",
    "ul",
    "ol",
    "li",
    "pre",
    "code",
    "iframe",
  ]
  const elementCounts = {}
  for (const tag of countedTags) {
    elementCounts[tag] = countTag(source, tag)
  }

  // Headings, grouped by level. The whole element is captured so that text
  // wrapped in nested markup (<h2><a>Title</a></h2>) is not lost.
  const headings = []
  for (let level = 1; level <= 6; level++) {
    const pattern = new RegExp(`<h${level}(?=[\\s>])[^>]*>([\\s\\S]*?)</h${level}\\s*>`, "gi")
    for (const match of renderable.matchAll(pattern)) {
      const text = stripTags(match[1])
      if (text.length > 2) {
        headings.push({ level, text: text.slice(0, 80) })
      }
    }
  }

  // Links: accept double-quoted, single-quoted and unquoted href values.
  const hrefPattern = /(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/gi
  const allLinks = [...source.matchAll(hrefPattern)]
  const internalLinks = []
  const externalLinks = []
  const seen = new Set()
  const pageHost = hostKey(url)

  for (const match of allLinks) {
    const href = (match[1] ?? match[2] ?? match[3] ?? "").trim()
    // The scheme test requires the colon so that relative paths such as
    // "telemetry.html" or "javascript-guide/" are not discarded.
    const isSkippable =
      href === "" || href.startsWith("#") || /^(?:javascript|mailto|tel):/i.test(href)
    if (isSkippable || seen.has(href)) continue
    seen.add(href)

    const isAsset = /\.(?:css|js)(?:[?#]|$)/i.test(href)
    if (/^(?:https?:)?\/\//i.test(href)) {
      // Absolute (or protocol-relative) URL: classify by host, not by prefix,
      // because many sites emit absolute URLs for their own pages.
      const sameHost = pageHost !== "" && hostKey(href, url) === pageHost
      if (!sameHost) {
        externalLinks.push(href.slice(0, 80))
      } else if (!isAsset) {
        internalLinks.push(href.slice(0, 80))
      }
    } else if (href.length > 2 && !isAsset) {
      internalLinks.push(href.slice(0, 80))
    }
  }

  // Meta information.
  const titleMatch = source.match(/<title(?=[\s>])[^>]*>([^<]*)<\/title\s*>/i)
  const title = titleMatch ? titleMatch[1].replace(/\s+/g, " ").trim() : ""
  const description = findMetaContent(source, "name", "description")
  const ogTitle = findMetaContent(source, "property", "og:title")
  const hasOG = source.includes("og:title") || source.includes("og:description")
  const hasTwitter = source.includes("twitter:card")
  const hasJSONLD = source.includes("application/ld+json")

  // Class attributes that start with a card-like prefix (possible catalog items).
  const cardPatterns = [
    ...new Set(
      ["card", "item", "product", "snippet"].flatMap(
        (prefix) => source.match(new RegExp(`class="?${prefix}[^" ]*"?`, "gi")) || []
      )
    ),
  ]

  // Visible text excerpt (first 1500 chars).
  const visibleText = textContent.slice(0, 1500).trim()

  const navText = extractNavText(source)

  return {
    url,
    title: title || ogTitle || "",
    description: description.slice(0, 200),
    sizeKB: (source.length / 1024).toFixed(0),
    hasOG,
    hasTwitter,
    hasJSONLD,
    elements: elementCounts,
    headings: headings.slice(0, 20),
    internalLinks: internalLinks.slice(0, 30),
    externalLinks: externalLinks.slice(0, 10),
    cardClasses: cardPatterns.slice(0, 10),
    visibleText,
    navText: navText.slice(0, 300),
    totalLinks: allLinks.length,
    hasPagination:
      source.includes("page=") || source.includes("/page/") || source.includes("pagination"),
    hasLoadMore:
      source.includes("load-more") || source.includes("loadMore") || source.includes("Load More"),
    hasInfiniteScroll: source.includes("infinite-scroll") || source.includes("infiniteScroll"),
    // "popovertarget" contains "popover", so one substring test covers both.
    hasPopover: source.includes("popover"),
    hasCodePen: source.includes("codepen") || source.includes("data-slug-hash"),
  }
}

/** Replaces every tag with a space and collapses whitespace. */
function stripTags(markup) {
  return markup
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim()
}

/** Counts opening tags named `tag`, whatever follows the name (space, newline, "/" or ">"). */
function countTag(markup, tag) {
  return (markup.match(new RegExp(`<${tag}(?=[\\s/>])`, "gi")) || []).length
}

/** Reads one attribute value from a single tag's source; returns null when the attribute is absent. */
function readAttribute(tagSource, name) {
  const match = tagSource.match(
    new RegExp(`(?<![\\w-])${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s"'>]+))`, "i")
  )
  return match ? (match[1] ?? match[2] ?? match[3] ?? "") : null
}

/**
 * Returns the `content` of the first <meta> tag whose `key` attribute equals
 * `expected`, regardless of attribute order; "" when there is none.
 */
function findMetaContent(markup, key, expected) {
  for (const [tagSource] of markup.matchAll(/<meta(?=[\s/>])[^>]*>/gi)) {
    const value = readAttribute(tagSource, key)
    if (value !== null && value.toLowerCase() === expected) {
      return readAttribute(tagSource, "content") ?? ""
    }
  }
  return ""
}

/** Host name of `value` (resolved against `base`) without a leading "www."; "" when unparsable. */
function hostKey(value, base) {
  try {
    return new URL(value, base).hostname.replace(/^www\./i, "")
  } catch {
    return ""
  }
}

/**
 * Extracts the text of the first <nav> element, to give a hint of the site structure.
 *
 * Only the first 2000 characters of the element's markup are scanned. A longer
 * <nav> is truncated at that point instead of being dropped altogether.
 *
 * @param {string} html
 * @returns {string} Navigation text with tags removed, or "" when there is no <nav>
 */
function extractNavText(html) {
  const scanLimit = 2000
  const opening = html.match(/<nav(?=[\s>])[^>]*>/i)
  if (!opening) return ""

  const start = opening.index + opening[0].length
  const scanned = html.slice(start, start + scanLimit)
  const closingAt = scanned.search(/<\/nav\s*>/i)
  const inner = closingAt === -1 ? scanned : scanned.slice(0, closingAt)

  // A truncated window may end in the middle of a tag; drop that fragment
  // before stripping the complete tags.
  return stripTags(inner.replace(/<[^>]*$/, ""))
}
232
+
233
+ // ─── Prompt ────────────────────────────────────────────────────────
234
+
235
/**
 * Builds the prompt sent to Ollama.
 *
 * The prompt asks for a compact JSON plan: page type, content blocks and the
 * links worth following. The page types listed in the prompt must stay in sync
 * with PAGE_TYPES; otherwise the model can never return the missing ones.
 *
 * @param {Object} summary - Structural summary produced by buildStructuralSummary
 * @param {string} url - URL of the page
 * @returns {string} Prompt
 */
function buildPrompt(summary, url) {
  const { elements } = summary
  const headings = summary.headings
    .slice(0, 8)
    .map((h) => ` H${h.level}: ${h.text}`)
    .join("\n")
  const internals = summary.internalLinks.slice(0, 10).join("\n ")
  const cards = summary.cardClasses.join(", ")

  return `Eres un clasificador de páginas web. Analiza la estructura y responde SOLO con JSON.

## PÁGINA
URL: ${url}
Título: ${summary.title || "(sin título)"}
Descripción: ${summary.description || "(sin descripción)"}
Tamaño: ${summary.sizeKB} KB

## ELEMENTOS
Divs:${elements.div} Articles:${elements.article} Sections:${elements.section} Nav:${elements.nav} Main:${elements.main}
Links:${elements.a} Buttons:${elements.button} Imgs:${elements.img} Lists:${elements.ul + elements.ol}
Tables:${elements.table} Forms:${elements.form} Code:${elements.pre + elements.code} Iframes:${elements.iframe}
${cards ? `Cards:${cards}` : ""}

## ENCABEZADOS (primeros 8)
${headings || "(ninguno)"}

## LINKS INTERNOS (primeros 10)
${internals || "(ninguno)"}

## INDICADORES
OG:${summary.hasOG} Twitter:${summary.hasTwitter} JSONLD:${summary.hasJSONLD}
Paginación:${summary.hasPagination} LoadMore:${summary.hasLoadMore} InfiniteScroll:${summary.hasInfiniteScroll} Popovers:${summary.hasPopover} CodePen:${summary.hasCodePen}

## TEXTO INICIO
${summary.visibleText.slice(0, 400)}

## REGLAS DE CLASIFICACIÓN
- catalog-home: página principal de un sitio con menú de tecnologías/categorías
- category-index: lista de subcategorías (ej: /tailwind-code-examples/ muestra buttons, cards, navbars, etc)
- item-list: grid de items/cards con previews (ej: /tailwind-buttons/ muestra 30+ botones)
- item-detail: página de detalle de un único item del catálogo
- article: artículo/blog con texto principal
- docs: documentación con secciones y navegación
- product: página de un producto individual
- portfolio: portfolio con muestra de proyectos o trabajos
- landing: landing page con hero, features, CTA
- unknown: no coincide con nada

## RESPONDE SOLO JSON
{
  "pageType": "UNO de: catalog-home | category-index | item-list | item-detail | article | docs | product | portfolio | landing | unknown",
  "confidence": 0-100,
  "title": "título",
  "description": "para qué sirve esta página en 1 frase",
  "links": [{"url":"ruta","type":"category|subcategory|item|page|nav","text":"texto"}],
  "contentBlocks": [{"type":"cards|sidebar|nav|footer|main-content|table|form|gallery","importance":1-10,"description":"qué contiene"}],
  "features": ["paginacion"|"load-more"|"popover"|"codepen"|"infinito"|"none"],
  "strategy": "navegar-subcategorias|extraer-items|extraer-articulo|extraer-generico",
  "maxDepth": 1-3,
  "followLinks": true,
  "linkFilter": "palabra clave para filtrar links relevantes"
}`
}
305
+
306
+ // ─── Ollama query ──────────────────────────────────────────────────
307
+
308
/**
 * Sends the prompt to Ollama and parses the JSON plan out of the reply.
 *
 * This is a deliberate best-effort call: any failure (network, timeout, HTTP
 * error, unparsable reply) is logged and replaced by the generic fallback
 * plan, so the function never rejects.
 *
 * @param {string} prompt
 * @param {string} model
 * @param {number} timeout - Abort the request after this many milliseconds
 * @returns {Promise<Object>} Parsed plan, or the fallback plan on failure
 */
async function queryOllama(prompt, model, timeout) {
  try {
    const response = await fetch(OLLAMA_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model,
        prompt,
        stream: false,
        // Ask Ollama to constrain the output to JSON; the prompt also requests it.
        format: "json",
        options: {
          temperature: 0.1,
          num_predict: MAX_TOKENS,
        },
      }),
      signal: AbortSignal.timeout(timeout),
    })

    if (!response.ok) {
      throw new Error(`Ollama HTTP ${response.status}`)
    }

    const data = await response.json()
    const text = typeof data.response === "string" ? data.response : ""

    // The reply may still be wrapped in prose or a markdown fence.
    const plan = parseFirstJsonObject(text)
    if (plan === null) {
      throw new Error("No se encontró JSON en la respuesta de Ollama")
    }

    return plan
  } catch (error) {
    console.log(chalk.yellow(` ⚠️ Ollama: ${error.message}`))
    // Fallback: generic plan
    return getFallbackPlan()
  }
}

/**
 * Returns the first top-level `{…}` span of `text` that parses as a JSON
 * object, or null when there is none.
 *
 * A greedy first-"{"-to-last-"}" match breaks as soon as the model appends any
 * text containing a brace, so candidates are delimited by brace matching.
 * Nested fragments of a truncated object are never returned.
 */
function parseFirstJsonObject(text) {
  let start = text.indexOf("{")
  while (start !== -1) {
    const end = findMatchingBrace(text, start)
    if (end === -1) return null
    try {
      const value = JSON.parse(text.slice(start, end + 1))
      if (value !== null && typeof value === "object") return value
    } catch {
      // Not valid JSON (e.g. braces used in prose): try the next candidate.
    }
    start = text.indexOf("{", end + 1)
  }
  return null
}

/**
 * Index of the "}" that closes the "{" at `start`, ignoring braces inside
 * double-quoted strings; -1 when the braces never balance.
 */
function findMatchingBrace(text, start) {
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = start; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      if (escaped) escaped = false
      else if (ch === "\\") escaped = true
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') {
      inString = true
    } else if (ch === "{") {
      depth += 1
    } else if (ch === "}") {
      depth -= 1
      if (depth === 0) return i
    }
  }
  return -1
}
353
+
354
+ // ─── Enriquecimiento del plan ──────────────────────────────────────
355
+
356
/**
 * Validates the plan returned by Ollama and fills in every missing field.
 *
 * Model output is untrusted, so each field is checked rather than copied:
 * - pageType must be one of PAGE_TYPES, otherwise it becomes "unknown"
 * - confidence is clamped to 0-100 (default 50)
 * - maxDepth is an integer clamped to 1-3 (default 1)
 * - links keeps only objects with a non-empty string `url`
 * - contentBlocks keeps only objects; features keeps only strings
 *
 * @param {Object} plan - Raw plan from Ollama
 * @param {Object} summary - Structural summary
 * @param {string} url
 * @returns {Object} Enriched plan
 */
function enrichPlan(plan, summary, url) {
  const raw = plan !== null && typeof plan === "object" ? plan : {}
  const info = summary !== null && typeof summary === "object" ? summary : {}

  const requestedType = typeof raw.pageType === "string" ? raw.pageType.trim().toLowerCase() : ""
  const pageType = Object.values(PAGE_TYPES).includes(requestedType)
    ? requestedType
    : PAGE_TYPES.UNKNOWN

  const confidence = toFiniteNumber(raw.confidence)
  const maxDepth = toFiniteNumber(raw.maxDepth)

  const isObject = (entry) => entry !== null && typeof entry === "object"

  return {
    pageType,
    confidence: confidence === null ? 50 : Math.min(Math.max(confidence, 0), 100),
    title: raw.title || info.title || hostnameOrRaw(url),
    description: raw.description || info.description || "",
    links: Array.isArray(raw.links)
      ? raw.links.filter(
          (link) => isObject(link) && typeof link.url === "string" && link.url !== ""
        )
      : [],
    contentBlocks: Array.isArray(raw.contentBlocks) ? raw.contentBlocks.filter(isObject) : [],
    features: Array.isArray(raw.features)
      ? raw.features.filter((feature) => typeof feature === "string")
      : [],
    strategy: raw.strategy || "extraer-generico",
    maxDepth: maxDepth === null ? 1 : Math.min(Math.max(Math.trunc(maxDepth), 1), 3),
    // Links are followed unless the plan explicitly says false.
    followLinks: raw.followLinks !== false,
    linkFilter: raw.linkFilter || "",
    // Additional metadata
    _summary: summary,
    _url: url,
    _timestamp: new Date().toISOString(),
    _model: MODEL,
  }
}

/** Converts a number or numeric string to a finite number; null for anything else. */
function toFiniteNumber(value) {
  if (typeof value === "number") return Number.isFinite(value) ? value : null
  if (typeof value === "string" && value.trim() !== "") {
    const parsed = Number(value)
    return Number.isFinite(parsed) ? parsed : null
  }
  return null
}

/** Host name of `url`, or the raw value as a string when it is not a valid absolute URL. */
function hostnameOrRaw(url) {
  try {
    return new URL(url).hostname
  } catch {
    return String(url ?? "")
  }
}
388
+
389
+ // ─── Fallback ──────────────────────────────────────────────────────
390
+
391
/**
 * Generic plan used when Ollama cannot be reached or its reply is unusable:
 * unknown page type, zero confidence, a single main-content block and no
 * link following.
 *
 * @returns {Object} A fresh plan object on every call
 */
function getFallbackPlan() {
  const mainContentBlock = {
    type: "main-content",
    importance: 10,
    description: "Contenido principal",
  }

  // Assembled in groups; the resulting key order matches the plan shape.
  return Object.assign(
    { pageType: PAGE_TYPES.UNKNOWN, confidence: 0 },
    { title: "", description: "" },
    { links: [], contentBlocks: [mainContentBlock], features: [] },
    { strategy: "extraer-generico", maxDepth: 1, followLinks: false, linkFilter: "" }
  )
}
409
+
410
+ // ─── Test directo ──────────────────────────────────────────────────
411
+
412
/**
 * Manual smoke test: downloads a page, runs the analyzer and prints the plan.
 * Usage: node src/core/webscrape/analyzer.js <url>
 *
 * @returns {Promise<void>}
 * @throws {Error} When the page download does not return a 2xx status
 */
async function testAnalyzer() {
  const url = process.argv[2] || "https://freefrontend.com/tailwind-code-examples/"

  console.log("")
  console.log(chalk.cyan(`🧠 Analyzer OPL`))
  console.log(chalk.gray(` URL: ${url}`))
  console.log(chalk.gray(` Modelo: ${MODEL}`))
  console.log("")

  const resp = await fetch(url, {
    headers: { "User-Agent": "Mozilla/5.0 (compatible; OPL-Analyzer/1.0)" },
  })
  // Without this check an error page (404, 500, …) would be analyzed as if it
  // were the requested content.
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} al descargar ${url}`)
  }
  const html = await resp.text()

  console.log(chalk.gray(` HTML: ${(html.length / 1024).toFixed(0)} KB`))

  const start = Date.now()
  const plan = await analyzePage(html, url)
  const elapsed = Date.now() - start

  console.log(chalk.green(` ✅ Analizado en ${elapsed}ms`))
  console.log("")
  console.log(chalk.cyan("═".repeat(50)))
  console.log(chalk.white("📋 PLAN DE EXTRACCIÓN"))
  console.log(chalk.cyan("═".repeat(50)))
  console.log(
    chalk.white(` Tipo: ${chalk.bold(plan.pageType)} (${plan.confidence}% confianza)`)
  )
  console.log(chalk.white(` Título: ${plan.title}`))
  console.log(chalk.white(` Estrategia: ${plan.strategy}`))
  console.log(chalk.white(` Profundidad: ${plan.maxDepth}`))
  console.log(chalk.white(` Seguir links: ${plan.followLinks}`))
  if (plan.linkFilter) console.log(chalk.white(` Filtro links: ${plan.linkFilter}`))

  if (plan.contentBlocks.length > 0) {
    console.log("")
    console.log(chalk.gray(" Bloques de contenido:"))
    for (const block of plan.contentBlocks) {
      const summaryText = String(block?.description || "").slice(0, 60)
      console.log(chalk.gray(` [${block?.importance}/10] ${block?.type} — ${summaryText}`))
    }
  }

  if (plan.links.length > 0) {
    console.log("")
    console.log(chalk.gray(" Enlaces detectados (primeros 10):"))
    for (const link of plan.links.slice(0, 10)) {
      // Link entries come from the model; url/text may be missing or not strings.
      const target = String(link?.url || "").slice(0, 40)
      const label = String(link?.text || "").slice(0, 40)
      console.log(chalk.gray(` [${link?.type}] ${target} — ${label}`))
    }
  }

  if (plan.features.length > 0) {
    console.log("")
    console.log(chalk.gray(` Features: ${plan.features.join(", ")}`))
  }

  console.log("")
}
477
+
478
// Allow direct execution (node src/core/webscrape/analyzer.js <url>).
// The entry script must be this very file: a substring test on the path would
// also fire when any other script or directory merely contains "analyzer",
// running the network self-test as a side effect of importing the module.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  testAnalyzer().catch((error) => {
    console.error(error)
    // Report the failure to the shell instead of exiting with status 0.
    process.exitCode = 1
  })
}