openprompt-lang 1.2.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -8
- package/docs/EMBEDDINGS.md +214 -0
- package/docs/FRAMEWORK.md +52 -0
- package/docs/ONBOARDING_WORKFLOW.md +151 -0
- package/docs/OPL-ERRORES.md +504 -0
- package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
- package/docs/WEB_SCRAPER_PLAN.md +454 -0
- package/package.json +7 -1
- package/scripts/postinstall.js +37 -0
- package/src/cli/commands-knowledge.js +1 -0
- package/src/cli/commands-opl.js +79 -1
- package/src/cli/commands-work.js +3 -1
- package/src/cli/commands-workflow.js +125 -6
- package/src/commands/init-core.js +188 -12
- package/src/commands/init-existing.js +13 -6
- package/src/commands/init-helpers.js +20 -14
- package/src/commands/knowledge-ops.js +52 -0
- package/src/commands/opl-embeddings.js +556 -0
- package/src/commands/opl-help.js +26 -2
- package/src/commands/opl-search.js +106 -2
- package/src/commands/opl-webscrape.js +390 -0
- package/src/commands/work-context.js +17 -0
- package/src/commands/workflow/close/index.js +2 -1
- package/src/commands/workflow/delivery/index.js +4 -0
- package/src/commands/workflow/discovery/index.js +4 -0
- package/src/commands/workflow/epic-cli.js +192 -0
- package/src/commands/workflow/select.js +146 -0
- package/src/commands/workflow/specification/index.js +4 -0
- package/src/commands/workflow/sprint-cli.js +174 -0
- package/src/core/engine/sandbox.js +7 -3
- package/src/core/webscrape/analyzer.js +481 -0
- package/src/core/webscrape/deep-scraper.js +1027 -0
- package/src/core/workflow/epic-manager.js +845 -0
- package/src/core/workflow/gates.js +180 -1
- package/src/core/workflow/selector.js +707 -0
- package/src/embeddings/chunker.js +450 -0
- package/src/embeddings/embedder.js +431 -0
- package/src/embeddings/index-pipeline.js +320 -0
- package/src/embeddings/vector-store.js +505 -0
- package/src/mcp-plan-server.js +12 -5
- package/src/mcp-shared-state.js +25 -0
- package/src/mcp-refactor/mcp-server.js +0 -171
- package/src/mcp-server-backup.js +0 -1913
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
// @use(kind, contract, limit, deps)
|
|
2
|
+
// @kind(module)
|
|
3
|
+
// @contract(in: html+url -> out: extraction plan, sideEffect: llama3.2 API call)
|
|
4
|
+
// @limit(lines: 350)
|
|
5
|
+
// @deps(node-fetch)
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Analyzer OPL — Analiza la estructura de una página web usando Ollama (llama3.2)
|
|
9
|
+
* y genera un plan de extracción personalizado.
|
|
10
|
+
*
|
|
11
|
+
* En lugar de reglas fijas (Readability.js solo para artículos), el analyzer
|
|
12
|
+
* entiende el tipo de página y dice QUÉ extraer y CÓMO.
|
|
13
|
+
*
|
|
14
|
+
* Flujo:
|
|
15
|
+
* 1. Recibe HTML + URL
|
|
16
|
+
* 2. Genera un resumen estructural compacto (~1500 chars)
|
|
17
|
+
* 3. Envía a llama3.2 con un prompt que pide un plan JSON
|
|
18
|
+
* 4. Parsea y valida la respuesta
|
|
19
|
+
* 5. Devuelve el plan de extracción
|
|
20
|
+
*
|
|
21
|
+
* Uso:
|
|
22
|
+
* import { analyzePage } from "./analyzer.js"
|
|
23
|
+
* const plan = await analyzePage(html, url)
|
|
24
|
+
* // plan.pageType → "catalog-home" | "category-index" | "item-list" | "article"
|
|
25
|
+
 * // plan.contentBlocks → [{type, importance, description}]
|
|
26
|
+
* // plan.links → [{url, type, text}]
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
import { realpathSync } from "node:fs"
import { pathToFileURL } from "node:url"

import chalk from "chalk"
|
|
30
|
+
|
|
31
|
+
// ─── Constantes ────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
// Endpoint of the local Ollama text-generation API.
const OLLAMA_URL = "http://localhost:11434/api/generate"
// Model used when the caller does not pick one.
const MODEL = "llama3.2"
// Request timeout, in milliseconds, used when the caller does not pick one.
const TIMEOUT_MS = 30_000
// Value sent to Ollama as `num_predict` (cap on generated tokens).
const MAX_TOKENS = 1_024
|
|
37
|
+
|
|
38
|
+
// ─── Tipos de página que el analyzer puede detectar ─────────────────
|
|
39
|
+
|
|
40
|
+
/**
 * Page types the analyzer can report in `plan.pageType`.
 *
 * Frozen so that importing modules cannot accidentally add, remove or
 * overwrite entries of this shared enum.
 */
export const PAGE_TYPES = Object.freeze({
  CATALOG_HOME: "catalog-home",
  CATEGORY_INDEX: "category-index",
  ITEM_LIST: "item-list",
  ITEM_DETAIL: "item-detail",
  ARTICLE: "article",
  PRODUCT: "product",
  DOCS: "docs",
  PORTFOLIO: "portfolio",
  LANDING: "landing",
  UNKNOWN: "unknown",
})
|
|
52
|
+
|
|
53
|
+
// ─── Handler principal ─────────────────────────────────────────────
|
|
54
|
+
|
|
55
|
+
/**
 * Analyzes a web page and produces an extraction plan.
 *
 * Pipeline: build a compact structural summary of the HTML, turn it into a
 * prompt, send the prompt to Ollama, then validate/complete the answer.
 *
 * @param {string} html - Full HTML of the page
 * @param {string} url - URL of the page
 * @param {Object} [options]
 * @param {string} [options.model='llama3.2'] - Ollama model name
 * @param {number} [options.timeout=30000] - Request timeout in ms
 * @returns {Promise<Object>} Extraction plan; `_model` holds the model that
 *   was actually requested for this call
 */
export async function analyzePage(html, url, options = {}) {
  // `options?.` tolerates an explicit `null`; falsy values (including a
  // timeout of 0) fall back to the module defaults, as before.
  const model = options?.model || MODEL
  const timeout = options?.timeout || TIMEOUT_MS

  const summary = buildStructuralSummary(html, url)
  const prompt = buildPrompt(summary, url)
  const plan = await queryOllama(prompt, model, timeout)

  const enrichedPlan = enrichPlan(plan, summary, url)

  // enrichPlan does not receive the model argument, so without this the
  // metadata would claim the default model even when the caller overrode it.
  enrichedPlan._model = model

  return enrichedPlan
}
|
|
83
|
+
|
|
84
|
+
// ─── Resumen estructural ───────────────────────────────────────────
|
|
85
|
+
|
|
86
|
+
/**
 * Builds a compact summary of a page's HTML structure.
 * The full HTML is never forwarded; only metrics and key patterns are kept.
 *
 * Everything here is regex-based heuristics, not a real HTML parser.
 *
 * @param {string} html - Page HTML (non-strings are coerced; null/undefined become "")
 * @param {string} url - Page URL; used to tell internal from external links
 * @returns {Object} summary with the keys: url, title, description, sizeKB,
 *   hasOG, hasTwitter, hasJSONLD, elements, headings, internalLinks,
 *   externalLinks, cardClasses, visibleText, navText, totalLinks,
 *   hasPagination, hasLoadMore, hasInfiniteScroll, hasPopover, hasCodePen
 */
function buildStructuralSummary(html, url) {
  const source = typeof html === "string" ? html : String(html ?? "")

  // Drops every tag and collapses whitespace.
  const stripTags = (fragment) =>
    fragment
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim()

  // Comments and <script>/<style> bodies are not visible content; leaving
  // them in would fill the "visible text" with JS/CSS and yield bogus
  // headings and hrefs taken from inline code.
  const markupOnly = source
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")

  const textContent = stripTags(markupOnly)

  // Element counts. The lookahead accepts any whitespace, "/" or ">" after
  // the tag name, so "<div\n class=…>" is counted while "<article>" is not
  // mistaken for "<a>".
  const countedTags = [
    "div",
    "article",
    "section",
    "nav",
    "main",
    "aside",
    "footer",
    "header",
    "table",
    "form",
    "img",
    "video",
    "a",
    "button",
    "ul",
    "ol",
    "li",
    "pre",
    "code",
    "iframe",
  ]
  const elementCounts = Object.fromEntries(
    countedTags.map((tag) => [
      tag,
      (source.match(new RegExp(`<${tag}(?=[\\s/>])`, "gi")) || []).length,
    ])
  )

  // Headings, grouped by level (all h1 first, then h2, …). The whole element
  // body is captured so headings wrapping a link or span keep their text.
  const headings = []
  for (let level = 1; level <= 6; level++) {
    const pattern = new RegExp(`<h${level}\\b[^>]*>([\\s\\S]*?)<\\/h${level}\\s*>`, "gi")
    for (const match of markupOnly.matchAll(pattern)) {
      const text = stripTags(match[1])
      if (text.length > 2) {
        headings.push({ level, text: text.slice(0, 80) })
      }
    }
  }

  // Links. Handles double-quoted, single-quoted and unquoted values; the
  // lookbehind skips attributes such as data-href.
  const allLinks = [
    ...markupOnly.matchAll(/(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/gi),
  ]

  const normalizeHost = (host) => host.toLowerCase().replace(/^www\./, "")
  let pageHost = ""
  try {
    pageHost = normalizeHost(new URL(url).host)
  } catch {
    // Unparseable page URL: fall back to the syntactic check below.
  }

  // A link is external when it resolves to a different host than the page.
  const isExternalLink = (href) => {
    if (pageHost) {
      try {
        return normalizeHost(new URL(href, url).host) !== pageHost
      } catch {
        // Unresolvable href: fall through to the syntactic check.
      }
    }
    return /^(?:https?:)?\/\//i.test(href)
  }

  const internalLinks = []
  const externalLinks = []
  const seen = new Set()

  for (const match of allLinks) {
    const href = (match[1] ?? match[2] ?? match[3] ?? "").trim()

    // The scheme check requires the colon so relative paths like
    // "television/" or "javascript-guide/" are not discarded.
    if (
      seen.has(href) ||
      href.startsWith("#") ||
      /^(?:javascript|mailto|tel|data):/i.test(href)
    ) {
      continue
    }
    seen.add(href)

    if (isExternalLink(href)) {
      externalLinks.push(href.slice(0, 80))
    } else if (href.length > 2 && !href.endsWith(".css") && !href.endsWith(".js")) {
      internalLinks.push(href.slice(0, 80))
    }
  }

  // Meta info. Attribute order inside <meta> does not matter, and the value
  // may contain the other kind of quote (content="It's …").
  const getMetaContent = (attr, value) => {
    const attrPattern = new RegExp(
      `(?<![\\w-])${attr}\\s*=\\s*["']?${value}["']?(?=[\\s/>])`,
      "i"
    )
    for (const [tag] of source.matchAll(/<meta\b[^>]*>/gi)) {
      if (!attrPattern.test(tag)) continue
      const content = tag.match(/(?<![\w-])content\s*=\s*(?:"([^"]*)"|'([^']*)')/i)
      if (content) return (content[1] ?? content[2] ?? "").trim()
    }
    return ""
  }

  const title = ((source.match(/<title\b[^>]*>([^<]*)<\/title>/i) || [])[1] || "").trim()
  const description = getMetaContent("name", "description")
  const ogTitle = getMetaContent("property", "og:title")
  const hasOG = source.includes("og:title") || source.includes("og:description")
  const hasTwitter = source.includes("twitter:card")
  const hasJSONLD = source.includes("application/ld+json")

  // Class attributes that start with a card-like prefix (possible catalog items).
  const cardPatterns = [
    ...new Set(
      ["card", "item", "product", "snippet"].flatMap(
        (prefix) => source.match(new RegExp(`class="?${prefix}[^" ]*"?`, "gi")) || []
      )
    ),
  ]

  // extractNavText is a sibling helper in this module.
  const navText = extractNavText(source)

  return {
    url,
    title: title || ogTitle || "",
    description: description.slice(0, 200),
    sizeKB: (source.length / 1024).toFixed(0),
    hasOG,
    hasTwitter,
    hasJSONLD,
    elements: elementCounts,
    headings: headings.slice(0, 20),
    internalLinks: internalLinks.slice(0, 30),
    externalLinks: externalLinks.slice(0, 10),
    cardClasses: cardPatterns.slice(0, 10),
    visibleText: textContent.slice(0, 1500).trim(),
    navText: navText.slice(0, 300),
    totalLinks: allLinks.length,
    hasPagination:
      source.includes("page=") || source.includes("/page/") || source.includes("pagination"),
    hasLoadMore:
      source.includes("load-more") || source.includes("loadMore") || source.includes("Load More"),
    hasInfiniteScroll: source.includes("infinite-scroll") || source.includes("infiniteScroll"),
    hasPopover: source.includes("popover") || source.includes("popovertarget"),
    hasCodePen: source.includes("codepen") || source.includes("data-slug-hash"),
  }
}
|
|
219
|
+
|
|
220
|
+
/**
 * Extracts the text of the first <nav> element (to understand site structure).
 *
 * Only the first 2000 characters of the nav markup are considered. A longer
 * nav is truncated rather than ignored, so large menus still contribute text.
 *
 * @param {string} html - Page HTML
 * @returns {string} Whitespace-normalized nav text, or "" when there is no
 *   <nav> element or `html` is not a string
 */
function extractNavText(html) {
  if (typeof html !== "string") return ""

  // The lookahead keeps custom elements such as <navbar> from matching.
  const navMatch = html.match(/<nav(?=[\s>])[^>]*>([\s\S]*?)<\/nav\s*>/i)
  if (!navMatch) return ""

  return navMatch[1]
    .slice(0, 2000)
    // The cut may land inside a tag; drop the dangling "<…" fragment so it
    // does not leak attribute text into the result.
    .replace(/<[^>]*$/, "")
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim()
}
|
|
232
|
+
|
|
233
|
+
// ─── Prompt ────────────────────────────────────────────────────────
|
|
234
|
+
|
|
235
|
+
/**
 * Builds the prompt sent to Ollama.
 * Asks for a compact JSON plan with the page type, content blocks and the
 * links worth following.
 *
 * The prompt text itself is Spanish and is part of the program's behavior.
 *
 * @param {Object} summary - Structural summary (see buildStructuralSummary)
 * @param {string} url - URL of the page
 * @returns {string} Prompt
 */
function buildPrompt(summary, url) {
  const elements = summary.elements
  const headings = summary.headings
    .slice(0, 8)
    .map((h) => ` H${h.level}: ${h.text}`)
    .join("\n")
  const internals = summary.internalLinks.slice(0, 10).join("\n ")
  const cards = summary.cardClasses.join(", ")
  // The response schema offers "infinito" as a feature, so the model needs to
  // see the matching indicator; Boolean() avoids printing "undefined" when a
  // caller-built summary lacks the flag.
  const hasInfiniteScroll = Boolean(summary.hasInfiniteScroll)

  return `Eres un clasificador de páginas web. Analiza la estructura y responde SOLO con JSON.

## PÁGINA
URL: ${url}
Título: ${summary.title || "(sin título)"}
Descripción: ${summary.description || "(sin descripción)"}
Tamaño: ${summary.sizeKB} KB

## ELEMENTOS
Divs:${elements.div} Articles:${elements.article} Sections:${elements.section} Nav:${elements.nav} Main:${elements.main}
Links:${elements.a} Buttons:${elements.button} Imgs:${elements.img} Lists:${elements.ul + elements.ol}
Tables:${elements.table} Forms:${elements.form} Code:${elements.pre + elements.code} Iframes:${elements.iframe}
${cards ? `Cards:${cards}` : ""}

## ENCABEZADOS (primeros 8)
${headings || "(ninguno)"}

## LINKS INTERNOS (primeros 10)
${internals || "(ninguno)"}

## INDICADORES
OG:${summary.hasOG} Twitter:${summary.hasTwitter} JSONLD:${summary.hasJSONLD}
Paginación:${summary.hasPagination} LoadMore:${summary.hasLoadMore} Infinito:${hasInfiniteScroll} Popovers:${summary.hasPopover} CodePen:${summary.hasCodePen}

## TEXTO INICIO
${summary.visibleText.slice(0, 400)}

## REGLAS DE CLASIFICACIÓN
- catalog-home: página principal de un sitio con menú de tecnologías/categorías
- category-index: lista de subcategorías (ej: /tailwind-code-examples/ muestra buttons, cards, navbars, etc)
- item-list: grid de items/cards con previews (ej: /tailwind-buttons/ muestra 30+ botones)
- article: artículo/blog con texto principal
- docs: documentación con secciones y navegación
- product: página de un producto individual
- landing: landing page con hero, features, CTA
- unknown: no coincide con nada

## RESPONDE SOLO JSON
{
"pageType": "UNO de: catalog-home | category-index | item-list | article | docs | product | landing | unknown",
"confidence": 0-100,
"title": "título",
"description": "para qué sirve esta página en 1 frase",
"links": [{"url":"ruta","type":"category|subcategory|item|page|nav","text":"texto"}],
"contentBlocks": [{"type":"cards|sidebar|nav|footer|main-content|table|form|gallery","importance":1-10,"description":"qué contiene"}],
"features": ["paginacion"|"load-more"|"popover"|"codepen"|"infinito"|"none"],
"strategy": "navegar-subcategorias|extraer-items|extraer-articulo|extraer-generico",
"maxDepth": 1-3,
"followLinks": true,
"linkFilter": "palabra clave para filtrar links relevantes"
}`
}
|
|
305
|
+
|
|
306
|
+
// ─── Ollama query ──────────────────────────────────────────────────
|
|
307
|
+
|
|
308
|
+
/**
 * Returns the first balanced `{…}` span in `text` that parses as a JSON
 * object, or `undefined` when there is none.
 *
 * Braces inside JSON string literals are ignored while balancing, and a
 * candidate that fails to parse is skipped in favor of the next "{".
 *
 * @param {string} text
 * @returns {Object|undefined}
 */
function parseFirstJsonObject(text) {
  for (let start = text.indexOf("{"); start !== -1; start = text.indexOf("{", start + 1)) {
    let depth = 0
    let inString = false
    let escaped = false

    for (let i = start; i < text.length; i++) {
      const ch = text[i]

      if (inString) {
        if (escaped) escaped = false
        else if (ch === "\\") escaped = true
        else if (ch === '"') inString = false
        continue
      }

      if (ch === '"') {
        inString = true
      } else if (ch === "{") {
        depth++
      } else if (ch === "}") {
        depth--
        if (depth === 0) {
          try {
            const value = JSON.parse(text.slice(start, i + 1))
            if (value !== null && typeof value === "object") return value
          } catch {
            // Not valid JSON from this "{"; try the next candidate.
          }
          break
        }
      }
    }
  }
  return undefined
}

/**
 * Sends the prompt to Ollama and parses the JSON object in the answer.
 *
 * Never rejects: on any failure (network error, timeout, non-2xx status, no
 * parsable JSON) a warning is logged and the generic fallback plan from
 * getFallbackPlan() is returned instead.
 *
 * @param {string} prompt
 * @param {string} model - Ollama model name
 * @param {number} timeout - Abort the request after this many milliseconds
 * @returns {Promise<Object>} Raw plan from the model, or the fallback plan
 */
async function queryOllama(prompt, model, timeout) {
  try {
    const response = await fetch(OLLAMA_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model,
        prompt,
        stream: false,
        // Ask Ollama for JSON output; the prompt also requests JSON only.
        format: "json",
        options: {
          temperature: 0.1,
          num_predict: MAX_TOKENS,
        },
      }),
      signal: AbortSignal.timeout(timeout),
    })

    if (!response.ok) {
      throw new Error(`Ollama HTTP ${response.status}`)
    }

    const data = await response.json()
    const text = typeof data?.response === "string" ? data.response : ""

    // The answer may be wrapped in markdown fences or surrounded by prose
    // (possibly containing braces), so scan for the first balanced object
    // instead of grabbing everything between the first "{" and the last "}".
    const plan = parseFirstJsonObject(text)
    if (plan === undefined) {
      throw new Error("No se encontró JSON en la respuesta de Ollama")
    }

    return plan
  } catch (error) {
    console.log(chalk.yellow(` ⚠️ Ollama: ${error.message}`))
    // Best effort by design: degrade to a generic plan.
    return getFallbackPlan()
  }
}
|
|
353
|
+
|
|
354
|
+
// ─── Enriquecimiento del plan ──────────────────────────────────────
|
|
355
|
+
|
|
356
|
+
/**
 * Validates and completes the plan returned by Ollama so every field the
 * rest of the pipeline reads exists and is within its expected range.
 *
 * Normalization applied to the model's (untrusted) output:
 * - `pageType` must be one of PAGE_TYPES, otherwise "unknown"
 * - `confidence` is clamped to 0–100 (default 50)
 * - `maxDepth` is truncated to an integer and clamped to 1–3 (default 1)
 * - `links` keeps only objects with a non-empty string `url`
 * - `contentBlocks` keeps only objects, `features` only strings
 *
 * @param {Object} plan - Raw plan from Ollama (a non-object is treated as empty)
 * @param {Object} summary - Structural summary
 * @param {string} url - Page URL
 * @returns {Object} Completed plan, plus `_summary`, `_url`, `_timestamp`
 *   and `_model` metadata
 */
function enrichPlan(plan, summary, url) {
  const isRecord = (value) =>
    value !== null && typeof value === "object" && !Array.isArray(value)
  const clamp = (value, min, max) => Math.min(Math.max(value, min), max)

  const raw = isRecord(plan) ? plan : {}

  const requestedType =
    typeof raw.pageType === "string" ? raw.pageType.trim().toLowerCase() : ""
  const pageType = Object.values(PAGE_TYPES).includes(requestedType)
    ? requestedType
    : PAGE_TYPES.UNKNOWN

  // Last-resort title: the hostname, or the raw URL text when it cannot be
  // parsed (new URL() throws on relative or malformed input).
  const hostnameOf = (value) => {
    try {
      return new URL(value).hostname
    } catch {
      return String(value ?? "")
    }
  }

  const enriched = {
    pageType,
    confidence: Number.isFinite(raw.confidence) ? clamp(raw.confidence, 0, 100) : 50,
    title: raw.title || summary?.title || hostnameOf(url),
    description: raw.description || summary?.description || "",
    links: Array.isArray(raw.links)
      ? raw.links.filter(
          (link) => isRecord(link) && typeof link.url === "string" && link.url.length > 0
        )
      : [],
    contentBlocks: Array.isArray(raw.contentBlocks) ? raw.contentBlocks.filter(isRecord) : [],
    features: Array.isArray(raw.features)
      ? raw.features.filter((feature) => typeof feature === "string")
      : [],
    strategy: raw.strategy || "extraer-generico",
    maxDepth: Number.isFinite(raw.maxDepth) ? clamp(Math.trunc(raw.maxDepth), 1, 3) : 1,
    // Anything other than an explicit `false` keeps link following enabled.
    followLinks: raw.followLinks !== false,
    linkFilter: raw.linkFilter || "",
    // Additional metadata
    _summary: summary,
    _url: url,
    _timestamp: new Date().toISOString(),
    _model: MODEL,
  }

  return enriched
}
|
|
388
|
+
|
|
389
|
+
// ─── Fallback ──────────────────────────────────────────────────────
|
|
390
|
+
|
|
391
|
+
/**
 * Generic plan used when no usable answer could be obtained from Ollama.
 *
 * @returns {Object} A fresh plan object on every call: unknown page type,
 *   zero confidence, a single main-content block and link following disabled
 */
function getFallbackPlan() {
  const mainBlock = {
    type: "main-content",
    importance: 10,
    description: "Contenido principal",
  }

  const plan = {
    pageType: PAGE_TYPES.UNKNOWN,
    confidence: 0,
    title: "",
    description: "",
  }
  plan.links = []
  plan.contentBlocks = [mainBlock]
  plan.features = []
  plan.strategy = "extraer-generico"
  plan.maxDepth = 1
  plan.followLinks = false
  plan.linkFilter = ""

  return plan
}
|
|
409
|
+
|
|
410
|
+
// ─── Test directo ──────────────────────────────────────────────────
|
|
411
|
+
|
|
412
|
+
/**
 * Manual smoke test that shows the analyzer in action: downloads a page,
 * runs analyzePage on it and prints the resulting plan.
 *
 * Usage: node src/core/webscrape/analyzer.js <url>
 *
 * @returns {Promise<void>}
 * @throws {Error} When the page cannot be downloaded (network error,
 *   timeout or non-2xx status)
 */
async function testAnalyzer() {
  const url = process.argv[2] || "https://freefrontend.com/tailwind-code-examples/"

  console.log("")
  console.log(chalk.cyan(`🧠 Analyzer OPL`))
  console.log(chalk.gray(` URL: ${url}`))
  console.log(chalk.gray(` Modelo: ${MODEL}`))
  console.log("")

  const resp = await fetch(url, {
    headers: { "User-Agent": "Mozilla/5.0 (compatible; OPL-Analyzer/1.0)" },
    // Do not hang forever on an unresponsive host.
    signal: AbortSignal.timeout(TIMEOUT_MS),
  })
  // Analyzing an error page would print a plausible-looking but meaningless plan.
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} al descargar ${url}`)
  }
  const html = await resp.text()

  console.log(chalk.gray(` HTML: ${(html.length / 1024).toFixed(0)} KB`))

  const start = Date.now()
  const plan = await analyzePage(html, url)
  const elapsed = Date.now() - start

  console.log(chalk.green(` ✅ Analizado en ${elapsed}ms`))
  console.log("")
  console.log(chalk.cyan("═".repeat(50)))
  console.log(chalk.white("📋 PLAN DE EXTRACCIÓN"))
  console.log(chalk.cyan("═".repeat(50)))
  console.log(
    chalk.white(` Tipo: ${chalk.bold(plan.pageType)} (${plan.confidence}% confianza)`)
  )
  console.log(chalk.white(` Título: ${plan.title}`))
  console.log(chalk.white(` Estrategia: ${plan.strategy}`))
  console.log(chalk.white(` Profundidad: ${plan.maxDepth}`))
  console.log(chalk.white(` Seguir links: ${plan.followLinks}`))
  if (plan.linkFilter) console.log(chalk.white(` Filtro links: ${plan.linkFilter}`))

  if (plan.contentBlocks.length > 0) {
    console.log("")
    console.log(chalk.gray(" Bloques de contenido:"))
    for (const block of plan.contentBlocks) {
      console.log(
        chalk.gray(
          ` [${block.importance}/10] ${block.type} — ${(block.description || "").slice(0, 60)}`
        )
      )
    }
  }

  if (plan.links.length > 0) {
    console.log("")
    console.log(chalk.gray(" Enlaces detectados (primeros 10):"))
    for (const link of plan.links.slice(0, 10)) {
      // Link entries come from the model, so `url` or `text` may be missing
      // or not a string.
      console.log(
        chalk.gray(
          ` [${link.type}] ${String(link.url ?? "").slice(0, 40)} — ${String(link.text ?? "").slice(0, 40)}`
        )
      )
    }
  }

  if (plan.features.length > 0) {
    console.log("")
    console.log(chalk.gray(` Features: ${plan.features.join(", ")}`))
  }

  console.log("")
}
|
|
477
|
+
|
|
478
|
+
/**
 * Tells whether this module is the script Node was started with.
 *
 * Compares this module's URL with the real path of process.argv[1]. A
 * substring test on the entry path would also fire when the module is merely
 * imported by a script whose path happens to contain "analyzer".
 *
 * @returns {boolean}
 */
function isDirectRun() {
  const entry = process.argv[1]
  if (!entry) return false
  try {
    // realpathSync resolves symlinked entry points before the comparison.
    return import.meta.url === pathToFileURL(realpathSync(entry)).href
  } catch {
    // Entry path does not exist on disk (e.g. `node -e`): not a direct run.
    return false
  }
}

// Allow direct execution: node src/core/webscrape/analyzer.js <url>
if (isDirectRun()) {
  testAnalyzer().catch(console.error)
}
|