openprompt-lang 1.2.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -8
- package/docs/EMBEDDINGS.md +214 -0
- package/docs/FRAMEWORK.md +52 -0
- package/docs/ONBOARDING_WORKFLOW.md +151 -0
- package/docs/OPL-ERRORES.md +504 -0
- package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
- package/docs/WEB_SCRAPER_PLAN.md +454 -0
- package/package.json +7 -1
- package/scripts/postinstall.js +37 -0
- package/src/cli/commands-knowledge.js +1 -0
- package/src/cli/commands-opl.js +79 -1
- package/src/cli/commands-work.js +3 -1
- package/src/cli/commands-workflow.js +125 -6
- package/src/commands/init-core.js +188 -12
- package/src/commands/init-existing.js +13 -6
- package/src/commands/init-helpers.js +20 -14
- package/src/commands/knowledge-ops.js +52 -0
- package/src/commands/opl-embeddings.js +556 -0
- package/src/commands/opl-help.js +26 -2
- package/src/commands/opl-search.js +106 -2
- package/src/commands/opl-webscrape.js +390 -0
- package/src/commands/work-context.js +17 -0
- package/src/commands/workflow/close/index.js +2 -1
- package/src/commands/workflow/delivery/index.js +4 -0
- package/src/commands/workflow/discovery/index.js +4 -0
- package/src/commands/workflow/epic-cli.js +192 -0
- package/src/commands/workflow/select.js +146 -0
- package/src/commands/workflow/specification/index.js +4 -0
- package/src/commands/workflow/sprint-cli.js +174 -0
- package/src/core/engine/sandbox.js +7 -3
- package/src/core/webscrape/analyzer.js +481 -0
- package/src/core/webscrape/deep-scraper.js +1027 -0
- package/src/core/workflow/epic-manager.js +845 -0
- package/src/core/workflow/gates.js +180 -1
- package/src/core/workflow/selector.js +707 -0
- package/src/embeddings/chunker.js +450 -0
- package/src/embeddings/embedder.js +431 -0
- package/src/embeddings/index-pipeline.js +320 -0
- package/src/embeddings/vector-store.js +505 -0
- package/src/mcp-plan-server.js +12 -5
- package/src/mcp-shared-state.js +25 -0
- package/src/mcp-refactor/mcp-server.js +0 -171
- package/src/mcp-server-backup.js +0 -1913
|
@@ -0,0 +1,1027 @@
|
|
|
1
|
+
// @use(kind, contract, limit, deps)
|
|
2
|
+
// @kind(module)
|
|
3
|
+
// @contract(in: url -> out: structured knowledge tree, sideEffect: multiple fetches + saves)
|
|
4
|
+
// @limit(lines: 500)
|
|
5
|
+
// @deps(../commands/knowledge-helpers, node-fetch, jsdom)
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Deep Scraper OPL — Navega sitios jerárquicos (catálogos, directorios)
|
|
9
|
+
* y extrae contenido estructurado con navegación recursiva.
|
|
10
|
+
*
|
|
11
|
+
* Arquitectura:
|
|
12
|
+
* 1. Detecta estructura del sitio (categorías, subcategorías, items)
|
|
13
|
+
* 2. Sigue enlaces recursivamente con profundidad configurable
|
|
14
|
+
* 3. Extrae metadatos de cada página (cards, listados, etc.)
|
|
15
|
+
* 4. Intenta obtener código fuente (CodePen, GitHub, etc.)
|
|
16
|
+
* 5. Guarda todo en knowledge/ con estructura jerárquica
|
|
17
|
+
*
|
|
18
|
+
* Uso:
|
|
19
|
+
* import { deepScrape } from "./deep-scraper.js"
|
|
20
|
+
* await deepScrape("https://freefrontend.com", { maxDepth: 3 })
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import chalk from "chalk"
|
|
24
|
+
import { analyzePage, PAGE_TYPES as ANALYZER_PAGE_TYPES } from "./analyzer.js"
|
|
25
|
+
|
|
26
|
+
// Modelo Ollama por defecto (llama3.2 es más ligero para PCs modestas)
|
|
27
|
+
const DEFAULT_OLLAMA_MODEL = "llama3.2"
|
|
28
|
+
|
|
29
|
+
// ─── Constantes ────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
const DEFAULT_MAX_DEPTH = 2
|
|
32
|
+
const DEFAULT_CONCURRENCY = 3
|
|
33
|
+
const REQUEST_TIMEOUT = 15000
|
|
34
|
+
const USER_AGENT = "Mozilla/5.0 (compatible; OPL-DeepScraper/1.0; +https://openprompt-lang.dev)"
|
|
35
|
+
|
|
36
|
+
// ─── Tipos de página detectables ────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
// Page types the scraper can classify. Only CATALOG_HOME, CATEGORY_INDEX and
// ITEM_LIST have dedicated handlers in scrapeRecursive; every other type is
// processed as generic content. PRODUCT, DOCS, PORTFOLIO and LANDING exist
// because mapAnalyzerType references them when translating analyzer results.
const PAGE_TYPES = {
  CATALOG_HOME: "catalog-home", // freefrontend.com (tech categories)
  CATEGORY_INDEX: "category-index", // /tailwind-code-examples/ (subcategories list)
  ITEM_LIST: "item-list", // /tailwind-buttons/ (card grid)
  ITEM_DETAIL: "item-detail", // Individual example page
  ARTICLE: "article", // Blog post / article
  PRODUCT: "product",
  DOCS: "docs",
  PORTFOLIO: "portfolio",
  LANDING: "landing",
  UNKNOWN: "unknown",
}
|
|
46
|
+
|
|
47
|
+
// ─── Handler principal ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Runs a deep scrape starting from a root URL.
 *
 * Never rejects: failures raised while crawling are caught, logged and
 * reported through `success: false` plus an `error` message.
 *
 * @param {string} startUrl - Initial URL
 * @param {Object} [options]
 * @param {number} [options.maxDepth=2] - Maximum navigation depth; 0 scrapes only the start page
 * @param {number} [options.concurrency=3] - Simultaneous fetches per batch (positive integer)
 * @param {string} [options.domain='web'] - Target OPL domain
 * @param {boolean} [options.noEmbed] - Skip embeddings
 * @param {boolean} [options.verbose] - Detailed logging
 * @param {boolean} [options.ollama=true] - Pass `false` to skip Ollama classification of the root page
 * @returns {Promise<{success: boolean, pagesScraped: number, itemsExtracted: number, tree: Object, error?: string}>}
 */
export async function deepScrape(startUrl, options = {}) {
  // Accept numbers or numeric strings (e.g. raw CLI flags) and fall back to the
  // default for anything that is not an integer >= min. A string or negative
  // batch size would otherwise break the `i += batchSize` loops in the handlers.
  const toInteger = (value, min, fallback) => {
    if (value == null || value === "") return fallback
    const parsed = Number(value)
    return Number.isInteger(parsed) && parsed >= min ? parsed : fallback
  }

  // maxDepth 0 is a legitimate request ("root page only"), so it must not be
  // replaced by the default the way `||` would.
  const maxDepth = toInteger(options.maxDepth, 0, DEFAULT_MAX_DEPTH)
  const concurrency = toInteger(options.concurrency, 1, DEFAULT_CONCURRENCY)
  const domain = options.domain || "web"
  const verbose = options.verbose === true
  const noEmbed = options.noEmbed === true
  const useOllama = options.ollama !== false // default: true

  console.log("")
  console.log(chalk.cyan(`🕸️ Deep Scraper: "${chalk.bold(startUrl)}"`))
  console.log(chalk.gray(` Profundidad máxima: ${maxDepth}`))
  console.log(chalk.gray(` Concurrencia: ${concurrency}`))
  if (useOllama) console.log(chalk.gray(` 🧠 Ollama: habilitado (${DEFAULT_OLLAMA_MODEL})`))
  console.log("")

  const visited = new Set()
  const tree = {
    url: startUrl,
    type: null,
    title: null,
    children: [],
    items: [],
    stats: { pagesScraped: 0, itemsExtracted: 0 },
  }

  try {
    await scrapeRecursive(startUrl, 0, maxDepth, visited, tree, {
      domain,
      verbose,
      concurrency,
      noEmbed,
      useOllama,
      ollamaModel: DEFAULT_OLLAMA_MODEL,
    })

    // Final summary
    console.log("")
    console.log(chalk.cyan("═".repeat(50)))
    console.log(chalk.green(`✅ Deep Scrape completado`))
    console.log(chalk.gray(` Páginas scrapeadas: ${tree.stats.pagesScraped}`))
    console.log(chalk.gray(` Items extraídos: ${tree.stats.itemsExtracted}`))
    console.log(chalk.gray(` Nivel de profundidad: ${maxDepth}`))
    if (tree.stats.codepenLinks) {
      console.log(chalk.gray(` Enlaces a CodePen: ${tree.stats.codepenLinks}`))
    }
    console.log("")

    return {
      success: true,
      pagesScraped: tree.stats.pagesScraped,
      itemsExtracted: tree.stats.itemsExtracted,
      tree,
    }
  } catch (error) {
    console.log(chalk.red(`\n❌ Deep Scrape falló: ${error.message}\n`))
    // Return whatever was collected before the failure so callers can inspect it.
    return {
      success: false,
      pagesScraped: tree.stats.pagesScraped,
      itemsExtracted: tree.stats.itemsExtracted,
      tree,
      error: error.message,
    }
  }
}
|
|
125
|
+
|
|
126
|
+
// ─── Navegación recursiva ─────────────────────────────────────────
|
|
127
|
+
|
|
128
|
+
/**
 * Scrapes one URL and decides how to continue based on the detected page type.
 *
 * The URL is marked as visited before anything else, so it is never retried,
 * even when the fetch yields nothing.
 *
 * @param {string} url - Absolute URL to scrape
 * @param {number} depth - Current depth (0 for the root page)
 * @param {number} maxDepth - Maximum depth; deeper URLs are skipped
 * @param {Set} visited - URLs already visited (shared across the whole crawl)
 * @param {Object} tree - Result node that receives type, title, children and stats
 * @param {Object} opts - Crawl options (verbose, useOllama, ollamaModel, concurrency, domain, ...)
 * @returns {Promise<void>}
 */
async function scrapeRecursive(url, depth, maxDepth, visited, tree, opts) {
  if (visited.has(url) || depth > maxDepth) return
  visited.add(url)

  // A single malformed link must not reject the caller's Promise.all batch and
  // abort the entire crawl, so an unparsable URL is skipped instead of thrown.
  let hostname
  try {
    hostname = new URL(url).hostname
  } catch {
    if (opts.verbose) {
      console.log(chalk.yellow(` ${" ".repeat(depth)}⚠️ URL inválida, se omite: ${url}`))
    }
    return
  }

  const isSameDomain = (link) => {
    try {
      return new URL(link, url).hostname === hostname
    } catch {
      return false
    }
  }

  if (opts.verbose) {
    console.log(chalk.gray(` ${" ".repeat(depth)}🔍 ${depth > 0 ? "└─ " : ""}${url}`))
  }

  // Fetch HTML
  const html = await fetchPage(url)
  if (!html) return

  tree.stats.pagesScraped++

  // Detect the page type
  let pageType
  let ollamaPlan = null

  // Ollama is only consulted for the root page (depth === 0);
  // subpages rely on the faster heuristics.
  if (opts.useOllama && depth === 0) {
    try {
      ollamaPlan = await analyzePage(html, url, { model: opts.ollamaModel })
      pageType = mapAnalyzerType(ollamaPlan.pageType)
      if (opts.verbose) {
        console.log(
          chalk.gray(
            ` ${" ".repeat(depth)}🧠 Ollama: ${ollamaPlan.pageType} (${ollamaPlan.confidence}%) → ${pageType}`
          )
        )
      }
    } catch {
      // Deliberate best-effort: fall back to heuristics when Ollama fails
      pageType = detectPageType(url, html)
    }
  } else {
    pageType = detectPageType(url, html)
  }
  if (!tree.type) tree.type = pageType

  // Extract the title
  const title = extractTitle(html, url)
  if (!tree.title) tree.title = title

  // The three structural handlers share one signature, so dispatch by type.
  const handlers = new Map([
    [PAGE_TYPES.CATALOG_HOME, handleCatalogHome],
    [PAGE_TYPES.CATEGORY_INDEX, handleCategoryIndex],
    [PAGE_TYPES.ITEM_LIST, handleItemList],
  ])
  const handler = handlers.get(pageType)

  if (handler) {
    await handler(url, html, depth, maxDepth, visited, tree, opts, isSameDomain, ollamaPlan)
  } else {
    // Any other type: only extract generic content
    await extractGenericContent(url, html, tree, opts)
  }
}
|
|
243
|
+
|
|
244
|
+
// ─── Mapeo de tipos de página ──────────────────────────────────────
|
|
245
|
+
|
|
246
|
+
/**
 * Maps analyzer page types to the deep scraper's internal PAGE_TYPES.
 *
 * The analyzer returns strings such as "catalog-home" or "item-list"; the
 * lookup is case-insensitive and ignores surrounding whitespace.
 *
 * @param {string} [analyzerType] - Page type reported by the analyzer
 * @returns {string} The matching PAGE_TYPES value, or PAGE_TYPES.UNKNOWN when
 *   the type is missing, unrecognized, or has no PAGE_TYPES counterpart
 */
function mapAnalyzerType(analyzerType) {
  const type = (analyzerType || "").toLowerCase().trim()
  // A Map instead of an object literal: model output such as "constructor" or
  // "toString" must not resolve to inherited Object.prototype members.
  const mapping = new Map([
    ["catalog-home", PAGE_TYPES.CATALOG_HOME],
    ["category-index", PAGE_TYPES.CATEGORY_INDEX],
    ["item-list", PAGE_TYPES.ITEM_LIST],
    ["item-detail", PAGE_TYPES.ITEM_DETAIL],
    ["article", PAGE_TYPES.ARTICLE],
    ["product", PAGE_TYPES.PRODUCT],
    ["docs", PAGE_TYPES.DOCS],
    ["portfolio", PAGE_TYPES.PORTFOLIO],
    ["landing", PAGE_TYPES.LANDING],
  ])
  // `??` also covers entries whose PAGE_TYPES member is not defined.
  return mapping.get(type) ?? PAGE_TYPES.UNKNOWN
}
|
|
266
|
+
|
|
267
|
+
// ─── Detectores de tipo de página ─────────────────────────────────
|
|
268
|
+
|
|
269
|
+
/**
 * Heuristically classifies a page from its URL path and HTML content.
 *
 * @param {string} url - Absolute page URL
 * @param {string} html - Page HTML
 * @returns {string} One of the PAGE_TYPES values
 */
function detectPageType(url, html) {
  const pagePath = new URL(url).pathname.toLowerCase()
  const isRoot = pagePath === "/" || pagePath === ""

  // Site root: a catalog home when the navigation lists categories
  if (isRoot) {
    return hasCatalogStructure(html) ? PAGE_TYPES.CATALOG_HOME : PAGE_TYPES.UNKNOWN
  }

  // Category index such as /tailwind-code-examples/: lists subcategories,
  // otherwise it is treated as a plain item list
  if (/\/[a-z-]+(?:code-examples|examples)\/?$/.test(pagePath)) {
    return hasSubcategoryLinks(html) ? PAGE_TYPES.CATEGORY_INDEX : PAGE_TYPES.ITEM_LIST
  }

  // Item list such as /tailwind-buttons/: needs a grid of cards
  if (/\/tailwind-[a-z-]+\/?$/.test(pagePath)) {
    return hasCardGrid(html) ? PAGE_TYPES.ITEM_LIST : PAGE_TYPES.UNKNOWN
  }

  return PAGE_TYPES.UNKNOWN
}
|
|
295
|
+
|
|
296
|
+
/**
 * Tells whether the HTML looks like a catalog home page: it must contain a
 * navigation marker ("nav-title" or "nav-group") and a listing marker
 * ("code-examples" or "nav-list").
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasCatalogStructure(html) {
  const hasNavMarker = html.includes("nav-title") || html.includes("nav-group")
  if (!hasNavMarker) return false
  return html.includes("code-examples") || html.includes("nav-list")
}
|
|
306
|
+
|
|
307
|
+
/**
 * Tells whether the HTML contains a list of subcategories, i.e. more than
 * five non-paginated links whose last path segment is hyphenated.
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasSubcategoryLinks(html) {
  let subcategoryCount = 0
  for (const href of extractAllLinks(html)) {
    if (href.match(/\/[a-z]+-[a-z-]+\/?$/) && !href.includes("page/")) {
      subcategoryCount += 1
    }
  }
  return subcategoryCount > 5
}
|
|
317
|
+
|
|
318
|
+
/**
 * Tells whether the HTML contains a grid of cards: more than two <div>
 * elements whose class attribute starts with "card" (so "card-header" and
 * similar also count).
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasCardGrid(html) {
  // The class value may be double-quoted, single-quoted or unquoted. Each
  // alternative is confined to its own tag; an unbounded [^"]* would run across
  // tags on unquoted markup and merge several cards into a single match.
  const cardDivPattern = /<div\b[^>]*\bclass=(?:"card[^"]*"|'card[^']*'|card[^\s>]*)[^>]*>/gi
  const cardCount = (html.match(cardDivPattern) || []).length
  return cardCount > 2
}
|
|
325
|
+
|
|
326
|
+
// ─── Handlers por tipo de página ──────────────────────────────────
|
|
327
|
+
|
|
328
|
+
/**
 * Handles CATALOG_HOME pages (e.g. freefrontend.com): extracts the technology
 * category links and scrapes each of them one level deeper.
 *
 * @param {string} url - URL of the catalog home page
 * @param {string} html - Page HTML
 * @param {number} depth - Depth of this page
 * @param {number} maxDepth - Maximum crawl depth
 * @param {Set} visited - URLs already visited
 * @param {Object} tree - Node for this page; receives the child node and aggregated stats
 * @param {Object} opts - Crawl options (verbose, concurrency, ...)
 * @param {Function} isSameDomain - Same-host predicate (currently unused here)
 * @param {Object|null} [ollamaPlan] - Analyzer plan (currently unused here)
 * @returns {Promise<void>}
 */
async function handleCatalogHome(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  // Category links from the nav/sidebar
  const links = extractCategoryLinks(html, url)

  if (opts.verbose) {
    console.log(chalk.gray(` ${" ".repeat(depth)}📂 Categorías encontradas: ${links.length}`))
  }

  const childNode = {
    url,
    type: PAGE_TYPES.CATALOG_HOME,
    title: extractTitle(html, url),
    children: [],
    items: [],
  }

  // Follow the categories in batches. The batch size must be a positive
  // integer, otherwise the loop below would never advance.
  const requested = Number(opts.concurrency)
  const batchSize = Number.isInteger(requested) && requested > 0 ? requested : 1

  for (let i = 0; i < links.length; i += batchSize) {
    const batch = links.slice(i, i + batchSize)
    await Promise.all(
      batch.map(async (link) => {
        const childTree = {
          url: link,
          children: [],
          items: [],
          stats: { pagesScraped: 0, itemsExtracted: 0 },
        }
        await scrapeRecursive(link, depth + 1, maxDepth, visited, childTree, opts)

        // Children that produced nothing (already visited, fetch failed) are dropped
        if (childTree.stats.pagesScraped > 0 || childTree.stats.itemsExtracted > 0) {
          childNode.children.push(childTree)
          tree.stats.pagesScraped += childTree.stats.pagesScraped
          tree.stats.itemsExtracted += childTree.stats.itemsExtracted
          // CodePen counts are recorded on the subtree where cards were found;
          // bubble them up so the root summary can report them.
          if (childTree.stats.codepenLinks) {
            tree.stats.codepenLinks = (tree.stats.codepenLinks || 0) + childTree.stats.codepenLinks
          }
        }
      })
    )
  }

  tree.children.push(childNode)
}
|
|
382
|
+
|
|
383
|
+
/**
 * Handles CATEGORY_INDEX pages (e.g. /tailwind-code-examples/): extracts the
 * subcategory links, optionally narrows them with the Ollama plan, and scrapes
 * each of them one level deeper.
 *
 * @param {string} url - URL of the category index page
 * @param {string} html - Page HTML
 * @param {number} depth - Depth of this page
 * @param {number} maxDepth - Maximum crawl depth
 * @param {Set} visited - URLs already visited
 * @param {Object} tree - Node for this page; receives the child node and aggregated stats
 * @param {Object} opts - Crawl options (verbose, concurrency, ...)
 * @param {Function} isSameDomain - Same-host predicate (currently unused here)
 * @param {Object|null} [ollamaPlan] - Analyzer plan; `linkFilter` and `links` are read when present
 * @returns {Promise<void>}
 */
async function handleCategoryIndex(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  let links = extractSubcategoryLinks(html, url)

  // If Ollama suggested a link filter, use it to narrow the list. The plan is
  // model output, so its shape is checked before use.
  if (typeof ollamaPlan?.linkFilter === "string" && ollamaPlan.linkFilter && links.length > 0) {
    const filter = ollamaPlan.linkFilter.toLowerCase()
    const filtered = links.filter((l) => l.toLowerCase().includes(filter))
    // Only apply the filter when it keeps something but not everything
    if (filtered.length > 0 && filtered.length < links.length) {
      if (opts.verbose) {
        console.log(
          chalk.gray(
            ` ${" ".repeat(depth)}🧠 Ollama filter: "${filter}" → ${filtered.length}/${links.length} links`
          )
        )
      }
      links = filtered
    }
  }

  // If Ollama reported specific category links, prefer those
  if (Array.isArray(ollamaPlan?.links) && ollamaPlan.links.length > 0) {
    const ollamaUrls = ollamaPlan.links
      .filter((l) => l && (l.type === "category" || l.type === "subcategory"))
      .map((l) => {
        try {
          return l.url.startsWith("http") ? l.url : new URL(l.url, url).href
        } catch {
          // Missing or malformed URL in the plan: ignore this entry
          return null
        }
      })
      .filter(Boolean)

    if (ollamaUrls.length > 0) {
      // Keep only Ollama links that the heuristics also extracted
      const heuristicSet = new Set(links)
      const ollamaMatched = ollamaUrls.filter((u) => heuristicSet.has(u))
      if (ollamaMatched.length > 0) {
        links = ollamaMatched
        if (opts.verbose) {
          console.log(
            chalk.gray(
              ` ${" ".repeat(depth)}🧠 Ollama links: ${ollamaMatched.length}/${ollamaUrls.length} matched`
            )
          )
        }
      }
    }
  }

  if (opts.verbose) {
    console.log(chalk.gray(` ${" ".repeat(depth)}📁 Subcategorías: ${links.length}`))
  }

  const childNode = {
    url,
    type: PAGE_TYPES.CATEGORY_INDEX,
    title: extractTitle(html, url),
    children: [],
    items: [],
  }

  // The batch size must be a positive integer, otherwise the loop below would
  // never advance.
  const requested = Number(opts.concurrency)
  const batchSize = Number.isInteger(requested) && requested > 0 ? requested : 1

  for (let i = 0; i < links.length; i += batchSize) {
    const batch = links.slice(i, i + batchSize)
    await Promise.all(
      batch.map(async (link) => {
        const childTree = {
          url: link,
          children: [],
          items: [],
          stats: { pagesScraped: 0, itemsExtracted: 0 },
        }
        await scrapeRecursive(link, depth + 1, maxDepth, visited, childTree, opts)

        // Children that produced nothing (already visited, fetch failed) are dropped
        if (childTree.stats.pagesScraped > 0 || childTree.stats.itemsExtracted > 0) {
          childNode.children.push(childTree)
          tree.stats.pagesScraped += childTree.stats.pagesScraped
          tree.stats.itemsExtracted += childTree.stats.itemsExtracted
          // CodePen counts are recorded on the subtree where cards were found;
          // bubble them up so the root summary can report them.
          if (childTree.stats.codepenLinks) {
            tree.stats.codepenLinks = (tree.stats.codepenLinks || 0) + childTree.stats.codepenLinks
          }
        }
      })
    )
  }

  tree.children.push(childNode)
}
|
|
482
|
+
|
|
483
|
+
/**
 * Handles ITEM_LIST pages (e.g. /tailwind-buttons/): extracts every card with
 * its metadata, records the cards on the tree, and saves each one as knowledge.
 *
 * @param {string} url - URL of the item list page
 * @param {string} html - Page HTML
 * @param {number} depth - Depth of this page (used for log indentation)
 * @param {number} maxDepth - Maximum crawl depth (unused here)
 * @param {Set} visited - URLs already visited (unused here)
 * @param {Object} tree - Node for this page; receives the child node and stats
 * @param {Object} opts - Crawl options (verbose, domain)
 * @param {Function} isSameDomain - Same-host predicate (unused here)
 * @param {Object|null} [ollamaPlan] - Analyzer plan (unused here)
 * @returns {Promise<void>}
 */
async function handleItemList(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  const cards = extractCards(html, url)

  if (opts.verbose) {
    console.log(chalk.gray(` ${" ".repeat(depth)}🎴 Cards: ${cards.length}`))
  }

  // Register the page and its cards before persisting anything
  tree.children.push({
    url,
    type: PAGE_TYPES.ITEM_LIST,
    title: extractTitle(html, url),
    children: [],
    items: cards,
  })
  tree.stats.itemsExtracted += cards.length

  // Persist the cards one at a time
  for (const card of cards) {
    await saveCard(card, opts.domain)
  }

  // Tally the cards that carry a CodePen slug
  const withCodepen = cards.reduce((total, card) => (card.codepenSlug ? total + 1 : total), 0)
  tree.stats.codepenLinks = (tree.stats.codepenLinks || 0) + withCodepen
}
|
|
525
|
+
|
|
526
|
+
// ─── Extractores de estructura ─────────────────────────────────────
|
|
527
|
+
|
|
528
|
+
/**
 * Extracts technology-category links from a catalog home page.
 *
 * @param {string} html - Page HTML
 * @param {string|URL} baseUrl - URL of the page, used to strip the origin and
 *   to resolve relative links
 * @returns {string[]} Unique absolute URLs of non-paginated links that mention
 *   one of the known technology patterns
 */
function extractCategoryLinks(html, baseUrl) {
  // Substrings that mark a link as a technology category
  const techPatterns = [
    "code-examples",
    "html",
    "css",
    "javascript",
    "js",
    "bootstrap",
    "tailwind",
    "react",
    "vue",
    "angular",
    "svelte",
    "jquery",
    "typescript",
  ]

  // baseUrl arrives as a string, so the origin has to be parsed out of it;
  // reading `.origin` directly on the string yields undefined and leaves the
  // hostname inside the text the patterns are matched against.
  const origin = new URL(baseUrl).origin
  const categoryUrls = new Set()

  for (const link of extractAllLinks(html)) {
    const path = link.replace(origin, "")
    if (path.includes("page/")) continue
    if (!techPatterns.some((p) => path.includes(p))) continue
    try {
      categoryUrls.add(link.startsWith("http") ? link : new URL(link, baseUrl).href)
    } catch {
      // Malformed href that cannot be resolved against the page: skip it
    }
  }

  return [...categoryUrls]
}
|
|
559
|
+
|
|
560
|
+
/**
 * Extracts subcategory links from a category page.
 * E.g. /tailwind-code-examples/ → /tailwind-buttons/, /tailwind-cards/, etc.
 *
 * @param {string} html - Page HTML
 * @param {string|URL} baseUrl - URL of the category page
 * @returns {string[]} Up to 100 unique absolute URLs of subcategory pages
 */
function extractSubcategoryLinks(html, baseUrl) {
  const base = new URL(baseUrl)
  const currentPath = base.pathname.replace(/\/$/, "")
  const pathPrefix = currentPath.split("/").pop() || ""
  // "tailwind-code-examples" → "tailwind"
  const tech = pathPrefix.split("-code")[0] || pathPrefix

  const subcategoryUrls = new Set()

  for (const link of extractAllLinks(html)) {
    // Strip the real origin (baseUrl is a string, so it must be parsed first);
    // otherwise absolute same-site links never equal currentPath and their
    // scheme and host are counted as path segments below.
    const path = typeof link === "string" ? link.replace(base.origin, "") : ""
    const isSubcategory =
      path.includes(tech) &&
      path.replace(/\/$/, "") !== currentPath && // exclude the page itself
      !path.includes("page/") &&
      !path.includes("code-examples") &&
      path.split("/").filter(Boolean).length <= 3
    if (!isSubcategory) continue

    try {
      subcategoryUrls.add(link.startsWith("http") ? link : new URL(link, baseUrl).href)
    } catch {
      // Malformed href that cannot be resolved against the page: skip it
    }
  }

  return [...subcategoryUrls].slice(0, 100)
}
|
|
588
|
+
|
|
589
|
+
/**
 * Extracts every usable href value from an HTML string.
 *
 * Skipped: fragment-only links, `javascript:` / `mailto:` / `tel:` URLs, links
 * mentioning google/facebook/twitter, common static assets (.css, .js, .png,
 * .jpg, .svg, .ico, .xml) and values of two characters or fewer.
 *
 * @param {string} html - Page HTML
 * @returns {string[]} href values as written in the markup (may be relative,
 *   may contain duplicates)
 */
function extractAllLinks(html) {
  // Attribute values may be double-quoted, single-quoted or unquoted
  const hrefPattern = /href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/gi
  const skippedSchemes = ["javascript:", "mailto:", "tel:"]
  const skippedHosts = ["google", "facebook", "twitter"]
  const skippedExtensions = [".css", ".js", ".png", ".jpg", ".svg", ".ico", ".xml"]

  const links = []
  for (const match of html.matchAll(hrefPattern)) {
    const href = (match[1] ?? match[2] ?? match[3] ?? "").trim()
    if (href.length <= 2 || href.startsWith("#")) continue

    // Compare against the full scheme including the colon, so relative links
    // such as "javascript-code-examples/" or "television/" are not discarded.
    const lower = href.toLowerCase()
    if (skippedSchemes.some((scheme) => lower.startsWith(scheme))) continue
    if (skippedHosts.some((host) => href.includes(host))) continue
    if (skippedExtensions.some((ext) => href.endsWith(ext))) continue

    links.push(href)
  }
  return links
}
|
|
623
|
+
|
|
624
|
+
// ─── Extractores de contenido ──────────────────────────────────────
|
|
625
|
+
|
|
626
|
+
/**
 * Returns the trimmed text of the page's <title> element, or the URL's
 * hostname when the page has no matching <title>.
 *
 * @param {string} html - Page HTML
 * @param {string} url - Absolute page URL (fallback source)
 * @returns {string}
 */
function extractTitle(html, url) {
  const found = /<title>([^<]+)<\/title>/i.exec(html)
  return found === null ? new URL(url).hostname : found[1].trim()
}
|
|
634
|
+
|
|
635
|
+
/**
 * Extracts all cards from an ITEM_LIST page.
 *
 * freefrontend.com has two kinds of cards:
 *
 * Type A (CodePen popover):
 *   <div class=card>
 *     <div class=card-header>
 *       <h3>Title</h3>
 *       <button popovertarget=XXX>demo & code</button>
 *     </div>
 *     <div class=card-content>...</div>
 *     <div class=card-meta-info>...</div>
 *   </div>
 *   <div id=XXX popover><p class=codepen data-slug-hash=XXX></div>
 *
 * Type B (external link, <div class=snippet-card>):
 *   <div class=snippet-card>
 *     <div class=card-header>
 *       <h3><a href=https://...>Title<img class=external></a></h3>
 *     </div>
 *     <video/img ...>
 *   </div>
 *
 * Type B cards are only collected when at least one Type A card was found.
 *
 * @param {string} html - Page HTML
 * @param {string} baseUrl - URL of the page; stored as each card's sourceUrl
 * @returns {Object[]} Card records (title, description, technologies,
 *   difficulty, browserSupport, features, requires, codepenSlug, demoUrl,
 *   sourceUrl, type)
 */
function extractCards(html, baseUrl) {
  // Popover ids and popovertarget values must be compared in the same form,
  // so surrounding quotes are removed from both before they are matched.
  const unquote = (value) => value.replace(/^["']|["']$/g, "")

  // ── Phase 1: build the popover id → CodePen slug map ──
  const popoverMap = new Map()
  const popoverRegex = /<div[^>]*\bid=(\S+?)[\s>][^>]*\bpopover\b[^>]*>([\s\S]{0,2000}?)<\/div>/gi
  for (const popover of html.matchAll(popoverRegex)) {
    // The slug attribute may be quoted or unquoted
    const slugMatch = popover[2].match(/data-slug-hash=["']?([^\s>"']+)/)
    if (slugMatch) {
      popoverMap.set(unquote(popover[1]), slugMatch[1])
    }
  }

  const cards = []

  // ── Type A: cards with a CodePen popover ──
  // card-content is nested inside another card-content, so instead of matching
  // the whole card, find each card-header that contains a popovertarget, then
  // the following card-meta-info; the content sits in between.
  const typeAPattern = /<div class=card-header>([\s\S]{0,800}?)<\/div>/gi
  for (const headerMatch of html.matchAll(typeAPattern)) {
    const header = headerMatch[1]
    const headerStart = headerMatch.index

    // Only headers with a popovertarget are Type A
    const rawTarget = (header.match(/popovertarget=(\S+?)(?:\s|>)/) || [])[1]
    if (!rawTarget) continue
    const popoverTarget = unquote(rawTarget)

    const title = extractCardTitle(header)
    const codepenSlug = popoverMap.get(popoverTarget) || null

    // Look for card-meta-info within ~5000 chars after the header start
    const afterHeader = html.slice(headerStart, headerStart + 5000)
    const metaMatch = afterHeader.match(
      /<div class=card-meta-info>([\s\S]{0,3000}?)<\/div>\s*<\/div>/
    )
    const meta = metaMatch ? metaMatch[0] : ""

    // The description lives between the header and card-meta-info
    const contentEnd = metaMatch
      ? afterHeader.indexOf("<div class=card-meta-info>")
      : afterHeader.length
    const contentSection = afterHeader.slice(0, contentEnd)

    cards.push({
      title,
      description: extractCardDescription(contentSection),
      technologies: extractTechnologies(meta),
      difficulty: extractDifficulty(meta),
      browserSupport: extractBrowserSupport(meta),
      features: extractFeatures(meta),
      requires: extractRequires(contentSection),
      codepenSlug,
      demoUrl: codepenSlug ? `https://codepen.io/anon/pen/${codepenSlug}` : null,
      sourceUrl: baseUrl,
      type: "codepen",
    })
  }

  // ── Type B: snippet-cards (external links) ──
  // Only extracted when there is at least one Type A card: that signals a
  // subcategory page with components rather than a page that merely lists
  // subcategories.
  if (cards.length === 0) return cards

  const typeBPattern = /<div class=snippet-card>([\s\S]{0,3000}?)<\/div>\s*<\/div>/gi
  for (const snippetMatch of html.matchAll(typeBPattern)) {
    const section = snippetMatch[1]
    const headerMatch = section.match(/<div class=card-header>([\s\S]{0,800}?)<\/div>/)
    if (!headerMatch) continue

    const header = headerMatch[1]
    const title = extractCardTitle(header)

    // External link: only absolute http(s) URLs are kept
    let demoUrl = null
    const linkMatch = header.match(/href=(\S+?)(?:\s|>)/)
    if (linkMatch) {
      let link = linkMatch[1]
      const isQuoted =
        (link.startsWith('"') && link.endsWith('"')) ||
        (link.startsWith("'") && link.endsWith("'"))
      if (isQuoted) link = link.slice(1, -1)
      if (link.startsWith("http")) demoUrl = link
    }

    const desc = extractCardDescription(section)

    // Fall back to the media label when the card has no description text
    const videoMatch = section.match(/<video[^>]*aria-label="([^"]+)"/)
    const imgMatch = section.match(/<img[^>]*alt="([^"]+)"/)
    const mediaDesc = videoMatch ? videoMatch[1] : imgMatch ? imgMatch[1] : null

    cards.push({
      title,
      description: desc || mediaDesc || "",
      technologies: [],
      difficulty: null,
      browserSupport: [],
      features: [],
      requires: [],
      codepenSlug: null,
      demoUrl,
      sourceUrl: baseUrl,
      type: "external",
    })
  }

  return cards
}
|
|
789
|
+
|
|
790
|
+
/**
 * Extracts a card's title from its header markup.
 *
 * Reads the first closed <h2>/<h3>/<h4> and returns its text with nested tags
 * removed, so headings that wrap the title in a link
 * (<h3><a href=...>Title<img></a></h3>) are handled too.
 *
 * @param {string} header - Inner HTML of the card header
 * @returns {string} The title text, or "Untitled" when no heading is found
 */
function extractCardTitle(header) {
  const closedHeading = header.match(/<h([234])\b[^>]*>([\s\S]*?)<\/h\1>/i)
  if (closedHeading) {
    const text = closedHeading[2]
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim()
    if (text) return text
  }

  // Unclosed heading (e.g. truncated header): take the text up to the next tag
  const openHeading = header.match(/<h[234][^>]*>([^<]+)/)
  return openHeading ? openHeading[1].trim() : "Untitled"
}
|
|
797
|
+
|
|
798
|
+
/**
 * Extracts a card description.
 *
 * Collects every attribute-less <p>...</p> in the content, discards those
 * whose visible text is 20 characters or shorter, and returns the longest
 * remaining one as plain text: tags removed, common HTML entities and curly
 * quotes normalized, and whitespace collapsed.
 *
 * @param {string} content - Inner HTML of the card.
 * @returns {string} Plain-text description, or "" when no suitable paragraph exists.
 */
function extractCardDescription(content) {
  const paragraphs = [...content.matchAll(/<p>([\s\S]{0,2000}?)<\/p>/gi)]
  if (!paragraphs.length) return ""

  // Keep the longest paragraph (the main one, not short snippets).
  const candidates = paragraphs
    .map((m) => m[1])
    .filter((t) => t.replace(/<[^>]+>/g, "").trim().length > 20)
    .sort((a, b) => b.length - a.length)

  const best = candidates[0] || ""
  return (
    best
      .replace(/<[^>]+>/g, " ")
      .replace(/&rsquo;|&#8217;|’/g, "'")
      .replace(/&ldquo;|&#8220;|“/g, '"')
      .replace(/&rdquo;|&#8221;|”/g, '"')
      .replace(/&nbsp;|&#160;/g, " ")
      // Decode &amp; last so "&amp;nbsp;" becomes the literal text "&nbsp;"
      // instead of being decoded twice.
      .replace(/&amp;/g, "&")
      .replace(/\s+/g, " ")
      .trim()
  )
}
|
|
823
|
+
|
|
824
|
+
/**
 * Extracts the technology names listed in a card's metadata markup.
 *
 * Matches spans whose class attribute is `"meta-value tech-<id>"` and
 * returns their trimmed text content in document order.
 *
 * @param {string} meta - Metadata HTML of the card.
 * @returns {string[]} Technology labels (empty when none are present).
 */
function extractTechnologies(meta) {
  const techSpan = /<span class="meta-value tech-([^"]+)"[^>]*>([^<]+)<\/span>/gi
  const labels = []
  for (const hit of meta.matchAll(techSpan)) {
    labels.push(hit[2].trim())
  }
  return labels
}
|
|
831
|
+
|
|
832
|
+
/**
 * Extracts the difficulty level from a card's metadata markup.
 *
 * Looks for a "Difficulty:" label that closes its span and reads the text
 * of the span that follows it (case-insensitive).
 *
 * @param {string} meta - Metadata HTML of the card.
 * @returns {string|null} Trimmed difficulty text, or null when not found.
 */
function extractDifficulty(meta) {
  const found = meta.match(/Difficulty:<\/span>\s*\n?\s*<span[^>]*>([^<]+)/i)
  if (!found) {
    return null
  }
  const [, level] = found
  return level.trim()
}
|
|
839
|
+
|
|
840
|
+
/**
 * Extracts the supported browsers from a card's metadata markup.
 *
 * Each browser is a `<span class="meta-value browser-tag">` wrapper whose
 * name lives in the first attribute-less inner <span> found within the next
 * 200 characters.
 *
 * @param {string} meta - Metadata HTML of the card.
 * @returns {string[]} Trimmed browser names in document order.
 */
function extractBrowserSupport(meta) {
  const browserTag = /<span class="meta-value browser-tag">[\s\S]{0,200}?<span>([^<]+)<\/span>/gi
  return Array.from(meta.matchAll(browserTag), (hit) => hit[1].trim())
}
|
|
849
|
+
|
|
850
|
+
/**
 * Extracts feature labels from a card's metadata markup.
 *
 * Reads every `<span class=meta-value>` (unquoted class, exact match), then
 * drops values that are technology names, difficulty levels, or two
 * characters or shorter. At most the first 10 remaining labels are returned.
 *
 * @param {string} meta - Metadata HTML of the card.
 * @returns {string[]} Up to 10 feature labels in document order.
 */
function extractFeatures(meta) {
  const MAX_FEATURES = 10
  // Values that share the same markup but are not features.
  const nonFeatures = new Set([
    "HTML",
    "CSS",
    "JavaScript",
    "TypeScript",
    "JS",
    "TS",
    "Beginner",
    "Intermediate",
    "Advanced",
  ])

  const collected = []
  for (const [, raw] of meta.matchAll(/<span class=meta-value>([^<]+)<\/span>/g)) {
    const label = raw.trim()
    if (label.length <= 2 || nonFeatures.has(label)) continue
    collected.push(label)
    if (collected.length === MAX_FEATURES) break
  }
  return collected
}
|
|
863
|
+
|
|
864
|
+
/**
 * Extracts the requirements of a card (required technologies).
 *
 * Finds the first "Require:" / "Requires:" label (case-insensitive), takes
 * the text up to the next tag, and splits it on commas.
 *
 * @param {string} content - Inner HTML of the card.
 * @returns {string[]} Trimmed, non-empty requirement names.
 */
function extractRequires(content) {
  const found = content.match(/Requires?:\s*([^<]+)/i)
  if (!found) {
    return []
  }

  const names = []
  for (const piece of found[1].split(",")) {
    const name = piece.trim()
    if (name) {
      names.push(name)
    }
  }
  return names
}
|
|
875
|
+
|
|
876
|
+
/**
 * Extracts the CodePen slug hash associated with a card.
 *
 * Searches a window of 5000 characters of the HTML starting at the card's
 * index for a `data-slug-hash` attribute and returns its value. The value
 * may be unquoted or wrapped in single or double quotes.
 *
 * @param {string} html - Full page HTML.
 * @param {number} cardIndex - Offset in `html` where the search window starts.
 * @returns {string|null} Slug hash, or null when none is found in the window.
 */
function extractCodePenSlug(html, cardIndex) {
  // Look for the popover that follows the card (within ~5000 chars).
  const afterCard = html.slice(cardIndex, cardIndex + 5000)
  // The optional opening quote lets quoted attributes match as well; the
  // capture still stops at whitespace, ">" or the closing quote.
  const slugMatch = afterCard.match(/data-slug-hash=["']?([^\s>"']+)/)
  return slugMatch ? slugMatch[1] : null
}
|
|
886
|
+
|
|
887
|
+
// ─── Fetch ─────────────────────────────────────────────────────────
|
|
888
|
+
|
|
889
|
+
/**
 * Downloads an HTML page using the module's User-Agent and request timeout.
 *
 * Never throws: HTTP error statuses and network/timeout failures are logged
 * as yellow warnings and reported as null.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<string|null>} Response body text, or null on any failure.
 */
async function fetchPage(url) {
  try {
    const requestInit = {
      headers: {
        "User-Agent": USER_AGENT,
        Accept: "text/html,application/xhtml+xml",
      },
      signal: AbortSignal.timeout(REQUEST_TIMEOUT),
    }
    const res = await fetch(url, requestInit)

    if (res.ok) {
      return await res.text()
    }

    console.log(chalk.yellow(` ⚠️ HTTP ${res.status}: ${url}`))
    return null
  } catch (err) {
    console.log(chalk.yellow(` ⚠️ Fetch error: ${url} - ${err.message}`))
    return null
  }
}
|
|
916
|
+
|
|
917
|
+
// ─── Guardado ──────────────────────────────────────────────────────
|
|
918
|
+
|
|
919
|
+
/**
 * Saves an extracted card under the project's knowledge directory.
 *
 * Writes `<knowledgeDir>/<domain>/components/<slug>/meta.json` and
 * `page.md`. Saving is best-effort: the function never throws. It returns
 * silently when no knowledge directory is available or when the
 * knowledge-helpers module cannot be loaded, and logs a warning for any
 * other failure (for example a file-system error).
 *
 * @param {Object} card - Card data (title, description, technologies,
 *   difficulty, browserSupport, features, requires, demoUrl, codepenSlug,
 *   sourceUrl).
 * @param {string} domain - OPL domain used as the first path segment.
 * @returns {Promise<void>}
 */
async function saveCard(card, domain) {
  try {
    const { mkdirSync, writeFileSync } = await import("fs")
    const { join } = await import("path")
    const { getProjectKnowledgeDir } = await import("../../commands/knowledge-helpers.js")

    // presumably returns the directory path or a falsy value; verify against the helper
    const knowledgeDir = getProjectKnowledgeDir()
    if (!knowledgeDir) return

    // An empty slug would make join() resolve to the shared components/
    // directory itself, so fall back to a fixed folder name.
    const slug = slugify(card.title) || "untitled"
    const docDir = join(knowledgeDir, domain, "components", slug)
    mkdirSync(docDir, { recursive: true })

    // Save metadata
    const meta = {
      title: card.title,
      description: card.description,
      technologies: card.technologies,
      difficulty: card.difficulty,
      browserSupport: card.browserSupport,
      features: card.features,
      requires: card.requires,
      codepen: card.demoUrl,
      codepenSlug: card.codepenSlug,
      source: card.sourceUrl,
      scrapedAt: new Date().toISOString(),
      type: "frontend-component",
      domain,
    }
    writeFileSync(join(docDir, "meta.json"), JSON.stringify(meta, null, 2), "utf-8")

    // Save the description as markdown
    const markdown = `# ${card.title}

${card.description}

## Metadata

- **Tecnologías:** ${card.technologies.join(", ") || "N/A"}
- **Dificultad:** ${card.difficulty || "N/A"}
- **Features:** ${card.features.join(", ") || "N/A"}
- **Browser Support:** ${card.browserSupport.join(", ") || "N/A"}

## Demo

${card.demoUrl || "No disponible"}
`
    writeFileSync(join(docDir, "page.md"), markdown, "utf-8")
  } catch (error) {
    // Non-critical: a failed save must not abort the scrape.
    const reason = error?.message ?? String(error)
    // The helper module is unavailable, so there is nowhere to save; stay quiet.
    if (reason.includes("knowledge-helpers")) return
    console.log(chalk.yellow(` ⚠️ Save error: ${card?.title} - ${reason}`))
  }
}
|
|
980
|
+
|
|
981
|
+
/**
 * Generic extraction with Readability (fallback for unrecognized pages).
 *
 * Parses the HTML with jsdom, runs Readability over the document and, when
 * an article with content is found, converts it to markdown with Turndown
 * and appends it to `tree.children` as an ARTICLE node, incrementing
 * `tree.stats.itemsExtracted`. Any failure, including a missing optional
 * dependency, is ignored and leaves the tree untouched.
 *
 * @param {string} url - Page URL, also used as the jsdom document URL.
 * @param {string} html - Raw page HTML.
 * @param {Object} tree - Scrape tree; must expose `children` and `stats`.
 * @param {Object} opts - Not used by this function.
 * @returns {Promise<void>}
 */
async function extractGenericContent(url, html, tree, opts) {
  try {
    const jsdomModule = await import("jsdom")
    const dom = new jsdomModule.JSDOM(html, { url })
    const readabilityModule = await import("@mozilla/readability")
    const article = new readabilityModule.Readability(dom.window.document).parse()

    if (!article?.content) return

    const { default: TurndownService } = await import("turndown")
    const converter = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
    })
    const markdown = converter.turndown(article.content)
    const { title } = article

    tree.children.push({
      url,
      type: PAGE_TYPES.ARTICLE,
      title,
      children: [],
      items: [{ title, content: markdown, sourceUrl: url }],
    })
    tree.stats.itemsExtracted++
  } catch {
    // Readability (or another optional dependency) is unavailable: ignore.
  }
}
|
|
1013
|
+
|
|
1014
|
+
// ─── Utilities ──────────────────────────────────────────────────────
|
|
1015
|
+
|
|
1016
|
+
/**
 * Turns a text into a slug usable as an ID or directory name.
 *
 * Lowercases the text, drops every character that is not a-z, 0-9, a
 * Spanish accented letter (á é í ó ú ü ñ), whitespace or a hyphen, turns
 * whitespace runs into single hyphens, and limits the result to 80
 * characters without leading or trailing hyphens.
 *
 * @param {string} text - Text to slugify; null/undefined are treated as "".
 * @returns {string} Slug, possibly "" when no allowed character remains.
 */
function slugify(text) {
  return (
    String(text ?? "")
      .toLowerCase()
      .replace(/[^a-z0-9áéíóúüñ\s-]/g, "")
      .replace(/\s+/g, "-")
      .replace(/-+/g, "-")
      .replace(/^-|-$/g, "")
      .slice(0, 80)
      // Truncation can cut right after a word boundary and leave a hyphen behind.
      .replace(/-$/, "")
  )
}
|