openprompt-lang 1.2.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +62 -8
  2. package/docs/EMBEDDINGS.md +214 -0
  3. package/docs/FRAMEWORK.md +52 -0
  4. package/docs/ONBOARDING_WORKFLOW.md +151 -0
  5. package/docs/OPL-ERRORES.md +504 -0
  6. package/docs/OPL_ACADEMIC_ISSUES.md +158 -0
  7. package/docs/WEB_SCRAPER_PLAN.md +454 -0
  8. package/package.json +7 -1
  9. package/scripts/postinstall.js +37 -0
  10. package/src/cli/commands-knowledge.js +1 -0
  11. package/src/cli/commands-opl.js +79 -1
  12. package/src/cli/commands-work.js +3 -1
  13. package/src/cli/commands-workflow.js +125 -6
  14. package/src/commands/init-core.js +188 -12
  15. package/src/commands/init-existing.js +13 -6
  16. package/src/commands/init-helpers.js +20 -14
  17. package/src/commands/knowledge-ops.js +52 -0
  18. package/src/commands/opl-embeddings.js +556 -0
  19. package/src/commands/opl-help.js +26 -2
  20. package/src/commands/opl-search.js +106 -2
  21. package/src/commands/opl-webscrape.js +390 -0
  22. package/src/commands/work-context.js +17 -0
  23. package/src/commands/workflow/close/index.js +2 -1
  24. package/src/commands/workflow/delivery/index.js +4 -0
  25. package/src/commands/workflow/discovery/index.js +4 -0
  26. package/src/commands/workflow/epic-cli.js +192 -0
  27. package/src/commands/workflow/select.js +146 -0
  28. package/src/commands/workflow/specification/index.js +4 -0
  29. package/src/commands/workflow/sprint-cli.js +174 -0
  30. package/src/core/engine/sandbox.js +7 -3
  31. package/src/core/webscrape/analyzer.js +481 -0
  32. package/src/core/webscrape/deep-scraper.js +1027 -0
  33. package/src/core/workflow/epic-manager.js +845 -0
  34. package/src/core/workflow/gates.js +180 -1
  35. package/src/core/workflow/selector.js +707 -0
  36. package/src/embeddings/chunker.js +450 -0
  37. package/src/embeddings/embedder.js +431 -0
  38. package/src/embeddings/index-pipeline.js +320 -0
  39. package/src/embeddings/vector-store.js +505 -0
  40. package/src/mcp-plan-server.js +12 -5
  41. package/src/mcp-shared-state.js +25 -0
  42. package/src/mcp-refactor/mcp-server.js +0 -171
  43. package/src/mcp-server-backup.js +0 -1913
@@ -0,0 +1,1027 @@
1
+ // @use(kind, contract, limit, deps)
2
+ // @kind(module)
3
+ // @contract(in: url -> out: structured knowledge tree, sideEffect: multiple fetches + saves)
4
+ // @limit(lines: 500)
5
+ // @deps(../commands/knowledge-helpers, node-fetch, jsdom)
6
+
7
+ /**
8
+ * Deep Scraper OPL — Navega sitios jerárquicos (catálogos, directorios)
9
+ * y extrae contenido estructurado con navegación recursiva.
10
+ *
11
+ * Arquitectura:
12
+ * 1. Detecta estructura del sitio (categorías, subcategorías, items)
13
+ * 2. Sigue enlaces recursivamente con profundidad configurable
14
+ * 3. Extrae metadatos de cada página (cards, listados, etc.)
15
+ * 4. Intenta obtener código fuente (CodePen, GitHub, etc.)
16
+ * 5. Guarda todo en knowledge/ con estructura jerárquica
17
+ *
18
+ * Uso:
19
+ * import { deepScrape } from "./deep-scraper.js"
20
+ * await deepScrape("https://freefrontend.com", { maxDepth: 3 })
21
+ */
22
+
23
+ import chalk from "chalk"
24
+ import { analyzePage, PAGE_TYPES as ANALYZER_PAGE_TYPES } from "./analyzer.js"
25
+
26
// Default Ollama model (llama3.2 is lighter, which suits modest machines).
const DEFAULT_OLLAMA_MODEL = "llama3.2"

// ─── Constants ─────────────────────────────────────────────────────

// Crawl depth used when the caller does not provide one.
const DEFAULT_MAX_DEPTH = 2
// Number of links fetched per batch when the caller does not provide one.
const DEFAULT_CONCURRENCY = 3
// Per-request timeout in milliseconds (passed to AbortSignal.timeout).
const REQUEST_TIMEOUT = 15000
// User-Agent header sent with every page request.
const USER_AGENT = "Mozilla/5.0 (compatible; OPL-DeepScraper/1.0; +https://openprompt-lang.dev)"

// ─── Detectable page types ─────────────────────────────────────────

// Frozen because every handler compares against these values; an accidental
// assignment would silently change how pages are routed.
const PAGE_TYPES = Object.freeze({
  CATALOG_HOME: "catalog-home", // freefrontend.com (tech categories)
  CATEGORY_INDEX: "category-index", // /tailwind-code-examples/ (subcategories list)
  ITEM_LIST: "item-list", // /tailwind-buttons/ (card grid)
  ITEM_DETAIL: "item-detail", // Individual example page
  ARTICLE: "article", // Blog post / article
  UNKNOWN: "unknown",
})
46
+
47
+ // ─── Handler principal ────────────────────────────────────────────
48
+
49
/**
 * Runs a deep scrape starting from a root URL.
 *
 * Progress and the final summary are printed with console.log. Failures raised
 * while crawling are caught and reported through the returned object instead
 * of being thrown.
 *
 * @param {string} startUrl - Root URL to start from
 * @param {Object} [options]
 * @param {number} [options.maxDepth=2] - Maximum navigation depth (0 = root page only)
 * @param {number} [options.concurrency=3] - Simultaneous fetches per batch (>= 1)
 * @param {string} [options.domain='web'] - Target OPL domain
 * @param {boolean} [options.noEmbed] - Skip embeddings
 * @param {boolean} [options.verbose] - Detailed logging
 * @param {boolean} [options.ollama] - Pass false to disable Ollama classification
 * @returns {Promise<{success: boolean, pagesScraped: number, itemsExtracted: number, tree: Object, error?: string}>}
 */
export async function deepScrape(startUrl, options = {}) {
  const settings = options ?? {}
  // `||` used to turn an explicit maxDepth of 0 into the default, and let
  // negative or string concurrency values reach the batching loops, where a
  // non-positive step never terminates.
  const maxDepth = toBoundedInt(settings.maxDepth, 0, DEFAULT_MAX_DEPTH)
  const concurrency = toBoundedInt(settings.concurrency, 1, DEFAULT_CONCURRENCY)
  const domain = settings.domain || "web"
  const verbose = settings.verbose === true
  const noEmbed = settings.noEmbed === true
  const useOllama = settings.ollama !== false // default: true

  console.log("")
  console.log(chalk.cyan(`🕸️ Deep Scraper: "${chalk.bold(startUrl)}"`))
  console.log(chalk.gray(` Profundidad máxima: ${maxDepth}`))
  console.log(chalk.gray(` Concurrencia: ${concurrency}`))
  if (useOllama) console.log(chalk.gray(` 🧠 Ollama: habilitado (${DEFAULT_OLLAMA_MODEL})`))
  console.log("")

  const visited = new Set()
  const tree = {
    url: startUrl,
    type: null,
    title: null,
    children: [],
    items: [],
    stats: { pagesScraped: 0, itemsExtracted: 0 },
  }

  try {
    await scrapeRecursive(startUrl, 0, maxDepth, visited, tree, {
      domain,
      verbose,
      concurrency,
      noEmbed,
      useOllama,
      ollamaModel: DEFAULT_OLLAMA_MODEL,
    })

    // Final summary
    console.log("")
    console.log(chalk.cyan("═".repeat(50)))
    console.log(chalk.green(`✅ Deep Scrape completado`))
    console.log(chalk.gray(` Páginas scrapeadas: ${tree.stats.pagesScraped}`))
    console.log(chalk.gray(` Items extraídos: ${tree.stats.itemsExtracted}`))
    console.log(chalk.gray(` Nivel de profundidad: ${maxDepth}`))
    if (tree.stats.codepenLinks) {
      console.log(chalk.gray(` Enlaces a CodePen: ${tree.stats.codepenLinks}`))
    }
    console.log("")

    return {
      success: true,
      pagesScraped: tree.stats.pagesScraped,
      itemsExtracted: tree.stats.itemsExtracted,
      tree,
    }
  } catch (error) {
    // Anything may be thrown, so do not assume an Error instance.
    const reason = error?.message ?? String(error)
    console.log(chalk.red(`\n❌ Deep Scrape falló: ${reason}\n`))
    return {
      success: false,
      pagesScraped: tree.stats.pagesScraped,
      itemsExtracted: tree.stats.itemsExtracted,
      tree,
      error: reason,
    }
  }
}

/**
 * Coerces a numeric option to an integer that is at least `min`.
 * Returns `fallback` for missing, non-numeric, fractional or too-small values.
 */
function toBoundedInt(value, min, fallback) {
  const isNumeric =
    typeof value === "number" || (typeof value === "string" && value.trim() !== "")
  if (!isNumeric) return fallback
  const parsed = Number(value)
  return Number.isInteger(parsed) && parsed >= min ? parsed : fallback
}
125
+
126
+ // ─── Navegación recursiva ─────────────────────────────────────────
127
+
128
/**
 * Scrapes one URL and decides how to continue based on the detected page type.
 *
 * The URL is marked as visited before it is fetched, so concurrent sibling
 * crawls never fetch the same page twice. When the page cannot be fetched the
 * function returns without touching the tree.
 *
 * @param {string} url - Absolute URL to scrape
 * @param {number} depth - Current depth (0 for the root page)
 * @param {number} maxDepth - Maximum depth; deeper URLs are skipped
 * @param {Set} visited - URLs already visited (mutated)
 * @param {Object} tree - Result node for this branch (mutated)
 * @param {Object} opts - Options (verbose, useOllama, ollamaModel, ...)
 */
async function scrapeRecursive(url, depth, maxDepth, visited, tree, opts) {
  if (visited.has(url) || depth > maxDepth) return
  visited.add(url)

  const { hostname } = new URL(url)
  const isSameDomain = (link) => {
    try {
      return new URL(link, url).hostname === hostname
    } catch {
      return false
    }
  }

  const indent = " ".repeat(depth)
  if (opts.verbose) {
    console.log(chalk.gray(` ${indent}🔍 ${depth > 0 ? "└─ " : ""}${url}`))
  }

  // Fetch the HTML
  const html = await fetchPage(url)
  if (!html) return

  tree.stats.pagesScraped++

  // Detect the page type. Ollama is only used for the first page (depth === 0);
  // sub-pages use the heuristics, which are faster.
  let pageType
  let ollamaPlan = null

  if (opts.useOllama && depth === 0) {
    try {
      ollamaPlan = await analyzePage(html, url, { model: opts.ollamaModel })
      pageType = mapAnalyzerType(ollamaPlan.pageType)
      if (opts.verbose) {
        console.log(
          chalk.gray(
            ` ${indent}🧠 Ollama: ${ollamaPlan.pageType} (${ollamaPlan.confidence}%) → ${pageType}`
          )
        )
      }
    } catch (error) {
      // Fall back to the heuristics when Ollama fails, but say so in verbose
      // mode instead of hiding the reason.
      if (opts.verbose) {
        const reason = error?.message ?? String(error)
        console.log(chalk.yellow(` ${indent}⚠️ Ollama falló (${reason}), usando heurísticas`))
      }
      pageType = detectPageType(url, html)
    }
  } else {
    pageType = detectPageType(url, html)
  }
  if (!tree.type) tree.type = pageType

  // Extract the title
  const title = extractTitle(html, url)
  if (!tree.title) tree.title = title

  // All structured handlers share one signature, so route through a lookup.
  // A Map avoids matching inherited object members for unexpected type values.
  const structuredHandlers = new Map([
    [PAGE_TYPES.CATALOG_HOME, handleCatalogHome],
    [PAGE_TYPES.CATEGORY_INDEX, handleCategoryIndex],
    [PAGE_TYPES.ITEM_LIST, handleItemList],
  ])
  const handler = structuredHandlers.get(pageType)

  if (handler) {
    await handler(url, html, depth, maxDepth, visited, tree, opts, isSameDomain, ollamaPlan)
  } else {
    // Any other type: only extract the readable content
    await extractGenericContent(url, html, tree, opts)
  }
}
243
+
244
+ // ─── Mapeo de tipos de página ──────────────────────────────────────
245
+
246
/**
 * Maps a page type reported by the analyzer to one of the deep scraper's
 * internal PAGE_TYPES values.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Analyzer
 * types without an internal equivalent (for example "product", "docs",
 * "portfolio" or "landing"), as well as non-string input, map to
 * PAGE_TYPES.UNKNOWN.
 *
 * @param {string} analyzerType - Type string reported by the analyzer
 * @returns {string} One of the PAGE_TYPES values
 */
function mapAnalyzerType(analyzerType) {
  if (typeof analyzerType !== "string") return PAGE_TYPES.UNKNOWN

  // A Map is used instead of an object literal so that strings such as
  // "constructor" cannot resolve to inherited Object.prototype members.
  const mapping = new Map([
    ["catalog-home", PAGE_TYPES.CATALOG_HOME],
    ["category-index", PAGE_TYPES.CATEGORY_INDEX],
    ["item-list", PAGE_TYPES.ITEM_LIST],
    ["item-detail", PAGE_TYPES.ITEM_DETAIL],
    ["article", PAGE_TYPES.ARTICLE],
  ])

  return mapping.get(analyzerType.toLowerCase().trim()) ?? PAGE_TYPES.UNKNOWN
}
266
+
267
+ // ─── Detectores de tipo de página ─────────────────────────────────
268
+
269
/**
 * Detects the page type from the URL path and the HTML content.
 *
 * Rules, checked in order:
 *  1. Site root: CATALOG_HOME when the HTML has a catalog structure.
 *  2. Path ending in "...code-examples" or "...examples": CATEGORY_INDEX when
 *     the page lists subcategories, otherwise ITEM_LIST.
 *  3. Path ending in a hyphenated slug such as /tailwind-buttons/ or
 *     /css-buttons/: ITEM_LIST when the page contains a card grid.
 *  4. Anything else: UNKNOWN.
 *
 * @param {string} url - Absolute page URL
 * @param {string} html - Page HTML
 * @returns {string} One of the PAGE_TYPES values
 */
function detectPageType(url, html) {
  const path = new URL(url).pathname.toLowerCase()

  // Catalog homepage: has a nav with the main categories
  if (path === "/" || path === "") {
    return hasCatalogStructure(html) ? PAGE_TYPES.CATALOG_HOME : PAGE_TYPES.UNKNOWN
  }

  // Category index: /tailwind-code-examples/ - lists subcategories
  if (/\/[a-z-]+(?:code-examples|examples)\/?$/.test(path)) {
    return hasSubcategoryLinks(html) ? PAGE_TYPES.CATEGORY_INDEX : PAGE_TYPES.ITEM_LIST
  }

  // Item list: /<tech>-<component>/ - has a card grid. The slug is no longer
  // limited to the "tailwind-" prefix, because the crawler follows every
  // technology category; the card-grid check keeps unrelated pages out.
  if (/\/[a-z][a-z0-9]*-[a-z0-9-]+\/?$/.test(path)) {
    return hasCardGrid(html) ? PAGE_TYPES.ITEM_LIST : PAGE_TYPES.UNKNOWN
  }

  return PAGE_TYPES.UNKNOWN
}
295
+
296
/**
 * Tells whether the HTML looks like a catalog page: it must mention a
 * navigation grouping marker ("nav-title" or "nav-group") and also a listing
 * marker ("code-examples" or "nav-list").
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasCatalogStructure(html) {
  const mentionsAny = (markers) => markers.some((marker) => html.includes(marker))

  // Sidebar with navigation groups (Languages, Frameworks)
  if (!mentionsAny(["nav-title", "nav-group"])) return false
  return mentionsAny(["code-examples", "nav-list"])
}
306
+
307
/**
 * Tells whether the HTML contains a list of subcategories: more than five
 * links that end in a hyphenated slug and are not pagination links.
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasSubcategoryLinks(html) {
  const hyphenatedSlug = /\/[a-z]+-[a-z-]+\/?$/
  let subcategoryCount = 0

  for (const href of extractAllLinks(html)) {
    const isPagination = href.includes("page/")
    if (hyphenatedSlug.test(href) && !isPagination) {
      subcategoryCount += 1
    }
  }

  return subcategoryCount > 5
}
317
+
318
/**
 * Tells whether the HTML contains a grid of cards: more than two `<div>` tags
 * whose class attribute starts with "card" (card, card-header, card-content...).
 *
 * @param {string} html - Page HTML
 * @returns {boolean}
 */
function hasCardGrid(html) {
  // Every character class excludes ">" so a match can never run past the end
  // of its own tag. Previously `[^"]*` swallowed following tags, which made
  // consecutive unquoted `<div class=card>` elements count as a single match.
  const cardDivs = html.match(/<div\b[^>]*\bclass=["']?card[^"'>]*["']?[^>]*>/gi) || []
  return cardDivs.length > 2
}
325
+
326
+ // ─── Handlers por tipo de página ──────────────────────────────────
327
+
328
/**
 * Handles CATALOG_HOME pages (freefrontend.com).
 * Extracts the technology category links from the nav and follows them.
 *
 * `isSameDomain` and `ollamaPlan` are accepted so that all page handlers share
 * one signature; this handler does not use them.
 *
 * @param {string} url - URL of the catalog page
 * @param {string} html - Page HTML
 * @param {number} depth - Depth of this page
 * @param {number} maxDepth - Maximum crawl depth
 * @param {Set} visited - URLs already visited
 * @param {Object} tree - Result node for this branch (mutated)
 * @param {Object} opts - Options (verbose, concurrency, ...)
 * @param {Function} isSameDomain - Same-host predicate (unused)
 * @param {Object|null} [ollamaPlan] - Analyzer plan (unused)
 */
async function handleCatalogHome(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  // Category links from the nav/sidebar
  const links = extractCategoryLinks(html, url)

  if (opts.verbose) {
    console.log(chalk.gray(` ${" ".repeat(depth)}📂 Categorías encontradas: ${links.length}`))
  }

  const childNode = {
    url,
    type: PAGE_TYPES.CATALOG_HOME,
    title: extractTitle(html, url),
    children: [],
    items: [],
  }

  // Follow each category (with limited concurrency)
  await crawlChildLinks(links, childNode, depth, maxDepth, visited, tree, opts)

  tree.children.push(childNode)
}

/**
 * Handles CATEGORY_INDEX pages (/tailwind-code-examples/).
 * Extracts the subcategory links, optionally narrows them with the analyzer
 * plan, and follows them.
 *
 * `isSameDomain` is accepted for signature parity and is not used here.
 *
 * @param {string} url - URL of the category page
 * @param {string} html - Page HTML
 * @param {number} depth - Depth of this page
 * @param {number} maxDepth - Maximum crawl depth
 * @param {Set} visited - URLs already visited
 * @param {Object} tree - Result node for this branch (mutated)
 * @param {Object} opts - Options (verbose, concurrency, ...)
 * @param {Function} isSameDomain - Same-host predicate (unused)
 * @param {Object|null} [ollamaPlan] - Analyzer plan; `linkFilter` and `links` are read
 */
async function handleCategoryIndex(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  const indent = " ".repeat(depth)
  let links = extractSubcategoryLinks(html, url)

  // If Ollama suggested a link filter, use it to prioritize. The plan comes
  // from a language model, so its fields are type-checked before use.
  const linkFilter = ollamaPlan?.linkFilter
  if (typeof linkFilter === "string" && linkFilter && links.length > 0) {
    const filter = linkFilter.toLowerCase()
    const filtered = links.filter((l) => l.toLowerCase().includes(filter))
    if (filtered.length > 0 && filtered.length < links.length) {
      if (opts.verbose) {
        console.log(
          chalk.gray(
            ` ${indent}🧠 Ollama filter: "${filter}" → ${filtered.length}/${links.length} links`
          )
        )
      }
      links = filtered
    }
  }

  // If Ollama detected specific links, prefer them
  if (Array.isArray(ollamaPlan?.links) && ollamaPlan.links.length > 0) {
    const ollamaUrls = ollamaPlan.links
      .filter((l) => l?.type === "category" || l?.type === "subcategory")
      .map((l) => {
        try {
          return l.url.startsWith("http") ? l.url : new URL(l.url, url).href
        } catch {
          return null
        }
      })
      .filter(Boolean)

    if (ollamaUrls.length > 0) {
      // Keep only the Ollama links that the heuristics also extracted
      const heuristicSet = new Set(links)
      const ollamaMatched = ollamaUrls.filter((u) => heuristicSet.has(u))
      if (ollamaMatched.length > 0) {
        links = ollamaMatched
        if (opts.verbose) {
          console.log(
            chalk.gray(
              ` ${indent}🧠 Ollama links: ${ollamaMatched.length}/${ollamaUrls.length} matched`
            )
          )
        }
      }
    }
  }

  if (opts.verbose) {
    console.log(chalk.gray(` ${indent}📁 Subcategorías: ${links.length}`))
  }

  const childNode = {
    url,
    type: PAGE_TYPES.CATEGORY_INDEX,
    title: extractTitle(html, url),
    children: [],
    items: [],
  }

  await crawlChildLinks(links, childNode, depth, maxDepth, visited, tree, opts)

  tree.children.push(childNode)
}

/**
 * Crawls `links` one level deeper, in batches of `opts.concurrency`, attaching
 * every non-empty subtree to `parentNode` and folding its counters into
 * `tree.stats`.
 */
async function crawlChildLinks(links, parentNode, depth, maxDepth, visited, tree, opts) {
  // A step below 1 would never advance the loop, and a string step would
  // concatenate instead of add, so coerce to an integer of at least 1.
  const requested = Math.trunc(Number(opts.concurrency))
  const batchSize = requested >= 1 ? requested : 1

  for (let i = 0; i < links.length; i += batchSize) {
    const batch = links.slice(i, i + batchSize)
    await Promise.all(
      batch.map(async (link) => {
        const childTree = {
          url: link,
          children: [],
          items: [],
          stats: { pagesScraped: 0, itemsExtracted: 0 },
        }
        await scrapeRecursive(link, depth + 1, maxDepth, visited, childTree, opts)

        const { pagesScraped, itemsExtracted, codepenLinks } = childTree.stats
        if (pagesScraped > 0 || itemsExtracted > 0) {
          parentNode.children.push(childTree)
          tree.stats.pagesScraped += pagesScraped
          tree.stats.itemsExtracted += itemsExtracted
          // CodePen counts are recorded on the item-list branch; without this
          // they never reached the root summary.
          if (codepenLinks > 0) {
            tree.stats.codepenLinks = (tree.stats.codepenLinks || 0) + codepenLinks
          }
        }
      })
    )
  }
}
482
+
483
/**
 * Handles ITEM_LIST pages (/tailwind-buttons/).
 * Extracts every card with its metadata, records them in the tree, saves each
 * one as knowledge and keeps a running count of cards that carry a CodePen slug.
 *
 * Only `url`, `html`, `depth`, `tree` and `opts` are read; the remaining
 * parameters exist so that all page handlers share one signature.
 */
async function handleItemList(
  url,
  html,
  depth,
  maxDepth,
  visited,
  tree,
  opts,
  isSameDomain,
  ollamaPlan = null
) {
  const cards = extractCards(html, url)
  const { stats } = tree

  if (opts.verbose) {
    const indent = " ".repeat(depth)
    console.log(chalk.gray(` ${indent}🎴 Cards: ${cards.length}`))
  }

  tree.children.push({
    url,
    type: PAGE_TYPES.ITEM_LIST,
    title: extractTitle(html, url),
    children: [],
    items: cards,
  })
  stats.itemsExtracted += cards.length

  // Persist the cards one at a time, in page order
  for (let index = 0; index < cards.length; index += 1) {
    await saveCard(cards[index], opts.domain)
  }

  // Count the cards that link to CodePen
  const withCodePen = cards.reduce((total, card) => (card.codepenSlug ? total + 1 : total), 0)
  stats.codepenLinks = (stats.codepenLinks || 0) + withCodePen
}
525
+
526
+ // ─── Extractores de estructura ─────────────────────────────────────
527
+
528
/**
 * Extracts the technology category links from the homepage nav/sidebar.
 *
 * Every href is resolved against `baseUrl`; a link is kept when it points to
 * the same site (ignoring a leading "www."), its path mentions one of the
 * known technology patterns, and it is not a pagination link. Hrefs that
 * cannot be parsed are skipped.
 *
 * @param {string} html - Page HTML
 * @param {string} baseUrl - Absolute URL of the page the HTML came from
 * @returns {string[]} Unique absolute URLs, in document order
 */
function extractCategoryLinks(html, baseUrl) {
  // Patterns that identify a technology category in the path
  const techPatterns = [
    "code-examples",
    "html",
    "css",
    "javascript",
    "js",
    "bootstrap",
    "tailwind",
    "react",
    "vue",
    "angular",
    "svelte",
    "jquery",
    "typescript",
  ]

  // `baseUrl` is a string, so it has to be parsed before its origin/host can
  // be compared; reading `.origin` off the string yields undefined.
  const base = new URL(baseUrl)
  const siteHost = base.hostname.replace(/^www\./, "")
  const categoryUrls = new Set()

  for (const href of extractAllLinks(html)) {
    let target
    try {
      target = new URL(href, base)
    } catch {
      continue
    }
    if (target.hostname.replace(/^www\./, "") !== siteHost) continue

    // Match on the path only, so the host name can never satisfy a pattern.
    const path = target.pathname
    if (path.includes("page/")) continue
    if (techPatterns.some((pattern) => path.includes(pattern))) {
      // Deduplicate after resolution: "/x/" and "https://site/x/" are one page.
      categoryUrls.add(target.href)
    }
  }

  return [...categoryUrls]
}
559
+
560
+ /**
561
+ * Extrae links de subcategorías de una página de categoría.
562
+ * Ej: /tailwind-code-examples/ → /tailwind-buttons/, /tailwind-cards/, etc.
563
+ */
564
+ function extractSubcategoryLinks(html, baseUrl) {
565
+ const links = extractAllLinks(html)
566
+ const currentPath = new URL(baseUrl).pathname.replace(/\/$/, "")
567
+ const pathPrefix = currentPath.split("/").pop() || ""
568
+ const tech = pathPrefix.split("-code")[0] || pathPrefix
569
+
570
+ return [
571
+ ...new Set(
572
+ links.filter((l) => {
573
+ const path = typeof l === "string" ? l.replace(baseUrl.origin, "") : ""
574
+ // Debe ser un link a una subcategoría del mismo tipo
575
+ return (
576
+ path.includes(tech) &&
577
+ path !== currentPath && // excluir la misma página
578
+ !path.includes("page/") &&
579
+ !path.includes("code-examples") &&
580
+ path.split("/").filter(Boolean).length <= 3
581
+ )
582
+ })
583
+ ),
584
+ ]
585
+ .map((l) => (l.startsWith("http") ? l : new URL(l, baseUrl).href))
586
+ .slice(0, 100)
587
+ }
588
+
589
/**
 * Extracts the href values found in an HTML string.
 *
 * Surrounding quotes are removed. Fragment links, javascript:/mailto:/tel:
 * URLs, links that mention google/facebook/twitter, static assets
 * (.css, .js, .png, .jpg, .svg, .ico, .xml) and values of two characters or
 * fewer are dropped. Values are returned as written (not resolved) and may
 * contain duplicates.
 *
 * @param {string} html - Page HTML
 * @returns {string[]} href values in document order
 */
function extractAllLinks(html) {
  // Match the scheme including its colon; a bare prefix test would also drop
  // relative paths such as "javascript-examples/" or "television/".
  const skippedScheme = /^(?:javascript|mailto|tel):/i
  const skippedHosts = ["google", "facebook", "twitter"]
  const assetExtension = /\.(?:css|js|png|jpg|svg|ico|xml)$/i

  return [...html.matchAll(/href=(\S+?)(?:\s|>)/g)]
    .map(([, raw]) => {
      const isDoubleQuoted = raw.startsWith('"') && raw.endsWith('"')
      const isSingleQuoted = raw.startsWith("'") && raw.endsWith("'")
      return isDoubleQuoted || isSingleQuoted ? raw.slice(1, -1) : raw
    })
    .filter((href) => {
      if (!href || href.length <= 2) return false
      if (href.startsWith("#") || skippedScheme.test(href)) return false
      if (skippedHosts.some((name) => href.includes(name))) return false
      // Look at the path only: "style.css?ver=3" is still a stylesheet.
      const withoutQuery = href.split(/[?#]/)[0]
      return !assetExtension.test(withoutQuery)
    })
}
623
+
624
+ // ─── Extractores de contenido ──────────────────────────────────────
625
+
626
/**
 * Extracts the title of an HTML page.
 *
 * Falls back to the URL's hostname when the page has no `<title>` element or
 * when the title is empty or only whitespace.
 *
 * @param {string} html - Page HTML
 * @param {string} url - Absolute page URL, used for the fallback
 * @returns {string}
 */
function extractTitle(html, url) {
  // `[^>]*` accepts attributes on the tag; `[^<]*` lets an empty title match so
  // that it reaches the hostname fallback instead of yielding "".
  const titleMatch = html.match(/<title\b[^>]*>([^<]*)<\/title>/i)
  const title = titleMatch ? titleMatch[1].trim() : ""
  return title || new URL(url).hostname
}
634
+
635
/**
 * Extracts all the cards of an ITEM_LIST page.
 *
 * freefrontend.com has 2 kinds of cards:
 *
 * Type A (CodePen popover, ~12 per page):
 *   <div class=card>
 *     <div class=card-header>
 *       <h3>Title</h3>
 *       <button popovertarget=XXX>demo & code</button>
 *     </div>
 *     <div class=card-content>...</div>
 *     <div class=card-meta-info>...</div>
 *   </div>
 *   <div id=XXX popover><p class=codepen data-slug-hash=XXX></div>
 *
 * Type B (external link, ~18 per page, <div class=snippet-card>):
 *   <div class=snippet-card>
 *     <div class=card-header>
 *       <h3><a href=https://...>Title<img class=external></a></h3>
 *     </div>
 *     <video/img ...>
 *   </div>
 *
 * Type B cards are only collected when at least one Type A card was found.
 *
 * @param {string} html - Page HTML
 * @param {string} baseUrl - URL of the page, stored as each card's sourceUrl
 * @returns {Object[]} Cards with title, description, technologies, difficulty,
 *   browserSupport, features, requires, codepenSlug, demoUrl, sourceUrl, type
 */
function extractCards(html, baseUrl) {
  // Attribute values may or may not be quoted; ids and popover targets are
  // compared with each other, so both sides must be normalized the same way.
  const withoutQuotes = (value) => value.replace(/["']/g, "")

  // ── Phase 1: build the popover ID → slug map ──
  const popoverMap = new Map()
  const popoverRegex = /<div[^>]*\bid=(\S+?)[\s>][^>]*\bpopover\b[^>]*>([\s\S]{0,2000}?)<\/div>/gi
  let pMatch
  while ((pMatch = popoverRegex.exec(html)) !== null) {
    const id = withoutQuotes(pMatch[1])
    // The optional quote lets a quoted data-slug-hash match as well.
    const slugMatch = pMatch[2].match(/data-slug-hash=["']?([^\s>"']+)/)
    if (slugMatch) {
      popoverMap.set(id, slugMatch[1])
    }
  }

  const cards = []

  // ── Type A: cards with a CodePen popover ──
  // The real structure has a NESTED card-content:
  //   <div class=card-header>...<button popovertarget=XXX>...</div>
  //   <div class=card-content>
  //     <div class=card-content>   ← nested!
  //       <p>Description</p>
  //     </div>
  //   </div>
  //   <div class=card-meta-info>...</div>
  //
  // Strategy: find a card-header that contains popovertarget, then the
  // card-meta-info after it; the content sits in between.
  const typeAPattern = /<div class=card-header>([\s\S]{0,800}?)<\/div>/gi
  let aMatch
  while ((aMatch = typeAPattern.exec(html)) !== null) {
    const header = aMatch[1]
    const headerStart = aMatch.index

    // Only process headers with a popovertarget (Type A)
    const rawTarget = (header.match(/popovertarget=(\S+?)(?:\s|>)/) || [])[1]
    const popoverTarget = rawTarget ? withoutQuotes(rawTarget) : ""
    if (!popoverTarget) continue

    const title = extractCardTitle(header)
    const codepenSlug = popoverMap.get(popoverTarget) || null

    // Look for card-meta-info AFTER the header (within ~5000 chars)
    const afterHeader = html.slice(headerStart, headerStart + 5000)
    const metaMatch = afterHeader.match(
      /<div class=card-meta-info>([\s\S]{0,3000}?)<\/div>\s*<\/div>/
    )
    const meta = metaMatch ? metaMatch[0] : ""

    // The description lives between card-header and card-meta-info
    const contentEnd = metaMatch
      ? afterHeader.indexOf("<div class=card-meta-info>")
      : afterHeader.length
    const contentSection = afterHeader.slice(0, contentEnd)

    const desc = extractCardDescription(contentSection)

    cards.push({
      title,
      description: desc,
      technologies: extractTechnologies(meta),
      difficulty: extractDifficulty(meta),
      browserSupport: extractBrowserSupport(meta),
      features: extractFeatures(meta),
      requires: extractRequires(contentSection),
      codepenSlug,
      demoUrl: codepenSlug ? `https://codepen.io/anon/pen/${codepenSlug}` : null,
      sourceUrl: baseUrl,
      type: "codepen",
    })
  }

  // ── Type B: snippet-cards (external links) ──
  // Only extract Type B when there is AT LEAST one Type A card: that means we
  // are on a subcategory page with components, not on a category page that
  // only lists subcategories.
  const hasTypeA = cards.some((c) => c.type === "codepen")
  if (!hasTypeA) return cards

  // Pattern: <div class=snippet-card>...<div class=card-header>...<a href=...
  const typeBPattern = /<div class=snippet-card>([\s\S]{0,3000}?)<\/div>\s*<\/div>/gi
  let bMatch
  while ((bMatch = typeBPattern.exec(html)) !== null) {
    const section = bMatch[1]
    const headerMatch = section.match(/<div class=card-header>([\s\S]{0,800}?)<\/div>/)
    if (!headerMatch) continue

    const header = headerMatch[1]
    const title = extractCardTitle(header)

    // External link: only absolute http(s) URLs are kept
    const linkMatch = header.match(/href=(\S+?)(?:\s|>)/)
    let demoUrl = null
    if (linkMatch) {
      let link = linkMatch[1]
      if (
        (link.startsWith('"') && link.endsWith('"')) ||
        (link.startsWith("'") && link.endsWith("'"))
      ) {
        link = link.slice(1, -1)
      }
      if (link.startsWith("http")) {
        demoUrl = link
      }
    }

    const desc = extractCardDescription(section)

    // Use the video/image label as a description of last resort
    const videoMatch = section.match(/<video[^>]*aria-label="([^"]+)"/)
    const imgMatch = section.match(/<img[^>]*alt="([^"]+)"/)
    const mediaDesc = videoMatch ? videoMatch[1] : imgMatch ? imgMatch[1] : null

    cards.push({
      title,
      description: desc || mediaDesc || "",
      technologies: [],
      difficulty: null,
      browserSupport: [],
      features: [],
      requires: [],
      codepenSlug: null,
      demoUrl,
      sourceUrl: baseUrl,
      type: "external",
    })
  }

  return cards
}
789
+
790
/**
 * Extracts the title of a card from its header markup.
 *
 * The text of the first h2/h3/h4 is used with nested tags removed, so a title
 * wrapped in a link (`<h3><a href=...>Title<img></a></h3>`) is read correctly.
 * Returns "Untitled" when no heading text is found.
 *
 * @param {string} header - HTML of the card header
 * @returns {string}
 */
function extractCardTitle(header) {
  const closedHeading = header.match(/<h([234])\b[^>]*>([\s\S]*?)<\/h\1\s*>/i)
  // For a heading that is never closed, fall back to the text that directly
  // follows the opening tag.
  const rawTitle = closedHeading
    ? closedHeading[2]
    : (header.match(/<h[234][^>]*>([^<]+)/) || [])[1] || ""

  const title = rawTitle
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim()

  return title || "Untitled"
}
797
+
798
/**
 * Extracts the description of a card.
 *
 * Considers every attribute-less `<p>` whose text (tags removed) is longer
 * than 20 characters and picks the one with the longest raw markup; on a tie
 * the earliest wins. The result has tags removed, a handful of HTML entities
 * decoded and whitespace collapsed. Returns "" when nothing qualifies.
 *
 * @param {string} content - HTML of the card content
 * @returns {string}
 */
function extractCardDescription(content) {
  const paragraphs = Array.from(content.matchAll(/<p>([\s\S]{0,2000}?)<\/p>/gi), (m) => m[1])
  if (paragraphs.length === 0) return ""

  // Keep the longest paragraph (the main one, not short snippets)
  let longest = ""
  for (const paragraph of paragraphs) {
    const textLength = paragraph.replace(/<[^>]+>/g, "").trim().length
    if (textLength > 20 && paragraph.length > longest.length) {
      longest = paragraph
    }
  }

  // Applied strictly in this order
  const cleanupSteps = [
    [/<[^>]+>/g, " "],
    [/&rsquo;/g, "'"],
    [/&ldquo;/g, '"'],
    [/&rdquo;/g, '"'],
    [/&amp;/g, "&"],
    [/&nbsp;/g, " "],
    [/\s+/g, " "],
  ]

  let description = longest
  for (const [pattern, replacement] of cleanupSteps) {
    description = description.replace(pattern, replacement)
  }
  return description.trim()
}
823
+
824
/**
 * Collects the technology labels found in a card's metadata markup: the text
 * of every span whose class is `meta-value tech-<name>`.
 *
 * @param {string} meta - HTML of the card metadata
 * @returns {string[]} Trimmed labels in document order
 */
function extractTechnologies(meta) {
  const techSpan = /<span class="meta-value tech-([^"]+)"[^>]*>([^<]+)<\/span>/gi
  const labels = []
  for (const [, , label] of meta.matchAll(techSpan)) {
    labels.push(label.trim())
  }
  return labels
}
831
+
832
/**
 * Reads the difficulty level from a card's metadata markup: the text of the
 * span that follows the "Difficulty:" label.
 *
 * @param {string} meta - HTML of the card metadata
 * @returns {string|null} Trimmed difficulty, or null when the label is absent
 */
function extractDifficulty(meta) {
  const found = /Difficulty:<\/span>\s*\n?\s*<span[^>]*>([^<]+)/i.exec(meta)
  if (found === null) return null
  return found[1].trim()
}
839
+
840
/**
 * Collects the browser names listed in a card's metadata markup: for every
 * `meta-value browser-tag` span, the text of the first plain `<span>` found
 * within the next 200 characters.
 *
 * @param {string} meta - HTML of the card metadata
 * @returns {string[]} Trimmed browser names in document order
 */
function extractBrowserSupport(meta) {
  const browserTag = /<span class="meta-value browser-tag">[\s\S]{0,200}?<span>([^<]+)<\/span>/gi
  return Array.from(meta.matchAll(browserTag), (hit) => hit[1].trim())
}
849
+
850
/**
 * Collects the feature labels of a card: the text of every unquoted
 * `class=meta-value` span, excluding technology names, difficulty levels and
 * labels of two characters or fewer. At most ten labels are returned.
 *
 * @param {string} meta - HTML of the card metadata
 * @returns {string[]} Trimmed feature labels in document order
 */
function extractFeatures(meta) {
  // Values that share the same markup but are not features
  const notFeatures = new Set([
    "HTML",
    "CSS",
    "JavaScript",
    "TypeScript",
    "JS",
    "TS",
    "Beginner",
    "Intermediate",
    "Advanced",
  ])

  const features = []
  for (const [, rawLabel] of meta.matchAll(/<span class=meta-value>([^<]+)<\/span>/g)) {
    const label = rawLabel.trim()
    if (label.length > 2 && !notFeatures.has(label)) {
      features.push(label)
    }
  }
  return features.slice(0, 10)
}
863
+
864
/**
 * Extracts what a card requires: the comma-separated list that follows a
 * "Require:" / "Requires:" label, up to the next tag.
 *
 * @param {string} content - HTML of the card content
 * @returns {string[]} Trimmed, non-empty entries (empty when there is no label)
 */
function extractRequires(content) {
  const found = /Requires?:\s*([^<]+)/i.exec(content)
  if (found === null) return []

  const requirements = []
  for (const piece of found[1].split(",")) {
    const entry = piece.trim()
    if (entry) requirements.push(entry)
  }
  return requirements
}
875
+
876
/**
 * Finds the first CodePen slug hash (`data-slug-hash=...`) within the 5000
 * characters of HTML that start at the card's index.
 *
 * @param {string} html - Page HTML
 * @param {number} cardIndex - Index in `html` where the search starts
 * @returns {string|null} The slug, or null when none is found in the window
 */
function extractCodePenSlug(html, cardIndex) {
  const searchWindow = html.slice(cardIndex, cardIndex + 5000)
  const found = /data-slug-hash=([^\s>"']+)/.exec(searchWindow)
  if (found === null) return null
  return found[1]
}
886
+
887
+ // ─── Fetch ─────────────────────────────────────────────────────────
888
+
889
/**
 * Downloads a page and returns its body as text.
 *
 * The request carries the scraper's User-Agent, asks for HTML and is aborted
 * after REQUEST_TIMEOUT milliseconds. A non-OK status or any failure is
 * reported as a warning on the console and yields null.
 *
 * @param {string} url
 * @returns {Promise<string|null>}
 */
async function fetchPage(url) {
  const warn = (message) => console.log(chalk.yellow(message))

  try {
    const requestInit = {
      headers: {
        "User-Agent": USER_AGENT,
        Accept: "text/html,application/xhtml+xml",
      },
      signal: AbortSignal.timeout(REQUEST_TIMEOUT),
    }
    const res = await fetch(url, requestInit)

    if (res.ok) {
      return await res.text()
    }

    warn(` ⚠️ HTTP ${res.status}: ${url}`)
    return null
  } catch (err) {
    warn(` ⚠️ Fetch error: ${url} - ${err.message}`)
    return null
  }
}
916
+
917
+ // ─── Guardado ──────────────────────────────────────────────────────
918
+
919
/**
 * Saves an extracted card under the knowledge/ structure as
 * `<knowledgeDir>/<domain>/components/<slug>/meta.json` and `page.md`.
 *
 * Saving is best-effort and never throws. Nothing is written when no knowledge
 * directory is available. A failure to load the knowledge helpers is ignored
 * silently; any other failure is reported as a console warning.
 *
 * @param {Object} card - Card data
 * @param {string} domain - OPL domain
 * @returns {Promise<void>}
 */
async function saveCard(card, domain) {
  try {
    const { mkdirSync, writeFileSync } = await import("fs")
    const { join } = await import("path")
    const { getProjectKnowledgeDir } = await import("../../commands/knowledge-helpers.js")

    const knowledgeDir = getProjectKnowledgeDir()
    if (!knowledgeDir) return

    // A title without any slug-safe character yields "", which would make the
    // files land in the shared "components" directory itself.
    const slug = slugify(card.title) || "untitled"
    const docDir = join(knowledgeDir, domain, "components", slug)
    mkdirSync(docDir, { recursive: true })

    // Save the metadata
    const meta = {
      title: card.title,
      description: card.description,
      technologies: card.technologies,
      difficulty: card.difficulty,
      browserSupport: card.browserSupport,
      features: card.features,
      requires: card.requires,
      codepen: card.demoUrl,
      codepenSlug: card.codepenSlug,
      source: card.sourceUrl,
      scrapedAt: new Date().toISOString(),
      type: "frontend-component",
      domain,
    }
    writeFileSync(join(docDir, "meta.json"), JSON.stringify(meta, null, 2), "utf-8")

    // Save the description as markdown
    const markdown = `# ${card.title}

${card.description}

## Metadata

- **Tecnologías:** ${card.technologies.join(", ") || "N/A"}
- **Dificultad:** ${card.difficulty || "N/A"}
- **Features:** ${card.features.join(", ") || "N/A"}
- **Browser Support:** ${card.browserSupport.join(", ") || "N/A"}

## Demo

${card.demoUrl || "No disponible"}
`
    writeFileSync(join(docDir, "page.md"), markdown, "utf-8")
  } catch (error) {
    // Not critical: the scrape continues either way.
    const reason = error?.message ?? String(error)
    // The knowledge helpers could not be loaded (knowledge dir not configured):
    // stay quiet, otherwise every single card would print the same warning.
    if (reason.includes("knowledge-helpers")) return
    console.log(chalk.yellow(` ⚠️ No se pudo guardar "${card?.title}": ${reason}`))
  }
}
980
+
981
/**
 * Generic extraction with Readability (fallback for unknown page types).
 *
 * When an article is found, its content is converted to Markdown and appended
 * to `tree.children` as an ARTICLE node, and `tree.stats.itemsExtracted` is
 * incremented. The extraction is best-effort: if the optional libraries are
 * missing or parsing fails, the tree is left untouched and the failure is only
 * reported in verbose mode.
 *
 * @param {string} url - Absolute page URL
 * @param {string} html - Page HTML
 * @param {Object} tree - Result node for this branch (mutated)
 * @param {Object} opts - Options; only `verbose` is read
 * @returns {Promise<void>}
 */
async function extractGenericContent(url, html, tree, opts) {
  let dom = null
  try {
    const { JSDOM } = await import("jsdom")
    dom = new JSDOM(html, { url })
    const { Readability } = await import("@mozilla/readability")
    const reader = new Readability(dom.window.document)
    const article = reader.parse()

    if (article && article.content) {
      const TurndownService = (await import("turndown")).default
      const turndown = new TurndownService({
        headingStyle: "atx",
        codeBlockStyle: "fenced",
      })
      const markdown = turndown.turndown(article.content)

      tree.children.push({
        url,
        type: PAGE_TYPES.ARTICLE,
        title: article.title,
        children: [],
        items: [{ title: article.title, content: markdown, sourceUrl: url }],
      })
      tree.stats.itemsExtracted++
    }
  } catch (error) {
    // Readability (or one of its companions) is unavailable or failed: skip
    // the page, but leave a trace when the caller asked for details.
    if (opts?.verbose) {
      const reason = error?.message ?? String(error)
      console.log(chalk.yellow(` ⚠️ Extracción genérica omitida: ${url} - ${reason}`))
    }
  } finally {
    // Close the window so each parsed document is released; a crawl parses
    // many pages and none of them is needed after this point.
    dom?.window.close()
  }
}
1013
+
1014
+ // ─── Utilities ──────────────────────────────────────────────────────
1015
+
1016
/**
 * Turns a text into a slug usable as an ID or directory name.
 *
 * Lowercases the text, drops every character that is not a-z, 0-9, an accented
 * Spanish letter (áéíóúüñ), whitespace or a hyphen, turns whitespace runs into
 * single hyphens and limits the result to 80 characters, without leading or
 * trailing hyphens. null/undefined yield "".
 *
 * @param {string} text
 * @returns {string}
 */
function slugify(text) {
  return String(text ?? "")
    .toLowerCase()
    .replace(/[^a-z0-9áéíóúüñ\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, 80)
    // Truncation can cut right after a separator, so trim once more.
    .replace(/-$/, "")
}