similarbuild 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +110 -0
- package/LICENSE +21 -0
- package/README.md +301 -0
- package/bin/install.js +256 -0
- package/lib/copy-templates.mjs +52 -0
- package/lib/install-deps.mjs +62 -0
- package/lib/prompt-config.mjs +83 -0
- package/lib/verify-env.mjs +19 -0
- package/package.json +63 -0
- package/scripts/sync-templates.mjs +71 -0
- package/templates/commands/build-page.md +490 -0
- package/templates/commands/build-site.md +548 -0
- package/templates/commands/clip-section.md +519 -0
- package/templates/memory/anti-patterns.md +212 -0
- package/templates/memory/design-knowledge.md +225 -0
- package/templates/memory/fixes.md +163 -0
- package/templates/memory/patterns.md +681 -0
- package/templates/presets/shopify-section.yaml +51 -0
- package/templates/presets/wp-elementor.yaml +49 -0
- package/templates/reports/fixtures/mock-run-1.json +115 -0
- package/templates/reports/fixtures/mock-run-2.json +72 -0
- package/templates/reports/report-renderer.mjs +218 -0
- package/templates/reports/report-template.html +571 -0
- package/templates/skills/sb-build-shopify/SKILL.md +104 -0
- package/templates/skills/sb-build-shopify/references/shopify-build-rules.md +563 -0
- package/templates/skills/sb-build-shopify/scripts/build-shopify.mjs +637 -0
- package/templates/skills/sb-build-shopify/scripts/tests/test-build-shopify.mjs +424 -0
- package/templates/skills/sb-build-wp/SKILL.md +83 -0
- package/templates/skills/sb-build-wp/references/wp-build-rules.md +376 -0
- package/templates/skills/sb-build-wp/scripts/build-wp.mjs +327 -0
- package/templates/skills/sb-build-wp/scripts/tests/test-build-wp.mjs +224 -0
- package/templates/skills/sb-compare-visual/SKILL.md +121 -0
- package/templates/skills/sb-compare-visual/scripts/compare-visual.mjs +387 -0
- package/templates/skills/sb-compare-visual/scripts/lib/compare-tokens.mjs +273 -0
- package/templates/skills/sb-compare-visual/scripts/tests/test-compare-tokens.mjs +350 -0
- package/templates/skills/sb-compare-visual/scripts/tests/test-compare-visual.mjs +626 -0
- package/templates/skills/sb-crawl-and-list/SKILL.md +99 -0
- package/templates/skills/sb-crawl-and-list/scripts/crawl-and-list.mjs +437 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/blocklist-filter.mjs +176 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/fallback-crawler.mjs +107 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/page-classifier.mjs +89 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/sitemap-parser.mjs +118 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-blocklist-filter.mjs +204 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs +276 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs +243 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs +120 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-sitemap-parser.mjs +157 -0
- package/templates/skills/sb-extract-assets/SKILL.md +112 -0
- package/templates/skills/sb-extract-assets/scripts/extract-assets.mjs +484 -0
- package/templates/skills/sb-extract-assets/scripts/tests/test-extract-assets.mjs +112 -0
- package/templates/skills/sb-inspect-live/SKILL.md +105 -0
- package/templates/skills/sb-inspect-live/scripts/inspect-live.mjs +693 -0
- package/templates/skills/sb-inspect-live/scripts/tests/test-inspect-live.mjs +181 -0
- package/templates/skills/sb-review-checks/SKILL.md +113 -0
- package/templates/skills/sb-review-checks/references/review-rules.md +195 -0
- package/templates/skills/sb-review-checks/scripts/lib/anti-patterns.mjs +379 -0
- package/templates/skills/sb-review-checks/scripts/lib/cross-reference.mjs +115 -0
- package/templates/skills/sb-review-checks/scripts/lib/design-quality.mjs +541 -0
- package/templates/skills/sb-review-checks/scripts/review-checks.mjs +250 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-anti-patterns.mjs +343 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-cross-reference.mjs +170 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-design-quality.mjs +493 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-review-checks.mjs +267 -0
- package/templates/skills/sb-tweak/SKILL.md +130 -0
- package/templates/skills/sb-tweak/references/tweak-patterns.md +157 -0
- package/templates/skills/sb-tweak/scripts/lib/diff-summarizer.mjs +140 -0
- package/templates/skills/sb-tweak/scripts/lib/element-locator.mjs +507 -0
- package/templates/skills/sb-tweak/scripts/lib/intent-parser.mjs +324 -0
- package/templates/skills/sb-tweak/scripts/tests/test-diff-summarizer.mjs +248 -0
- package/templates/skills/sb-tweak/scripts/tests/test-element-locator.mjs +418 -0
- package/templates/skills/sb-tweak/scripts/tests/test-intent-parser.mjs +496 -0
- package/templates/skills/sb-tweak/scripts/tests/test-tweak.mjs +407 -0
- package/templates/skills/sb-tweak/scripts/tweak.mjs +656 -0
- package/templates/skills/sb-validate-render/SKILL.md +120 -0
- package/templates/skills/sb-validate-render/scripts/tests/test-validate-render.mjs +304 -0
- package/templates/skills/sb-validate-render/scripts/validate-render.mjs +645 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
#!/usr/bin/env node
// test-crawl-and-list.mjs — Smoke tests for the main script.
// No network, no cheerio dep needed: we exercise --help, arg validation,
// and a local --sitemap-path round-trip (which doesn't touch fetch or cheerio).
//
// Exit-code contract asserted throughout: 0 = success, 2 = bad CLI arguments.
// On success the script prints a JSON report to stdout and also persists
// pages-list.json under --output-dir (both checked below).

import { spawnSync } from 'node:child_process'
import { fileURLToPath } from 'node:url'
import { dirname, resolve, join } from 'node:path'
import { mkdtempSync, writeFileSync, rmSync, existsSync, readFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { strict as assert } from 'node:assert'

// The script under test lives one directory above this tests/ folder.
const here = dirname(fileURLToPath(import.meta.url))
const SCRIPT = resolve(here, '..', 'crawl-and-list.mjs')

let passed = 0
let failed = 0

// Minimal TAP-style harness: run fn synchronously, print ok/not ok, tally.
function test(name, fn) {
  try {
    fn()
    process.stdout.write(`ok - ${name}\n`)
    passed++
  } catch (err) {
    process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
    failed++
  }
}

// Usage text must mention the script name and every supported flag.
test('--help exits 0 and prints usage', () => {
  const r = spawnSync('node', [SCRIPT, '--help'], { encoding: 'utf8' })
  assert.equal(r.status, 0, `exit code was ${r.status}`)
  assert.match(r.stdout, /crawl-and-list\.mjs/)
  assert.match(r.stdout, /--root-url/)
  assert.match(r.stdout, /--output-dir/)
  assert.match(r.stdout, /--max-depth/)
  assert.match(r.stdout, /--max-pages/)
  assert.match(r.stdout, /--blocklist/)
  assert.match(r.stdout, /--sitemap-path/)
})

// Each required/validated flag gets its own exit-2 + stderr-message check.
test('missing --root-url exits 2', () => {
  const r = spawnSync('node', [SCRIPT, '--output-dir', '/tmp/sb-crawl-test'], {
    encoding: 'utf8',
  })
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /missing --root-url/)
})

test('missing --output-dir exits 2', () => {
  const r = spawnSync('node', [SCRIPT, '--root-url', 'https://example.com'], {
    encoding: 'utf8',
  })
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /missing --output-dir/)
})

test('invalid --root-url exits 2', () => {
  const r = spawnSync(
    'node',
    [SCRIPT, '--root-url', 'not-a-url', '--output-dir', '/tmp/sb-crawl-test'],
    { encoding: 'utf8' },
  )
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /not a valid URL/)
})

test('non-http(s) --root-url exits 2', () => {
  const r = spawnSync(
    'node',
    [SCRIPT, '--root-url', 'ftp://example.com', '--output-dir', '/tmp/sb-crawl-test'],
    { encoding: 'utf8' },
  )
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /must be http\(s\)/)
})

test('non-numeric --max-depth exits 2', () => {
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      '/tmp/sb-crawl-test',
      '--max-depth',
      'foo',
    ],
    { encoding: 'utf8' },
  )
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /max-depth/)
})

test('non-numeric --max-pages exits 2', () => {
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      '/tmp/sb-crawl-test',
      '--max-pages',
      'zero',
    ],
    { encoding: 'utf8' },
  )
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /max-pages/)
})

test('--sitemap-path that does not exist exits 2', () => {
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      '/tmp/sb-crawl-test',
      '--sitemap-path',
      '/tmp/sb-does-not-exist-xyz.xml',
    ],
    { encoding: 'utf8' },
  )
  assert.equal(r.status, 2, `exit code was ${r.status}`)
  assert.match(r.stderr, /not found/)
})

// Happy path: a local sitemap drives classification, blocklist filtering,
// utm dedupe, and on-disk persistence — all in one end-to-end run.
test('valid --sitemap-path produces a pages-list.json with classified pages', () => {
  // Use /tmp because mkdtempSync template needs an existing parent.
  const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
  const sitemapPath = join(dir, 'sitemap.xml')
  writeFileSync(
    sitemapPath,
    `<?xml version="1.0"?>
<urlset>
<url><loc>https://example.com/</loc></url>
<url><loc>https://example.com/products/foo</loc></url>
<url><loc>https://example.com/collections/men</loc></url>
<url><loc>https://example.com/admin/users</loc></url>
<url><loc>https://example.com/cart</loc></url>
<url><loc>https://example.com/pages/about</loc></url>
<url><loc>https://example.com/products/foo?utm_source=tw</loc></url>
</urlset>`,
    'utf8',
  )
  const out = join(dir, 'out')
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      out,
      '--sitemap-path',
      sitemapPath,
      '--respect-robots-txt',
      'false',
    ],
    { encoding: 'utf8' },
  )
  // try/finally so the temp dir is removed even when an assertion throws.
  try {
    assert.equal(r.status, 0, `exit ${r.status}\nstderr:\n${r.stderr}\nstdout:\n${r.stdout}`)
    const json = JSON.parse(r.stdout)
    assert.equal(json.rootUrl, 'https://example.com/')
    assert.equal(json.source, 'sitemap-path')

    const types = json.pages.map((p) => p.type)
    assert.ok(types.includes('home'), `home missing: ${types}`)
    assert.ok(types.includes('pdp'))
    assert.ok(types.includes('collection'))
    assert.ok(types.includes('about'))

    // /admin and /cart must be filtered
    const urls = json.pages.map((p) => p.url)
    assert.ok(!urls.some((u) => u.includes('/admin')), 'admin leaked')
    assert.ok(!urls.some((u) => u.endsWith('/cart')), 'cart leaked')

    // utm-tagged URL must collapse onto its bare twin
    const fooHits = urls.filter((u) => u === 'https://example.com/products/foo')
    assert.equal(fooHits.length, 1, 'utm dedupe failed')
    assert.ok(json.duplicates >= 1, 'duplicate count not reported')

    // Counts add up: blocked count matches admin + cart (2)
    assert.equal(json.blocked, 2, `blocked was ${json.blocked}`)

    // pages-list.json file persisted
    assert.ok(existsSync(join(out, 'pages-list.json')))
    const fileJson = JSON.parse(readFileSync(join(out, 'pages-list.json'), 'utf8'))
    assert.equal(fileJson.pageCount, json.pageCount)
  } finally {
    rmSync(dir, { recursive: true, force: true })
  }
})

// 25 sitemap URLs capped at 10: the report must truncate AND say so in warnings.
test('--max-pages cap surfaces a warning and truncates', () => {
  const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
  const sitemapPath = join(dir, 'sitemap.xml')
  const urls = []
  for (let i = 0; i < 25; i++) urls.push(`<url><loc>https://example.com/products/item-${i}</loc></url>`)
  writeFileSync(sitemapPath, `<urlset>${urls.join('')}</urlset>`, 'utf8')
  const out = join(dir, 'out')
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      out,
      '--sitemap-path',
      sitemapPath,
      '--max-pages',
      '10',
      '--respect-robots-txt',
      'false',
    ],
    { encoding: 'utf8' },
  )
  try {
    assert.equal(r.status, 0)
    const json = JSON.parse(r.stdout)
    assert.equal(json.pageCount, 10)
    assert.ok(json.warnings.some((w) => w.startsWith('max-pages-cap:10-of-25')))
  } finally {
    rmSync(dir, { recursive: true, force: true })
  }
})

// A user-supplied --blocklist prefix adds to (does not replace) the defaults.
test('--blocklist additive: custom prefix drops matching URLs', () => {
  const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
  const sitemapPath = join(dir, 'sitemap.xml')
  writeFileSync(
    sitemapPath,
    `<urlset>
<url><loc>https://example.com/</loc></url>
<url><loc>https://example.com/preview/secret</loc></url>
<url><loc>https://example.com/products/foo</loc></url>
</urlset>`,
    'utf8',
  )
  const out = join(dir, 'out')
  const r = spawnSync(
    'node',
    [
      SCRIPT,
      '--root-url',
      'https://example.com',
      '--output-dir',
      out,
      '--sitemap-path',
      sitemapPath,
      '--blocklist',
      '/preview',
      '--respect-robots-txt',
      'false',
    ],
    { encoding: 'utf8' },
  )
  try {
    assert.equal(r.status, 0, r.stderr)
    const json = JSON.parse(r.stdout)
    const urls = json.pages.map((p) => p.url)
    assert.ok(!urls.some((u) => u.includes('/preview')), 'preview not blocked')
    assert.equal(json.blocked, 1)
  } finally {
    rmSync(dir, { recursive: true, force: true })
  }
})

// Summary + process exit code mirror the tallies (0 only if everything passed).
process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
process.exit(failed === 0 ? 0 : 1)
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Tests for lib/fallback-crawler.mjs — pure unit tests with injected
|
|
3
|
+
// fetcher + cheerio-shaped loadHtml. No real network or DOM required.
|
|
4
|
+
//
|
|
5
|
+
// We don't import cheerio here — instead we use a tiny shim that returns
|
|
6
|
+
// $('a[href]').each(cb) results matching what cheerio would yield. The
|
|
7
|
+
// fallback crawler doesn't depend on cheerio's full API, just the
|
|
8
|
+
// `$(selector).each` + `.attr('href')` chain.
|
|
9
|
+
|
|
10
|
+
import { strict as assert } from 'node:assert'
|
|
11
|
+
import { fallbackCrawl } from '../lib/fallback-crawler.mjs'
|
|
12
|
+
import { buildBlocklist } from '../lib/blocklist-filter.mjs'
|
|
13
|
+
|
|
14
|
+
let passed = 0
|
|
15
|
+
let failed = 0
|
|
16
|
+
|
|
17
|
+
async function test(name, fn) {
|
|
18
|
+
try {
|
|
19
|
+
await fn()
|
|
20
|
+
process.stdout.write(`ok - ${name}\n`)
|
|
21
|
+
passed++
|
|
22
|
+
} catch (err) {
|
|
23
|
+
process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
|
|
24
|
+
failed++
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Build a tiny "$" replacement: parses very simple HTML for href attributes
|
|
29
|
+
// in <a> tags and exposes them via the cheerio-ish API the crawler uses.
|
|
30
|
+
function makeFakeLoadHtml() {
|
|
31
|
+
return (html) => {
|
|
32
|
+
const hrefs = []
|
|
33
|
+
const re = /<a\b[^>]*\bhref\s*=\s*"([^"]*)"[^>]*>/gi
|
|
34
|
+
let m
|
|
35
|
+
while ((m = re.exec(html)) !== null) hrefs.push(m[1])
|
|
36
|
+
function $(sel) {
|
|
37
|
+
// Only `a[href]` is used.
|
|
38
|
+
return {
|
|
39
|
+
each(cb) {
|
|
40
|
+
hrefs.forEach((h, i) =>
|
|
41
|
+
cb(i, {
|
|
42
|
+
// The crawler does $(el).attr('href') — provide that path.
|
|
43
|
+
}),
|
|
44
|
+
)
|
|
45
|
+
},
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
// The crawler does $(el).attr('href'); to make that work, $(el) must
|
|
49
|
+
// accept the (i, el) wrapping above. Easiest path: index-based lookup.
|
|
50
|
+
// Re-implement so the .each callback re-wraps element through $:
|
|
51
|
+
function dollar(arg) {
|
|
52
|
+
if (typeof arg === 'string') {
|
|
53
|
+
// selector path
|
|
54
|
+
return {
|
|
55
|
+
each(cb) {
|
|
56
|
+
hrefs.forEach((h, i) => {
|
|
57
|
+
cb(i, { __href: h })
|
|
58
|
+
})
|
|
59
|
+
},
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
// element wrapper
|
|
63
|
+
return {
|
|
64
|
+
attr(name) {
|
|
65
|
+
if (name === 'href') return arg.__href
|
|
66
|
+
return undefined
|
|
67
|
+
},
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
return dollar
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
const blocklistPrefixes = buildBlocklist()
|
|
75
|
+
|
|
76
|
+
await test('crawls a single page with internal links and respects max-depth', async () => {
|
|
77
|
+
const pages = {
|
|
78
|
+
'https://example.com/': `
|
|
79
|
+
<html>
|
|
80
|
+
<a href="/about">About</a>
|
|
81
|
+
<a href="/products/foo">Foo</a>
|
|
82
|
+
<a href="https://other.com/external">Out</a>
|
|
83
|
+
</html>`,
|
|
84
|
+
'https://example.com/about': '<html>about</html>',
|
|
85
|
+
'https://example.com/products/foo': '<html>foo</html>',
|
|
86
|
+
}
|
|
87
|
+
const fetcher = async (url) => {
|
|
88
|
+
if (url in pages) return { ok: true, status: 200, text: pages[url] }
|
|
89
|
+
return { ok: false, status: 404 }
|
|
90
|
+
}
|
|
91
|
+
const loadHtml = makeFakeLoadHtml()
|
|
92
|
+
const r = await fallbackCrawl({
|
|
93
|
+
rootUrl: 'https://example.com/',
|
|
94
|
+
fetcher,
|
|
95
|
+
loadHtml,
|
|
96
|
+
maxDepth: 2,
|
|
97
|
+
maxPages: 50,
|
|
98
|
+
blocklistPrefixes,
|
|
99
|
+
sitemapUrl: null,
|
|
100
|
+
})
|
|
101
|
+
const urls = r.results.map((p) => p.url).sort()
|
|
102
|
+
assert.deepEqual(urls, [
|
|
103
|
+
'https://example.com/',
|
|
104
|
+
'https://example.com/about',
|
|
105
|
+
'https://example.com/products/foo',
|
|
106
|
+
])
|
|
107
|
+
assert.ok(!r.warnings.includes('spa-suspected'))
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
await test('skips off-origin links', async () => {
|
|
111
|
+
const fetcher = async (url) => {
|
|
112
|
+
if (url === 'https://example.com/')
|
|
113
|
+
return {
|
|
114
|
+
ok: true,
|
|
115
|
+
status: 200,
|
|
116
|
+
text: '<html><a href="https://elsewhere.com/x">x</a></html>',
|
|
117
|
+
}
|
|
118
|
+
return { ok: false, status: 404 }
|
|
119
|
+
}
|
|
120
|
+
const loadHtml = makeFakeLoadHtml()
|
|
121
|
+
const r = await fallbackCrawl({
|
|
122
|
+
rootUrl: 'https://example.com/',
|
|
123
|
+
fetcher,
|
|
124
|
+
loadHtml,
|
|
125
|
+
maxDepth: 2,
|
|
126
|
+
maxPages: 50,
|
|
127
|
+
blocklistPrefixes,
|
|
128
|
+
sitemapUrl: null,
|
|
129
|
+
})
|
|
130
|
+
// Only the root + spa-suspected because root had 0 internal links
|
|
131
|
+
assert.deepEqual(r.results.map((p) => p.url), ['https://example.com/'])
|
|
132
|
+
assert.ok(r.warnings.includes('spa-suspected'))
|
|
133
|
+
})
|
|
134
|
+
|
|
135
|
+
await test('flags spa-suspected when entry has no internal hrefs', async () => {
|
|
136
|
+
const fetcher = async () => ({ ok: true, status: 200, text: '<html><script>app()</script></html>' })
|
|
137
|
+
const loadHtml = makeFakeLoadHtml()
|
|
138
|
+
const r = await fallbackCrawl({
|
|
139
|
+
rootUrl: 'https://example.com/',
|
|
140
|
+
fetcher,
|
|
141
|
+
loadHtml,
|
|
142
|
+
maxDepth: 3,
|
|
143
|
+
maxPages: 200,
|
|
144
|
+
blocklistPrefixes,
|
|
145
|
+
sitemapUrl: null,
|
|
146
|
+
})
|
|
147
|
+
assert.equal(r.results.length, 1)
|
|
148
|
+
assert.ok(r.warnings.includes('spa-suspected'))
|
|
149
|
+
})
|
|
150
|
+
|
|
151
|
+
await test('does NOT flag spa-suspected when crawl finds many pages', async () => {
|
|
152
|
+
const pages = {
|
|
153
|
+
'https://example.com/': `<a href="/a">A</a><a href="/b">B</a><a href="/c">C</a>`,
|
|
154
|
+
'https://example.com/a': '<html>a</html>',
|
|
155
|
+
'https://example.com/b': '<html>b</html>',
|
|
156
|
+
'https://example.com/c': '<html>c</html>',
|
|
157
|
+
}
|
|
158
|
+
const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
|
|
159
|
+
const loadHtml = makeFakeLoadHtml()
|
|
160
|
+
const r = await fallbackCrawl({
|
|
161
|
+
rootUrl: 'https://example.com/',
|
|
162
|
+
fetcher,
|
|
163
|
+
loadHtml,
|
|
164
|
+
maxDepth: 2,
|
|
165
|
+
maxPages: 50,
|
|
166
|
+
blocklistPrefixes,
|
|
167
|
+
sitemapUrl: null,
|
|
168
|
+
})
|
|
169
|
+
assert.equal(r.results.length, 4)
|
|
170
|
+
assert.ok(!r.warnings.includes('spa-suspected'))
|
|
171
|
+
})
|
|
172
|
+
|
|
173
|
+
await test('respects maxPages cap', async () => {
|
|
174
|
+
const pages = {
|
|
175
|
+
'https://example.com/': `<a href="/a">A</a><a href="/b">B</a><a href="/c">C</a><a href="/d">D</a>`,
|
|
176
|
+
'https://example.com/a': '<html>a</html>',
|
|
177
|
+
'https://example.com/b': '<html>b</html>',
|
|
178
|
+
'https://example.com/c': '<html>c</html>',
|
|
179
|
+
'https://example.com/d': '<html>d</html>',
|
|
180
|
+
}
|
|
181
|
+
const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
|
|
182
|
+
const loadHtml = makeFakeLoadHtml()
|
|
183
|
+
const r = await fallbackCrawl({
|
|
184
|
+
rootUrl: 'https://example.com/',
|
|
185
|
+
fetcher,
|
|
186
|
+
loadHtml,
|
|
187
|
+
maxDepth: 2,
|
|
188
|
+
maxPages: 2,
|
|
189
|
+
blocklistPrefixes,
|
|
190
|
+
sitemapUrl: null,
|
|
191
|
+
})
|
|
192
|
+
assert.equal(r.results.length, 2)
|
|
193
|
+
})
|
|
194
|
+
|
|
195
|
+
await test('does not enqueue blocklisted paths', async () => {
|
|
196
|
+
const pages = {
|
|
197
|
+
'https://example.com/': `<a href="/account/profile">Profile</a><a href="/products/foo">Foo</a>`,
|
|
198
|
+
'https://example.com/products/foo': '<html>foo</html>',
|
|
199
|
+
}
|
|
200
|
+
const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
|
|
201
|
+
const loadHtml = makeFakeLoadHtml()
|
|
202
|
+
const r = await fallbackCrawl({
|
|
203
|
+
rootUrl: 'https://example.com/',
|
|
204
|
+
fetcher,
|
|
205
|
+
loadHtml,
|
|
206
|
+
maxDepth: 2,
|
|
207
|
+
maxPages: 50,
|
|
208
|
+
blocklistPrefixes,
|
|
209
|
+
sitemapUrl: null,
|
|
210
|
+
})
|
|
211
|
+
const urls = r.results.map((p) => p.url).sort()
|
|
212
|
+
// /account/profile must NOT appear
|
|
213
|
+
assert.deepEqual(urls, ['https://example.com/', 'https://example.com/products/foo'])
|
|
214
|
+
})
|
|
215
|
+
|
|
216
|
+
await test('warns on per-page fetch failures but keeps other pages', async () => {
|
|
217
|
+
const pages = {
|
|
218
|
+
'https://example.com/': `<a href="/ok">OK</a><a href="/broken">Broken</a>`,
|
|
219
|
+
'https://example.com/ok': '<html>ok</html>',
|
|
220
|
+
// /broken will 500
|
|
221
|
+
}
|
|
222
|
+
const fetcher = async (url) => {
|
|
223
|
+
if (url === 'https://example.com/broken') return { ok: false, status: 500 }
|
|
224
|
+
if (pages[url] != null) return { ok: true, status: 200, text: pages[url] }
|
|
225
|
+
return { ok: false, status: 404 }
|
|
226
|
+
}
|
|
227
|
+
const loadHtml = makeFakeLoadHtml()
|
|
228
|
+
const r = await fallbackCrawl({
|
|
229
|
+
rootUrl: 'https://example.com/',
|
|
230
|
+
fetcher,
|
|
231
|
+
loadHtml,
|
|
232
|
+
maxDepth: 2,
|
|
233
|
+
maxPages: 50,
|
|
234
|
+
blocklistPrefixes,
|
|
235
|
+
sitemapUrl: null,
|
|
236
|
+
})
|
|
237
|
+
const urls = r.results.map((p) => p.url).sort()
|
|
238
|
+
assert.deepEqual(urls, ['https://example.com/', 'https://example.com/ok'])
|
|
239
|
+
assert.ok(r.warnings.some((w) => w.startsWith('crawl-fetch-failed:https://example.com/broken')))
|
|
240
|
+
})
|
|
241
|
+
|
|
242
|
+
process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
|
|
243
|
+
process.exit(failed === 0 ? 0 : 1)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Tests for lib/page-classifier.mjs — pure unit tests.
|
|
3
|
+
|
|
4
|
+
import { strict as assert } from 'node:assert'
|
|
5
|
+
import { classifyUrl } from '../lib/page-classifier.mjs'
|
|
6
|
+
|
|
7
|
+
let passed = 0
|
|
8
|
+
let failed = 0
|
|
9
|
+
|
|
10
|
+
function test(name, fn) {
|
|
11
|
+
try {
|
|
12
|
+
fn()
|
|
13
|
+
process.stdout.write(`ok - ${name}\n`)
|
|
14
|
+
passed++
|
|
15
|
+
} catch (err) {
|
|
16
|
+
process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
|
|
17
|
+
failed++
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Home
|
|
22
|
+
test('classifies bare root as home', () => {
|
|
23
|
+
assert.equal(classifyUrl('https://example.com/'), 'home')
|
|
24
|
+
assert.equal(classifyUrl('https://example.com'), 'home')
|
|
25
|
+
})
|
|
26
|
+
|
|
27
|
+
// PDP
|
|
28
|
+
test('classifies Shopify /products/ as pdp', () => {
|
|
29
|
+
assert.equal(classifyUrl('https://example.com/products/foo-bar'), 'pdp')
|
|
30
|
+
assert.equal(classifyUrl('https://example.com/products/foo-bar/'), 'pdp')
|
|
31
|
+
})
|
|
32
|
+
|
|
33
|
+
test('classifies Woo-style /product/ as pdp', () => {
|
|
34
|
+
assert.equal(classifyUrl('https://example.com/product/widget'), 'pdp')
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
// Collection
|
|
38
|
+
test('classifies /collections/ as collection', () => {
|
|
39
|
+
assert.equal(classifyUrl('https://example.com/collections/men'), 'collection')
|
|
40
|
+
})
|
|
41
|
+
|
|
42
|
+
test('classifies /product-category/ as collection', () => {
|
|
43
|
+
assert.equal(classifyUrl('https://example.com/product-category/shoes'), 'collection')
|
|
44
|
+
})
|
|
45
|
+
|
|
46
|
+
test('classifies /category/ as collection', () => {
|
|
47
|
+
assert.equal(classifyUrl('https://example.com/category/dresses'), 'collection')
|
|
48
|
+
})
|
|
49
|
+
|
|
50
|
+
// Contact
|
|
51
|
+
test('classifies /pages/contact-us as contact', () => {
|
|
52
|
+
assert.equal(classifyUrl('https://example.com/pages/contact-us'), 'contact')
|
|
53
|
+
})
|
|
54
|
+
|
|
55
|
+
test('classifies /contact as contact', () => {
|
|
56
|
+
assert.equal(classifyUrl('https://example.com/contact'), 'contact')
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
// About
|
|
60
|
+
test('classifies /pages/about-us as about', () => {
|
|
61
|
+
assert.equal(classifyUrl('https://example.com/pages/about-us'), 'about')
|
|
62
|
+
})
|
|
63
|
+
|
|
64
|
+
test('classifies /about as about', () => {
|
|
65
|
+
assert.equal(classifyUrl('https://example.com/about'), 'about')
|
|
66
|
+
})
|
|
67
|
+
|
|
68
|
+
// Policy
|
|
69
|
+
test('classifies /policies/privacy-policy as policy', () => {
|
|
70
|
+
assert.equal(classifyUrl('https://example.com/policies/privacy-policy'), 'policy')
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
test('classifies /pages/terms-of-service as policy', () => {
|
|
74
|
+
assert.equal(classifyUrl('https://example.com/pages/terms-of-service'), 'policy')
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
test('classifies /pages/refund-policy as policy', () => {
|
|
78
|
+
assert.equal(classifyUrl('https://example.com/pages/refund-policy'), 'policy')
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
// Blog
|
|
82
|
+
test('classifies /blogs/news/post-slug as blog', () => {
|
|
83
|
+
assert.equal(classifyUrl('https://example.com/blogs/news/post-slug'), 'blog')
|
|
84
|
+
})
|
|
85
|
+
|
|
86
|
+
test('classifies WordPress /blog/post-slug as blog', () => {
|
|
87
|
+
assert.equal(classifyUrl('https://example.com/blog/hello-world'), 'blog')
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
test('classifies /posts/foo as blog', () => {
|
|
91
|
+
assert.equal(classifyUrl('https://example.com/posts/foo'), 'blog')
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
// Other
|
|
95
|
+
test('classifies generic /random-path as other', () => {
|
|
96
|
+
assert.equal(classifyUrl('https://example.com/random-path'), 'other')
|
|
97
|
+
})
|
|
98
|
+
|
|
99
|
+
test('classifies /pages/landing-2024 as other (no specific match)', () => {
|
|
100
|
+
assert.equal(classifyUrl('https://example.com/pages/landing-2024'), 'other')
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
// Edge cases
|
|
104
|
+
test('handles invalid URL string by treating it as a path', () => {
|
|
105
|
+
assert.equal(classifyUrl('/products/foo'), 'pdp')
|
|
106
|
+
assert.equal(classifyUrl('/'), 'home')
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
test('is case-insensitive for paths', () => {
|
|
110
|
+
assert.equal(classifyUrl('https://example.com/PRODUCTS/foo'), 'pdp')
|
|
111
|
+
assert.equal(classifyUrl('https://example.com/Pages/Contact'), 'contact')
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('does not misclassify /products-old as pdp (no slash boundary)', () => {
|
|
115
|
+
// /products-old is "other" because the rule requires /products/ with a slash
|
|
116
|
+
assert.equal(classifyUrl('https://example.com/products-old'), 'other')
|
|
117
|
+
})
|
|
118
|
+
|
|
119
|
+
process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
|
|
120
|
+
process.exit(failed === 0 ? 0 : 1)
|