similarbuild 0.2.0 → 0.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "similarbuild",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "Visual migration framework for Claude Code — clone a live page, get a paste-ready WordPress/Elementor or Shopify section file, validated and auto-corrected.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -29,7 +29,7 @@ Optional:
29
29
  --wait-strategy <name> lazy-load (default) | auto | kaching-bundles | judge-me
30
30
  --max-depth <n> DOM walk max depth (default 8).
31
31
  --max-children <n> Max children kept per node (default 60).
32
- --max-text <n> Max chars of direct text per node (default 240).
32
+ --max-text <n> Max chars of direct text per node (default 0 = no truncation; pass a positive integer to cap).
33
33
  --timeout <ms> Per-step timeout (default 30000).
34
34
  --help Show this message.
35
35
 
@@ -55,7 +55,7 @@ const { values } = parseArgs({
55
55
  'output-dir': { type: 'string' },
56
56
  'max-depth': { type: 'string', default: '8' },
57
57
  'max-children': { type: 'string', default: '60' },
58
- 'max-text': { type: 'string', default: '240' },
58
+ 'max-text': { type: 'string', default: '0' },
59
59
  timeout: { type: 'string', default: '30000' },
60
60
  help: { type: 'boolean', default: false },
61
61
  },
@@ -960,6 +960,12 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
960
960
  // iframes (Klaviyo embeds, recaptcha) are recorded as opaque rectangles.
961
961
  let shadowDOMTraversed = false
962
962
  let shadowRootCount = 0
963
+ // §V03-C — counts hosts whose shadow tree was successfully re-serialized
964
+ // via getHTML+parseHTMLUnsafe and re-walked into a flattened light-DOM
965
+ // representation. 0 means the post-render shadow-flatten phase was either
966
+ // unavailable, skipped (e.g. the re-walk was lossy), failed with an error, or hit zero open roots.
967
+ let shadowSerializedHostCount = 0
968
+ const warnings = []
963
969
  const externalIframes = []
964
970
  function classifyIframePurpose(src) {
965
971
  if (!src) return null
@@ -978,7 +984,7 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
978
984
  function classifySection(el) {
979
985
  const cls = (typeof el.className === 'string' ? el.className : '').toLowerCase()
980
986
  const tag = el.tagName.toLowerCase()
981
- const id = (el.id || '').toLowerCase()
987
+ const id = (typeof el.id === 'string' ? el.id : '').toLowerCase()
982
988
  const blob = `${tag} ${cls} ${id}`
983
989
  if (/\bhero\b/.test(blob)) return 'hero'
984
990
  if (/\bbanner\b/.test(blob)) return 'banner'
@@ -1091,7 +1097,10 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
1091
1097
  if (n.nodeType === 3) t += n.nodeValue
1092
1098
  }
1093
1099
  t = t.replace(/\s+/g, ' ').trim()
1094
- if (t.length > maxText) t = `${t.slice(0, maxText)}…`
1100
+ // §V03-B maxText === 0 means "no truncation" (the default as of 0.2.2).
1101
+ // Pre-v0.3.0 default was 240, which silently clipped policy paragraphs
1102
+ // (privacy/terms) at ~25-36% coverage. The cap is now opt-in via flag.
1103
+ if (maxText > 0 && t.length > maxText) t = `${t.slice(0, maxText)}…`
1095
1104
  return t
1096
1105
  }
1097
1106
 
@@ -1107,7 +1116,7 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
1107
1116
  const node = {
1108
1117
  tag: el.tagName.toLowerCase(),
1109
1118
  classes: typeof el.className === 'string' ? el.className.split(/\s+/).filter(Boolean) : [],
1110
- id: el.id || null,
1119
+ id: typeof el.id === 'string' && el.id ? el.id : null,
1111
1120
  attrs: {},
1112
1121
  text: directText(el),
1113
1122
  bbox: bbox(el),
@@ -1239,7 +1248,7 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
1239
1248
  // pseudo-elements, so this filters automatically.
1240
1249
  const pseudoElements = []
1241
1250
  function pseudoSelectorFor(el) {
1242
- if (el.id) return `#${el.id}`
1251
+ if (typeof el.id === 'string' && el.id) return `#${el.id}`
1243
1252
  const cls = typeof el.className === 'string' ? el.className.trim().split(/\s+/).filter(Boolean) : []
1244
1253
  if (cls.length) return `${el.tagName.toLowerCase()}.${cls.slice(0, 3).join('.')}`
1245
1254
  return el.tagName.toLowerCase()
@@ -1406,15 +1415,138 @@ function extractInPage({ selector, maxDepth, maxChildren, maxText }) {
1406
1415
  const dom = [walk(root, 0)].filter(Boolean)
1407
1416
  const { sectionType, sectionBoundingBox } = findSectionAndBox(root, !!selector)
1408
1417
 
1418
+ // §V03-C — Shadow DOM serialization via getHTML + parseHTMLUnsafe.
1419
+ // walk() above only sees light DOM and reads `.shadowRoot.children` directly.
1420
+ // Custom elements whose shadow tree is populated by JS (Shopify
1421
+ // <x-product-form>, <price-list>, <variant-radios>, <store-footer-menu>)
1422
+ // expose content that the children-only walker can technically read,
1423
+ // but `<slot>`-projected content and content composed from declarative
1424
+ // shadow DOM get fragmented. getHTML({ serializableShadowRoots: true })
1425
+ // emits a single HTML string with `<template shadowrootmode="open">`
1426
+ // declarations inline; parseHTMLUnsafe re-attaches those as live shadow
1427
+ // roots in a parsed document, which the same walk() can then flatten
1428
+ // uniformly. Layout (bbox/computedStyle) on the parsed doc is detached
1429
+ // and returns UA defaults — that's an acknowledged trade-off; the
1430
+ // structural content (tags, classes, attrs, text, src) is what makes
1431
+ // PDP gallery/price/variants visible to the composer downstream.
1432
+ const originalShadowRootCount = shadowRootCount
1433
+ const originalShadowDOMTraversed = shadowDOMTraversed
1434
+ // §V03-C — preserve a snapshot of the live-walker dom[] before any
1435
+ // potential substitution. If the re-walk replaces dom[] with the
1436
+ // shadow-flattened tree (which carries detached parsedDoc bboxes
1437
+ // === {0,0,0,0}), downstream consumers that need real layout (e.g.
1438
+ // /build-site Step 3.5e --crop-live-bbox for header/footer compare)
1439
+ // can fall back to domLive. When substitution doesn't fire, domLive
1440
+ // is left null and consumers use dom[] as-is.
1441
+ let domLive = null
1442
+ try {
1443
+ if (typeof document.documentElement.getHTML === 'function' &&
1444
+ typeof Document.parseHTMLUnsafe === 'function') {
1445
+ const hostsSeen = new Set()
1446
+ const hostsCollected = []
1447
+ function collectHostsFrom(el) {
1448
+ if (!el) return
1449
+ if (el.shadowRoot && !hostsSeen.has(el)) {
1450
+ hostsSeen.add(el)
1451
+ hostsCollected.push(el)
1452
+ for (const child of el.shadowRoot.children) collectHostsFrom(child)
1453
+ }
1454
+ for (const child of el.children) collectHostsFrom(child)
1455
+ }
1456
+ collectHostsFrom(document.documentElement)
1457
+
1458
+ if (hostsCollected.length > 0) {
1459
+ const html = document.documentElement.getHTML({
1460
+ serializableShadowRoots: true,
1461
+ shadowRoots: hostsCollected.map((h) => h.shadowRoot),
1462
+ })
1463
+ const parsedDoc = Document.parseHTMLUnsafe(html)
1464
+ const parsedRoot = selector
1465
+ ? parsedDoc.querySelector(selector)
1466
+ : parsedDoc.body
1467
+ if (parsedRoot) {
1468
+ const flattened = walk(parsedRoot, 0)
1469
+ // §V03-C safety guard: only substitute the live dom[] if the
1470
+ // re-walk did not lose value on EITHER axis (nodes or aggregate
1471
+ // text chars). parseHTMLUnsafe returns a detached doc;
1472
+ // getComputedStyle/getBoundingClientRect degrade to UA defaults
1473
+ // (zero bbox, empty computed). On pages where shadow flattening
1474
+ // adds value (Shopify PDPs with populated custom elements) the
1475
+ // gain is large; on pages without that workload (policies,
1476
+ // plain HTML), the re-walk can lose content because hydrated
1477
+ // shadow content visible to the live walker doesn't reproduce
1478
+ // on the detached tree. When that happens, keep the live
1479
+ // walker result and surface a warning.
1480
+ function measureTree(arr) {
1481
+ let nodes = 0
1482
+ let textChars = 0
1483
+ const stack = Array.isArray(arr) ? [...arr] : [arr]
1484
+ while (stack.length) {
1485
+ const node = stack.pop()
1486
+ if (!node || typeof node !== 'object') continue
1487
+ nodes++
1488
+ if (typeof node.text === 'string') textChars += node.text.length
1489
+ if (Array.isArray(node.children)) {
1490
+ for (const c of node.children) stack.push(c)
1491
+ }
1492
+ }
1493
+ return { nodes, textChars }
1494
+ }
1495
+ if (flattened) {
1496
+ const orig = measureTree(dom)
1497
+ const flat = measureTree([flattened])
1498
+ if (flat.nodes >= orig.nodes && flat.textChars >= orig.textChars) {
1499
+ // Snapshot the live walker result BEFORE substitution so
1500
+ // downstream consumers that depend on real layout (bbox
1501
+ // values are zero on the parsed detached doc) can fall
1502
+ // back when needed — see §V03-C domLive comment above.
1503
+ domLive = dom.length === 1 ? dom[0] : [...dom]
1504
+ dom.length = 0
1505
+ dom.push(flattened)
1506
+ shadowSerializedHostCount = hostsCollected.length
1507
+ } else {
1508
+ warnings.push({
1509
+ code: 'shadow-flatten-skipped-lossy',
1510
+ message: `re-walk lossy: nodes ${flat.nodes} vs ${orig.nodes}, textChars ${flat.textChars} vs ${orig.textChars}; keeping live walker result`,
1511
+ })
1512
+ }
1513
+ }
1514
+ // The re-walk of the parsed (detached) doc re-discovers shadow
1515
+ // roots that parseHTMLUnsafe re-attached, so it would
1516
+ // double-count if we let those increments leak. Restore the
1517
+ // canonical live counts here.
1518
+ shadowRootCount = originalShadowRootCount
1519
+ shadowDOMTraversed = originalShadowDOMTraversed
1520
+ }
1521
+ }
1522
+ } else {
1523
+ warnings.push({
1524
+ code: 'shadow-serialize-unavailable',
1525
+ message:
1526
+ 'getHTML or Document.parseHTMLUnsafe not available; shadow DOM falls back to .shadowRoot.children walk (v0.2.x behavior)',
1527
+ })
1528
+ }
1529
+ } catch (err) {
1530
+ shadowRootCount = originalShadowRootCount
1531
+ shadowDOMTraversed = originalShadowDOMTraversed
1532
+ warnings.push({
1533
+ code: 'shadow-serialize-failed',
1534
+ message: String(err && err.message ? err.message : err),
1535
+ })
1536
+ }
1537
+
1409
1538
  return {
1410
1539
  sectionType,
1411
1540
  sectionBoundingBox,
1412
1541
  tokens,
1413
1542
  dom,
1543
+ domLive,
1414
1544
  pseudoElements,
1415
1545
  imgUrls,
1416
1546
  shadowDOMTraversed,
1417
1547
  shadowRootCount,
1548
+ shadowSerializedHostCount,
1549
+ warnings,
1418
1550
  externalIframes,
1419
1551
  }
1420
1552
  }
@@ -37,6 +37,24 @@ test('--help exits 0 and prints usage', () => {
37
37
  assert.match(r.stdout, /--wait-strategy/)
38
38
  })
39
39
 
40
+ // §V03-B — the --max-text default changed from 240 (in 0.2.0) to 0 (no truncation) as of 0.2.2.
41
+ // This regression-protects the policy-page coverage fix.
42
+ test('--help documents --max-text default 0 (no truncation)', () => {
43
+ const r = spawnSync('node', [SCRIPT, '--help'], { encoding: 'utf8' })
44
+ assert.equal(r.status, 0, `exit code was ${r.status}`)
45
+ assert.match(r.stdout, /--max-text/, '--max-text flag must appear in help')
46
+ // Scope the "default 0" assertion to the --max-text line specifically.
47
+ // The previous /default\s+0/ would match against any other flag whose
48
+ // description happens to contain "default 0" (e.g. a future flag whose
49
+ // default is "0ms" or similar).
50
+ assert.match(
51
+ r.stdout,
52
+ /--max-text[^\n]*default\s+0/,
53
+ '--max-text line must document default 0',
54
+ )
55
+ assert.match(r.stdout, /no truncation/i, 'help must explain semantics')
56
+ })
57
+
40
58
  test('missing --url exits 2', () => {
41
59
  const r = spawnSync('node', [SCRIPT, '--output-dir', '/tmp/sb-test'], { encoding: 'utf8' })
42
60
  assert.equal(r.status, 2, `exit code was ${r.status}`)