prism-design 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +292 -0
- package/LICENSE +21 -0
- package/README.md +203 -0
- package/bin/clone-architect.mjs +476 -0
- package/bin/prism.mjs +467 -0
- package/catalog/index.json +1155 -0
- package/extractions/airbnb.com/DESIGN.md +1068 -0
- package/extractions/airbnb.com/tokens.json +507 -0
- package/extractions/attio.com/DESIGN.md +1295 -0
- package/extractions/attio.com/tokens.json +438 -0
- package/extractions/auroxdashboard.com/DESIGN.md +724 -0
- package/extractions/auroxdashboard.com/tokens.json +195 -0
- package/extractions/careerexplorer.com/DESIGN.md +1178 -0
- package/extractions/careerexplorer.com/tokens.json +141 -0
- package/extractions/chance.co/DESIGN.md +1209 -0
- package/extractions/chance.co/tokens.json +160 -0
- package/extractions/choisis-ton-avenir.com/DESIGN.md +1265 -0
- package/extractions/choisis-ton-avenir.com/tokens.json +227 -0
- package/extractions/example.com/DESIGN.md +436 -0
- package/extractions/example.com/tokens.json +91 -0
- package/extractions/getdesign.md/DESIGN.md +1009 -0
- package/extractions/getdesign.md/tokens.json +219 -0
- package/extractions/github.com/DESIGN.md +1130 -0
- package/extractions/github.com/tokens.json +2092 -0
- package/extractions/hello-charly.com/DESIGN.md +1146 -0
- package/extractions/hello-charly.com/tokens.json +322 -0
- package/extractions/hyperliquid.xyz/DESIGN.md +779 -0
- package/extractions/hyperliquid.xyz/tokens.json +598 -0
- package/extractions/instagram.com/DESIGN.md +996 -0
- package/extractions/instagram.com/tokens.json +1240 -0
- package/extractions/jobirl.com/DESIGN.md +1160 -0
- package/extractions/jobirl.com/tokens.json +139 -0
- package/extractions/life360.com/DESIGN.md +1133 -0
- package/extractions/life360.com/tokens.json +491 -0
- package/extractions/lifesum.com/DESIGN.md +965 -0
- package/extractions/lifesum.com/tokens.json +170 -0
- package/extractions/linear.app/DESIGN.md +1301 -0
- package/extractions/linear.app/tokens.json +732 -0
- package/extractions/mavoie.org/DESIGN.md +1148 -0
- package/extractions/mavoie.org/tokens.json +128 -0
- package/extractions/miro.com/DESIGN.md +1237 -0
- package/extractions/miro.com/tokens.json +401 -0
- package/extractions/notion.so/DESIGN.md +1319 -0
- package/extractions/notion.so/tokens.json +906 -0
- package/extractions/onetonline.org/DESIGN.md +909 -0
- package/extractions/onetonline.org/tokens.json +280 -0
- package/extractions/posthog.com/DESIGN.md +1024 -0
- package/extractions/posthog.com/tokens.json +197 -0
- package/extractions/revolut.com/DESIGN.md +1080 -0
- package/extractions/revolut.com/tokens.json +401 -0
- package/extractions/stripe.com/DESIGN.md +1272 -0
- package/extractions/stripe.com/tokens.json +794 -0
- package/extractions/switchcollective.com/DESIGN.md +1040 -0
- package/extractions/switchcollective.com/tokens.json +98 -0
- package/extractions/truity.com/DESIGN.md +970 -0
- package/extractions/truity.com/tokens.json +166 -0
- package/extractions/uniquekicks.be/DESIGN.md +1171 -0
- package/extractions/uniquekicks.be/tokens.json +237 -0
- package/package.json +122 -0
- package/scripts/analyze.ts +281 -0
- package/scripts/bank-register.ts +379 -0
- package/scripts/bank.ts +374 -0
- package/scripts/browser-stealth.ts +189 -0
- package/scripts/clone.ts +198 -0
- package/scripts/compare-vs-gd-final.ts +273 -0
- package/scripts/compare-vs-gd.ts +269 -0
- package/scripts/compare.ts +405 -0
- package/scripts/deploy-site.ts +181 -0
- package/scripts/diff-snapshots.ts +340 -0
- package/scripts/enrich-catalog.ts +212 -0
- package/scripts/extract.ts +2038 -0
- package/scripts/extractors/advanced.ts +524 -0
- package/scripts/extractors/widgets.ts +711 -0
- package/scripts/generate-design-md.ts +5775 -0
- package/scripts/generate-final-pdf.ts +274 -0
- package/scripts/generate-og-image.ts +87 -0
- package/scripts/generate-showcase.ts +1588 -0
- package/scripts/generate-site.ts +847 -0
- package/scripts/mass-extract.sh +91 -0
- package/scripts/post-process-all.sh +55 -0
- package/scripts/regen-catalog.ts +203 -0
- package/scripts/shared/cache.ts +149 -0
- package/scripts/shared/css-helpers.ts +263 -0
- package/scripts/shared/logger.ts +57 -0
- package/scripts/shared/named-colors.ts +355 -0
- package/scripts/shared/types.ts +220 -0
- package/scripts/sync-catalog.ts +105 -0
- package/scripts/tokenize.ts +988 -0
- package/templates/layout-template.md +52 -0
- package/templates/tokens-template.json +34 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/bin/bash
# Mass extraction orchestrator — 71 GD brands, processed in parallel batches of 3.
# Tier 1 (easy) → Tier 2 (medium) → Tier 3 (risky)
# Per-extraction timeout: 240s (a brand that times out is recorded as failed and skipped)
# Output: logs/mass-<TS>/{success,failed,timing}.txt

set -u

# Project root. Every path below (logs/, scripts/) is relative to it, so abort
# rather than keep running from the wrong directory when it cannot be entered.
# Override with CLONE_ARCHITECT_ROOT; the default is the historical location.
PROJECT_ROOT="${CLONE_ARCHITECT_ROOT:-/home/paul/clone-architect}"
cd "$PROJECT_ROOT" || { echo "Cannot cd to $PROJECT_ROOT" >&2; exit 1; }

TS=$(date +%Y%m%d-%H%M)
LOGDIR="logs/mass-$TS"
mkdir -p "$LOGDIR" || { echo "Cannot create $LOGDIR" >&2; exit 1; }
SUCCESS="$LOGDIR/success.txt"
FAILED="$LOGDIR/failed.txt"
TIMING="$LOGDIR/timing.txt"
# Create the result files up front so the final summary can read them even
# when a category stays empty.
touch "$SUCCESS" "$FAILED" "$TIMING"

# === Tier 1 — Easy wins (33 brands) ===
TIER1=(claude.com clay.com cohere.com composio.dev expo.dev hashicorp.com mastercard.com replicate.com sanity.io sentry.io superhuman.com voltagent.dev warp.dev wise.com x.ai zapier.com airtable.com elevenlabs.io pinterest.com runwayml.com cal.com clickhouse.com intercom.com lovable.dev mintlify.com mistral.ai opencode.ai posthog.com resend.com supabase.com together.ai raycast.com linear.app)

# === Tier 2 — Medium (24 brands) ===
TIER2=(bmw.com bmwm.com bugatti.com ferrari.com hp.com lamborghini.com meta.com playstation.com renault.com spacex.com tesla.com uber.com wired.com airbnb.com apple.com cursor.com framer.com ibm.com miro.com mongodb.com nvidia.com ollama.com shopify.com spotify.com)

# === Tier 3 — Risky (14 brands, anti-bot) ===
TIER3=(binance.com coinbase.com kraken.com nike.com notion.so revolut.com starbucks.com stripe.com theverge.com vercel.com vodafone.com webflow.com minimax.io figma.com)
|
|
25
|
+
|
|
26
|
+
# Run the extractor for one domain with a 240s timeout.
# Appends the domain to $SUCCESS or $FAILED, records the elapsed seconds in
# $TIMING, and writes the extractor's combined stdout/stderr to $LOGDIR/<domain>.log.
extract_one() {
  local domain=$1
  local url

  # bmwm.com is only a catalog label; the site to extract is www.bmw-m.com.
  case "$domain" in
    bmwm.com) url="https://www.bmw-m.com" ;;
    *)        url="https://$domain" ;;
  esac

  local started_at
  started_at=$(date +%s)

  local rc=0
  timeout 240s npx tsx scripts/extract.ts "$url" > "$LOGDIR/$domain.log" 2>&1 || rc=$?

  local elapsed=$(( $(date +%s) - started_at ))

  if [ "$rc" -ne 0 ]; then
    echo "$domain" >> "$FAILED"
    echo "$domain ${elapsed}s FAIL" >> "$TIMING"
    echo " ❌ $domain (${elapsed}s)"
    return
  fi

  echo "$domain" >> "$SUCCESS"
  echo "$domain $elapsed" >> "$TIMING"
  echo " ✅ $domain (${elapsed}s)"
}
|
|
46
|
+
|
|
47
|
+
# Launch up to three extractions as background jobs and block until all of
# them have finished. Empty or missing arguments are ignored, which is how a
# short final batch is handled.
run_batch_of_3() {
  local candidate
  for candidate in "${1:-}" "${2:-}" "${3:-}"; do
    if [ -n "$candidate" ]; then
      extract_one "$candidate" &
    fi
  done
  wait
}
|
|
54
|
+
|
|
55
|
+
# Process one tier: print a banner, then walk the domain list three at a time.
#   $1      display name of the tier
#   $2...   domains belonging to the tier
run_tier() {
  local tier_name=$1
  shift
  local tier=("$@")
  local total=${#tier[@]}
  local rule="═══════════════════════════════════════════════════"

  printf '%s\n' "$rule" "🚀 $tier_name — $total brands" "$rule"

  local offset
  for (( offset = 0; offset < total; offset += 3 )); do
    # The slice yields fewer than three items for the last batch;
    # run_batch_of_3 tolerates missing arguments.
    run_batch_of_3 "${tier[@]:offset:3}"
    # Free RAM between batches (best effort: any failure is ignored)
    sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
  done
}
|
|
73
|
+
|
|
74
|
+
echo "📁 Logs: $LOGDIR"
echo ""

# Tiers run strictly one after another, easiest first.
run_tier "TIER 1 — Easy" "${TIER1[@]}"
run_tier "TIER 2 — Medium" "${TIER2[@]}"
run_tier "TIER 3 — Risky" "${TIER3[@]}"

# Final report: one line per domain was appended to each result file.
summary_rule="═══════════════════════════════════════════════════"
success_count=$(wc -l < "$SUCCESS")
failed_count=$(wc -l < "$FAILED")

echo ""
echo "$summary_rule"
echo "📊 RÉSULTATS"
echo "$summary_rule"
echo " Success : $success_count brands"
echo " Failed : $failed_count brands"
echo ""
echo "Failed brands:"
sed 's/^/ ❌ /' "$FAILED" 2>/dev/null
echo ""
echo "Logs in: $LOGDIR"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/bin/bash
# Post-process all extractions.
#   Pass 1: for every extractions/<domain>/ that has raw-css.json, run
#           scripts/tokenize.ts and then scripts/generate-design-md.ts.
#   Pass 2: for every extractions/<domain>/ that has DESIGN.md, run
#           scripts/generate-showcase.ts.
# Per-domain output of each step goes to logs/postproc-<TS>/<domain>.<step>.log.

set -u

# Project root. Every path below is relative to it, so abort rather than keep
# running from the wrong directory. Override with CLONE_ARCHITECT_ROOT.
PROJECT_ROOT="${CLONE_ARCHITECT_ROOT:-/home/paul/clone-architect}"
cd "$PROJECT_ROOT" || { echo "Cannot cd to $PROJECT_ROOT" >&2; exit 1; }

# Without nullglob an empty extractions/ directory would iterate once over the
# literal pattern and report a bogus domain named "*".
shopt -s nullglob

LOGDIR="logs/postproc-$(date +%Y%m%d-%H%M)"
mkdir -p "$LOGDIR" || { echo "Cannot create $LOGDIR" >&2; exit 1; }

echo "🔧 Post-processing all extractions..."

SUCCESS=0
SKIPPED=0
FAILED=0

for dir in extractions/*/; do
  domain=$(basename "$dir")

  # raw-css.json is the input of the tokenizer; nothing to do without it.
  if [ ! -f "$dir/raw-css.json" ]; then
    SKIPPED=$((SKIPPED+1))
    echo " ⏭ $domain (no raw-css.json)"
    continue
  fi

  if ! npx tsx scripts/tokenize.ts "$domain" > "$LOGDIR/$domain.tok.log" 2>&1; then
    FAILED=$((FAILED+1))
    echo " ❌ $domain (tokenize failed)"
    continue
  fi

  # DESIGN.md generation only runs once tokenization has succeeded.
  if npx tsx scripts/generate-design-md.ts "$domain" > "$LOGDIR/$domain.dmd.log" 2>&1; then
    SUCCESS=$((SUCCESS+1))
    echo " ✅ $domain"
  else
    FAILED=$((FAILED+1))
    echo " ❌ $domain (design-md failed)"
  fi
done

echo ""
echo "📊 Tokenize + DESIGN.md: $SUCCESS OK · $FAILED FAIL · $SKIPPED SKIP"
echo ""
echo "🎨 Generating showcases..."

SHOW_OK=0
SHOW_FAIL=0
for dir in extractions/*/; do
  domain=$(basename "$dir")
  # Domains without a DESIGN.md are silently left out of the showcase pass.
  if [ -f "$dir/DESIGN.md" ]; then
    if npx tsx scripts/generate-showcase.ts "$domain" > "$LOGDIR/$domain.show.log" 2>&1; then
      SHOW_OK=$((SHOW_OK+1))
    else
      SHOW_FAIL=$((SHOW_FAIL+1))
    fi
  fi
done
echo "📊 Showcases: $SHOW_OK OK · $SHOW_FAIL FAIL"
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prism — Batch DESIGN.md Regeneration
|
|
3
|
+
*
|
|
4
|
+
* Regenerates DESIGN.md for all extractions that have both raw-css.json + tokens.json.
|
|
5
|
+
* Defensive: skips extractions with missing files (never process.exit on partial failure).
|
|
6
|
+
* Reports a final summary with scores.
|
|
7
|
+
*
|
|
8
|
+
* Usage: npx tsx scripts/regen-catalog.ts [--dry-run] [--domain <domain>]
|
|
9
|
+
* --dry-run Show what would be regenerated without writing files
|
|
10
|
+
* --domain Regenerate a single domain only
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { readdir, readFile, stat } from 'fs/promises';
|
|
14
|
+
import { existsSync } from 'fs';
|
|
15
|
+
import { join } from 'path';
|
|
16
|
+
import { spawnSync } from 'child_process';
|
|
17
|
+
|
|
18
|
+
/** Project root: the directory the script is launched from. */
const ROOT = process.cwd();
/** Directory holding one sub-directory per extracted domain. */
const EXTRACTIONS_DIR = join(ROOT, 'extractions');
/** Script spawned once per domain to (re)generate its DESIGN.md. */
const GENERATE_SCRIPT = join(ROOT, 'scripts', 'generate-design-md.ts');

/** True when `--dry-run` was passed: report only, write nothing. */
const isDryRun = process.argv.includes('--dry-run');

/**
 * Value of `--domain <domain>`, or null when the flag is absent.
 *
 * A `--domain` flag with no value (last argument, or followed by another
 * flag) is rejected instead of being treated as "no filter": silently
 * falling back would regenerate every extraction, the opposite of what the
 * user asked for.
 */
const domainFilter = (() => {
  const idx = process.argv.indexOf('--domain');
  if (idx < 0) return null;

  const value = process.argv[idx + 1];
  if (value === undefined || value === '' || value.startsWith('--')) {
    console.error('--domain requires a value, e.g. --domain stripe.com');
    process.exit(1);
  }
  return value;
})();
|
|
27
|
+
|
|
28
|
+
/** Outcome recorded for one extraction directory during a regeneration run. */
interface RegenResult {
  /** Name of the directory under `extractions/`. */
  domain: string;
  /** `ok` = regenerated (or would be, in dry-run); `skipped` = inputs missing; `error` = generator failed. */
  status: 'ok' | 'skipped' | 'error';
  /** Why the domain was skipped or failed; `'dry-run'` for dry-run entries. */
  reason?: string;
  /** Score parsed from the `completeness:` line of the generated DESIGN.md, when present. */
  completeness?: number;
  /** Line count of the generated DESIGN.md. */
  lines?: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Read the completeness score out of a generated DESIGN.md.
 *
 * @param designMdPath - Path of the DESIGN.md file to inspect.
 * @returns The integer following the first line that starts with
 *   `completeness:`, or null when the file cannot be read or has no such line.
 */
async function getCompleteness(designMdPath: string): Promise<number | null> {
  let text: string;
  try {
    text = await readFile(designMdPath, 'utf-8');
  } catch {
    // Unreadable or missing file: there is simply no score to report.
    return null;
  }

  const found = /^completeness:\s*(\d+)/m.exec(text);
  if (found === null) return null;
  return parseInt(found[1]);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Count the lines of a DESIGN.md file.
 *
 * The count is the number of newline characters plus one, so a file that
 * ends with a newline reports one more than its number of text lines.
 *
 * @param designMdPath - Path of the file to measure.
 * @returns The line count, or 0 when the file cannot be read.
 */
async function getLineCount(designMdPath: string): Promise<number> {
  let text: string;
  try {
    text = await readFile(designMdPath, 'utf-8');
  } catch {
    return 0;
  }
  const newlines = text.match(/\n/g)?.length ?? 0;
  return newlines + 1;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Entry point: regenerate DESIGN.md for every eligible extraction.
 *
 * Steps:
 *  1. List the sub-directories of `extractions/` (optionally narrowed to `--domain`).
 *  2. Skip domains lacking raw-css.json or tokens.json.
 *  3. Spawn generate-design-md.ts for each remaining domain (30s timeout each)
 *     and read back the completeness score and line count.
 *  4. Print a summary and, outside dry-run, run enrich-catalog.ts when that
 *     script exists.
 *
 * Exits the process with code 1 when at least one domain failed, 0 otherwise.
 */
async function main() {
  console.log(`\n🔄 Prism — Batch DESIGN.md Regeneration`);
  if (isDryRun) console.log(` DRY RUN — no files will be written`);
  if (domainFilter) console.log(` Filter: ${domainFilter} only`);
  console.log('');

  // Discover all extraction directories
  let allDomains: string[] = [];
  try {
    const entries = await readdir(EXTRACTIONS_DIR);
    for (const entry of entries) {
      try {
        const info = await stat(join(EXTRACTIONS_DIR, entry));
        if (info.isDirectory()) allDomains.push(entry);
      } catch { /* entry vanished or is unreadable — skip */ }
    }
  } catch (err) {
    console.error('Cannot read extractions directory:', err);
    process.exit(1);
  }

  if (domainFilter) {
    allDomains = allDomains.filter(d => d === domainFilter);
    if (allDomains.length === 0) {
      console.error(`Domain "${domainFilter}" not found in extractions/`);
      process.exit(1);
    }
  }

  allDomains.sort();
  console.log(`Found ${allDomains.length} extraction(s) to process...\n`);

  // Resolve the tsx launcher once: prefer the project-local binary, otherwise
  // go through `npx tsx`. The answer is the same for every spawn below.
  const localTsx = join(ROOT, 'node_modules', '.bin', 'tsx');
  const hasLocalTsx = existsSync(localTsx);
  const tsxCmd = hasLocalTsx ? localTsx : 'npx';
  const tsxArgsFor = (script: string, ...rest: string[]): string[] =>
    hasLocalTsx ? [script, ...rest] : ['tsx', script, ...rest];

  const results: RegenResult[] = [];
  let okCount = 0;
  let skipCount = 0;
  let errorCount = 0;

  for (const domain of allDomains) {
    const extractionDir = join(EXTRACTIONS_DIR, domain);
    const rawCssPath = join(extractionDir, 'raw-css.json');
    const tokensPath = join(extractionDir, 'tokens.json');
    const designMdPath = join(extractionDir, 'DESIGN.md');

    // Skip if missing required input files
    if (!existsSync(rawCssPath)) {
      results.push({ domain, status: 'skipped', reason: 'no raw-css.json' });
      skipCount++;
      console.log(` ⏭️ ${domain} — skipped (no raw-css.json)`);
      continue;
    }
    if (!existsSync(tokensPath)) {
      results.push({ domain, status: 'skipped', reason: 'no tokens.json' });
      skipCount++;
      console.log(` ⏭️ ${domain} — skipped (no tokens.json)`);
      continue;
    }

    if (isDryRun) {
      console.log(` 🔲 ${domain} — would regenerate`);
      results.push({ domain, status: 'ok', reason: 'dry-run' });
      okCount++;
      continue;
    }

    process.stdout.write(` ⚙️ ${domain}... `);

    const result = spawnSync(tsxCmd, tsxArgsFor(GENERATE_SCRIPT, domain), {
      cwd: ROOT,
      encoding: 'utf-8',
      timeout: 30000, // 30s max per domain
    });

    if (result.error || result.status !== 0) {
      // Use the first non-blank stderr line: output that starts with an empty
      // line would otherwise hide the real message.
      const stderrLine = result.stderr?.split('\n').find(line => line.trim() !== '');
      const errMsg = stderrLine || result.error?.message || 'unknown error';
      console.log(`❌ ${errMsg.slice(0, 80)}`);
      results.push({ domain, status: 'error', reason: errMsg });
      errorCount++;
      continue;
    }

    // Read back completeness score
    const completeness = await getCompleteness(designMdPath);
    const lines = await getLineCount(designMdPath);

    const scoreDisplay = completeness !== null ? `${completeness}/100` : 'no score';
    console.log(`✅ ${lines}L | ${scoreDisplay}`);
    results.push({ domain, status: 'ok', completeness: completeness ?? undefined, lines });
    okCount++;
  }

  // ── Summary ──
  console.log('\n' + '═'.repeat(60));
  console.log(`📊 Regeneration Summary`);
  console.log(` ✅ OK: ${okCount} | ⏭️ Skipped: ${skipCount} | ❌ Errors: ${errorCount}`);

  if (!isDryRun) {
    // Keep only results that carry a score, with the score typed as a number.
    const scored = results.flatMap(r =>
      r.completeness !== undefined ? [{ domain: r.domain, completeness: r.completeness }] : []
    );
    if (scored.length > 0) {
      const scores = scored.map(r => r.completeness);
      const avgScore = Math.round(scores.reduce((sum, s) => sum + s, 0) / scores.length);
      const maxScore = Math.max(...scores);
      const minScore = Math.min(...scores);
      console.log(` Completeness: avg ${avgScore}/100 | best ${maxScore} | worst ${minScore}`);

      // Top 5 and bottom 5
      const sorted = [...scored].sort((a, b) => b.completeness - a.completeness);
      console.log(`\n Top 5:`);
      for (const r of sorted.slice(0, 5)) {
        console.log(` ${r.domain}: ${r.completeness}/100`);
      }
      if (sorted.length > 5) {
        console.log(`\n Needs improvement:`);
        for (const r of sorted.slice(-5).reverse()) {
          console.log(` ${r.domain}: ${r.completeness}/100`);
        }
      }
    }

    // Run catalog enrichment to update catalog/index.json with new scores
    console.log('\n🔧 Updating catalog/index.json with new scores...');
    const enrichScript = join(ROOT, 'scripts', 'enrich-catalog.ts');
    if (existsSync(enrichScript)) {
      const enrichResult = spawnSync(tsxCmd, tsxArgsFor(enrichScript), { stdio: 'inherit', cwd: ROOT });
      if (enrichResult.status !== 0) {
        console.warn('⚠️ catalog enrichment failed — run manually: npx tsx scripts/enrich-catalog.ts');
      }
    }
  }

  console.log('');
  process.exit(errorCount > 0 ? 1 : 0);
}

main().catch(err => {
  console.error('Fatal error in regen-catalog:', err);
  process.exit(1);
});
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* shared/cache.ts — Phase 2.3 re-extraction cache
|
|
3
|
+
*
|
|
4
|
+
* Skips Playwright extraction if the URL content hasn't materially changed since last run.
|
|
5
|
+
* Fingerprint = hash(URL + HTTP headers ETag/last-modified/content-length + algorithm version).
|
|
6
|
+
*
|
|
7
|
+
* Cache hit: ~2s (HEAD request only) vs full extraction ~140s = 99% time saved on re-runs.
|
|
8
|
+
*
|
|
9
|
+
* Safety:
|
|
10
|
+
* - Atomic writes (temp file + rename) prevent corruption on partial writes
|
|
11
|
+
* - Algorithm version baked into fingerprint = automatic bust when extractors change
|
|
12
|
+
* - Force flag bypasses cache for explicit re-extractions
|
|
13
|
+
* - TTL prevents stale fingerprints from blocking forever
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { createHash } from 'crypto';
|
|
17
|
+
import { existsSync, readFileSync, writeFileSync, renameSync, statSync } from 'fs';
|
|
18
|
+
import { join, dirname } from 'path';
|
|
19
|
+
import { mkdir } from 'fs/promises';
|
|
20
|
+
|
|
21
|
+
/**
 * Version tag mixed into every fingerprint. Bump it whenever the extraction
 * logic materially changes: stored fingerprints then stop matching and all
 * cached extractions are redone.
 */
export const CACHE_ALGORITHM_VERSION = '2026.05.26';

/** Verdict returned by the cache check. */
export interface CacheCheckResult {
  /** True when the existing extraction can be reused as-is. */
  skip: boolean;
  /** Human-readable explanation of the verdict. */
  reason: string;
  /** Fingerprint computed for the URL during the check, when one was obtained. */
  fingerprint?: string;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Decide whether extraction can be skipped for this URL.
 *
 * The URL is fingerprinted from a HEAD request and compared with the
 * fingerprint stored next to the previous extraction. `skip` is true only
 * when a prior extraction exists, its fingerprint file is within the TTL,
 * and both fingerprints are identical.
 *
 * Whenever a fresh fingerprint could be computed it is returned, including
 * on the TTL-expired and unreadable-file paths, so the caller can store it
 * after re-extracting. Without it an expired fingerprint file could never be
 * refreshed by a caller that only writes what it was handed.
 *
 * @param url - Target URL to potentially re-extract
 * @param domain - Resolved domain (extractions/<domain>/ dir)
 * @param opts - Options
 *   - force: bypass cache entirely (always extract, no HEAD request)
 *   - ttlDays: max age before re-extraction even on match (default 7)
 *   - cacheDir: extractions root (default 'extractions')
 * @returns The verdict, its reason and, when available, the fresh fingerprint.
 */
export async function shouldSkipExtraction(
  url: string,
  domain: string,
  opts: { force?: boolean; ttlDays?: number; cacheDir?: string } = {}
): Promise<CacheCheckResult> {
  if (opts.force) return { skip: false, reason: 'force flag set' };

  const cacheDir = opts.cacheDir || 'extractions';
  const ttlDays = opts.ttlDays ?? 7;
  const fpPath = join(cacheDir, domain, '.extraction-fingerprint');
  const rawCssPath = join(cacheDir, domain, 'raw-css.json');

  // Every remaining path wants the current fingerprint, so compute it once.
  // A failure is remembered rather than thrown: it only matters once we reach
  // the comparison step.
  let currentFp: string | undefined;
  let headFailure = '';
  try {
    currentFp = await computeUrlFingerprint(url);
  } catch (e) {
    headFailure = e instanceof Error ? e.message : String(e);
  }

  // No existing extraction → must run; hand back the fingerprint (if any) for
  // the post-extraction write.
  if (!existsSync(rawCssPath) || !existsSync(fpPath)) {
    return { skip: false, reason: 'no prior extraction or fingerprint', fingerprint: currentFp };
  }

  // TTL check, based on the modification time of the fingerprint file
  try {
    const ageMs = Date.now() - statSync(fpPath).mtimeMs;
    const ttlMs = ttlDays * 86400 * 1000;
    if (ageMs > ttlMs) {
      return { skip: false, reason: `fingerprint older than ${ttlDays}d TTL`, fingerprint: currentFp };
    }
  } catch {
    return { skip: false, reason: 'cannot stat fingerprint file', fingerprint: currentFp };
  }

  // Without a current fingerprint there is nothing to compare against.
  if (currentFp === undefined) {
    return { skip: false, reason: `HEAD request failed: ${headFailure}` };
  }

  // Compare with stored fingerprint
  let storedFp: string;
  try {
    storedFp = readFileSync(fpPath, 'utf-8').trim();
  } catch {
    return { skip: false, reason: 'cannot read stored fingerprint', fingerprint: currentFp };
  }

  if (storedFp === currentFp) {
    return { skip: true, reason: 'fingerprint match (unchanged)', fingerprint: currentFp };
  }
  return { skip: false, reason: 'fingerprint differs (content or version changed)', fingerprint: currentFp };
}
|
|
96
|
+
|
|
97
|
+
/**
 * Persist a fingerprint for `domain`.
 *
 * The value is first written to a uniquely named temporary file beside the
 * destination and then renamed over it, so a reader never sees a
 * half-written fingerprint.
 *
 * @param domain - Extraction directory name under the cache root
 * @param fingerprint - Fingerprint string to store
 * @param opts - Options
 *   - cacheDir: extractions root (default 'extractions')
 */
export async function writeFingerprint(
  domain: string,
  fingerprint: string,
  opts: { cacheDir?: string } = {}
): Promise<void> {
  const root = opts.cacheDir || 'extractions';
  const target = join(root, domain, '.extraction-fingerprint');
  // pid + timestamp keep concurrent writers from sharing a staging file
  const staging = [target, process.pid, Date.now(), 'tmp'].join('.');

  await mkdir(dirname(target), { recursive: true });
  writeFileSync(staging, fingerprint, 'utf-8');
  renameSync(staging, target); // atomic on POSIX filesystems
}
|
|
113
|
+
|
|
114
|
+
/**
 * Compute a fingerprint from URL + HTTP headers + algorithm version.
 * Strategy:
 *   1. HEAD request → get ETag / Last-Modified / Content-Length / Content-Type
 *   2. Hash(URL + headers + CACHE_ALGORITHM_VERSION)
 *   3. Returns hex string (40 chars sha1)
 *
 * Why include algorithm version: if extractors evolve (new fields, fixed bugs), cache
 * must invalidate even when content is identical.
 *
 * A response that is neither 2xx nor 405 contributes no headers, so its
 * fingerprint depends on the URL and version only.
 *
 * @param url - URL to probe
 * @returns The sha1 hex digest
 * @throws Error when the HEAD request itself fails (network error, timeout).
 *   Hashing empty headers in that case would yield a constant value that can
 *   match a stored fingerprint and wrongly report "unchanged"; callers
 *   already treat a rejection as "no fingerprint available".
 */
async function computeUrlFingerprint(url: string): Promise<string> {
  let etag = '';
  let lastMod = '';
  let contentLen = '';
  let contentType = '';

  try {
    const response = await fetch(url, {
      method: 'HEAD',
      redirect: 'follow',
      signal: AbortSignal.timeout(8000), // give up after 8s
    });
    if (response.ok || response.status === 405) {
      // Some sites reject HEAD (405) but headers may still be informative
      etag = response.headers.get('etag') || '';
      lastMod = response.headers.get('last-modified') || '';
      contentLen = response.headers.get('content-length') || '';
      contentType = response.headers.get('content-type') || '';
    }
  } catch (cause) {
    // Transport-level failure: nothing is known about the remote content.
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw new Error(`${url}: ${detail}`);
  }

  const material = [url, etag, lastMod, contentLen, contentType, CACHE_ALGORITHM_VERSION].join('|');
  return createHash('sha1').update(material).digest('hex');
}
|