prism-design 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/CHANGELOG.md +292 -0
  2. package/LICENSE +21 -0
  3. package/README.md +203 -0
  4. package/bin/clone-architect.mjs +476 -0
  5. package/bin/prism.mjs +467 -0
  6. package/catalog/index.json +1155 -0
  7. package/extractions/airbnb.com/DESIGN.md +1068 -0
  8. package/extractions/airbnb.com/tokens.json +507 -0
  9. package/extractions/attio.com/DESIGN.md +1295 -0
  10. package/extractions/attio.com/tokens.json +438 -0
  11. package/extractions/auroxdashboard.com/DESIGN.md +724 -0
  12. package/extractions/auroxdashboard.com/tokens.json +195 -0
  13. package/extractions/careerexplorer.com/DESIGN.md +1178 -0
  14. package/extractions/careerexplorer.com/tokens.json +141 -0
  15. package/extractions/chance.co/DESIGN.md +1209 -0
  16. package/extractions/chance.co/tokens.json +160 -0
  17. package/extractions/choisis-ton-avenir.com/DESIGN.md +1265 -0
  18. package/extractions/choisis-ton-avenir.com/tokens.json +227 -0
  19. package/extractions/example.com/DESIGN.md +436 -0
  20. package/extractions/example.com/tokens.json +91 -0
  21. package/extractions/getdesign.md/DESIGN.md +1009 -0
  22. package/extractions/getdesign.md/tokens.json +219 -0
  23. package/extractions/github.com/DESIGN.md +1130 -0
  24. package/extractions/github.com/tokens.json +2092 -0
  25. package/extractions/hello-charly.com/DESIGN.md +1146 -0
  26. package/extractions/hello-charly.com/tokens.json +322 -0
  27. package/extractions/hyperliquid.xyz/DESIGN.md +779 -0
  28. package/extractions/hyperliquid.xyz/tokens.json +598 -0
  29. package/extractions/instagram.com/DESIGN.md +996 -0
  30. package/extractions/instagram.com/tokens.json +1240 -0
  31. package/extractions/jobirl.com/DESIGN.md +1160 -0
  32. package/extractions/jobirl.com/tokens.json +139 -0
  33. package/extractions/life360.com/DESIGN.md +1133 -0
  34. package/extractions/life360.com/tokens.json +491 -0
  35. package/extractions/lifesum.com/DESIGN.md +965 -0
  36. package/extractions/lifesum.com/tokens.json +170 -0
  37. package/extractions/linear.app/DESIGN.md +1301 -0
  38. package/extractions/linear.app/tokens.json +732 -0
  39. package/extractions/mavoie.org/DESIGN.md +1148 -0
  40. package/extractions/mavoie.org/tokens.json +128 -0
  41. package/extractions/miro.com/DESIGN.md +1237 -0
  42. package/extractions/miro.com/tokens.json +401 -0
  43. package/extractions/notion.so/DESIGN.md +1319 -0
  44. package/extractions/notion.so/tokens.json +906 -0
  45. package/extractions/onetonline.org/DESIGN.md +909 -0
  46. package/extractions/onetonline.org/tokens.json +280 -0
  47. package/extractions/posthog.com/DESIGN.md +1024 -0
  48. package/extractions/posthog.com/tokens.json +197 -0
  49. package/extractions/revolut.com/DESIGN.md +1080 -0
  50. package/extractions/revolut.com/tokens.json +401 -0
  51. package/extractions/stripe.com/DESIGN.md +1272 -0
  52. package/extractions/stripe.com/tokens.json +794 -0
  53. package/extractions/switchcollective.com/DESIGN.md +1040 -0
  54. package/extractions/switchcollective.com/tokens.json +98 -0
  55. package/extractions/truity.com/DESIGN.md +970 -0
  56. package/extractions/truity.com/tokens.json +166 -0
  57. package/extractions/uniquekicks.be/DESIGN.md +1171 -0
  58. package/extractions/uniquekicks.be/tokens.json +237 -0
  59. package/package.json +122 -0
  60. package/scripts/analyze.ts +281 -0
  61. package/scripts/bank-register.ts +379 -0
  62. package/scripts/bank.ts +374 -0
  63. package/scripts/browser-stealth.ts +189 -0
  64. package/scripts/clone.ts +198 -0
  65. package/scripts/compare-vs-gd-final.ts +273 -0
  66. package/scripts/compare-vs-gd.ts +269 -0
  67. package/scripts/compare.ts +405 -0
  68. package/scripts/deploy-site.ts +181 -0
  69. package/scripts/diff-snapshots.ts +340 -0
  70. package/scripts/enrich-catalog.ts +212 -0
  71. package/scripts/extract.ts +2038 -0
  72. package/scripts/extractors/advanced.ts +524 -0
  73. package/scripts/extractors/widgets.ts +711 -0
  74. package/scripts/generate-design-md.ts +5775 -0
  75. package/scripts/generate-final-pdf.ts +274 -0
  76. package/scripts/generate-og-image.ts +87 -0
  77. package/scripts/generate-showcase.ts +1588 -0
  78. package/scripts/generate-site.ts +847 -0
  79. package/scripts/mass-extract.sh +91 -0
  80. package/scripts/post-process-all.sh +55 -0
  81. package/scripts/regen-catalog.ts +203 -0
  82. package/scripts/shared/cache.ts +149 -0
  83. package/scripts/shared/css-helpers.ts +263 -0
  84. package/scripts/shared/logger.ts +57 -0
  85. package/scripts/shared/named-colors.ts +355 -0
  86. package/scripts/shared/types.ts +220 -0
  87. package/scripts/sync-catalog.ts +105 -0
  88. package/scripts/tokenize.ts +988 -0
  89. package/templates/layout-template.md +52 -0
  90. package/templates/tokens-template.json +34 -0
@@ -0,0 +1,91 @@
1
+ #!/bin/bash
2
+ # Mass extraction orchestrator — 71 GD brands in parallel batches of 3
3
+ # Tier 1 (easy) → Tier 2 (medium) → Tier 3 (risky)
4
+ # Timeout: 240s per extraction (a brand that times out is recorded as failed and skipped)
5
+ # Output: logs/mass-<TS>/{success,failed,timing}.txt
6
+
7
set -u

# Run from the project root so the relative paths used below (scripts/, logs/)
# resolve. The root defaults to the parent of this script's directory and can
# be overridden with PRISM_ROOT. Abort when it cannot be entered rather than
# writing logs and running extractions from whatever directory we started in.
PROJECT_ROOT="${PRISM_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)}"
if [ -z "$PROJECT_ROOT" ] || ! cd "$PROJECT_ROOT"; then
  echo "Cannot cd to project root: ${PROJECT_ROOT:-<unresolved>}" >&2
  exit 1
fi

# One log directory per run, named after the start time (minute resolution).
TS=$(date +%Y%m%d-%H%M)
LOGDIR="logs/mass-$TS"
mkdir -p "$LOGDIR"

# Result files appended to by extract_one; created up front so the final
# summary can read them even when one of them stays empty.
SUCCESS="$LOGDIR/success.txt"
FAILED="$LOGDIR/failed.txt"
TIMING="$LOGDIR/timing.txt"
touch "$SUCCESS" "$FAILED" "$TIMING"
16
+
17
# === Tier 1 — Easy wins (33 brands) ===
TIER1=(
  claude.com clay.com cohere.com composio.dev expo.dev hashicorp.com
  mastercard.com replicate.com sanity.io sentry.io superhuman.com
  voltagent.dev warp.dev wise.com x.ai zapier.com airtable.com
  elevenlabs.io pinterest.com runwayml.com cal.com clickhouse.com
  intercom.com lovable.dev mintlify.com mistral.ai opencode.ai
  posthog.com resend.com supabase.com together.ai raycast.com linear.app
)

# === Tier 2 — Medium (24 brands) ===
TIER2=(
  bmw.com bmwm.com bugatti.com ferrari.com hp.com lamborghini.com
  meta.com playstation.com renault.com spacex.com tesla.com uber.com
  wired.com airbnb.com apple.com cursor.com framer.com ibm.com
  miro.com mongodb.com nvidia.com ollama.com shopify.com spotify.com
)

# === Tier 3 — Risky (14 brands, anti-bot) ===
TIER3=(
  binance.com coinbase.com kraken.com nike.com notion.so revolut.com
  starbucks.com stripe.com theverge.com vercel.com vodafone.com
  webflow.com minimax.io figma.com
)
25
+
26
# extract_one DOMAIN
#
# Runs scripts/extract.ts against https://DOMAIN with a 240s limit and records
# the outcome:
#   - all extractor output goes to $LOGDIR/DOMAIN.log
#   - DOMAIN is appended to $SUCCESS or $FAILED
#   - "DOMAIN <seconds>" (success) or "DOMAIN <seconds>s FAIL" (failure) is
#     appended to $TIMING
#   - a one-line status is printed to stdout
# Reads the globals LOGDIR, SUCCESS, FAILED and TIMING.
extract_one() {
  local domain=$1
  local url="https://$domain"
  # Declared separately from the assignments so `local` does not mask the
  # exit status of the command substitution.
  local start elapsed
  start=$(date +%s)

  # bmwm.com is not fetched as-is: the BMW M site at www.bmw-m.com is extracted
  # instead. Logs and result files still use the bmwm.com name.
  if [ "$domain" = "bmwm.com" ]; then
    url="https://www.bmw-m.com"
  fi

  # -k 10s: if the extractor is still alive 10s after the SIGTERM sent at 240s,
  # follow up with SIGKILL so a process that ignores TERM cannot stall the
  # `wait` of the whole batch.
  if timeout -k 10s 240s npx tsx scripts/extract.ts "$url" > "$LOGDIR/$domain.log" 2>&1; then
    elapsed=$(( $(date +%s) - start ))
    echo "$domain" >> "$SUCCESS"
    echo "$domain $elapsed" >> "$TIMING"
    echo " ✅ $domain (${elapsed}s)"
  else
    # Any non-zero status lands here, including 124 from `timeout` itself.
    elapsed=$(( $(date +%s) - start ))
    echo "$domain" >> "$FAILED"
    echo "$domain ${elapsed}s FAIL" >> "$TIMING"
    echo " ❌ $domain (${elapsed}s)"
  fi
}
46
+
47
# run_batch_of_3 [DOMAIN1] [DOMAIN2] [DOMAIN3]
#
# Starts extract_one in the background for each non-empty argument among the
# first three, then blocks until every background job has finished.
run_batch_of_3() {
  local slot
  for slot in "${1:-}" "${2:-}" "${3:-}"; do
    if [ -n "$slot" ]; then
      extract_one "$slot" &
    fi
  done
  wait
}
54
+
55
# run_tier NAME DOMAIN...
#
# Prints a banner with the tier name and brand count, then hands the domains to
# run_batch_of_3 three at a time, in order. The last batch is padded with empty
# strings when the count is not a multiple of three.
run_tier() {
  local tier_name=$1
  shift
  local -a tier=("$@")
  local total=${#tier[@]}

  echo "═══════════════════════════════════════════════════"
  echo "🚀 $tier_name — $total brands"
  echo "═══════════════════════════════════════════════════"

  local i
  for (( i = 0; i < total; i += 3 )); do
    run_batch_of_3 "${tier[i]:-}" "${tier[i+1]:-}" "${tier[i+2]:-}"

    # Free RAM between batches. Best effort only: `sudo -n` never prompts, so
    # when no cached or passwordless sudo is available the cache drop is
    # skipped instead of blocking an unattended run on a password prompt.
    sync && echo 3 | sudo -n tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
  done
}
73
+
74
# Announce where this run's logs are written.
echo "📁 Logs: $LOGDIR"
echo ""

# Tiers run strictly one after another, easiest first.
run_tier "TIER 1 — Easy" "${TIER1[@]}"
run_tier "TIER 2 — Medium" "${TIER2[@]}"
run_tier "TIER 3 — Risky" "${TIER3[@]}"

# Final report: counts come from the line counts of the result files, followed
# by the list of failed brands.
success_count=$(wc -l < "$SUCCESS")
failed_count=$(wc -l < "$FAILED")

echo ""
echo "═══════════════════════════════════════════════════"
echo "📊 RÉSULTATS"
echo "═══════════════════════════════════════════════════"
echo " Success : $success_count brands"
echo " Failed : $failed_count brands"
echo ""
echo "Failed brands:"
sed 's/^/ ❌ /' "$FAILED" 2>/dev/null
echo ""
echo "Logs in: $LOGDIR"
@@ -0,0 +1,55 @@
1
#!/bin/bash
# Post-process every extraction directory in two passes:
#   pass 1 — for each extractions/<domain>/ that has raw-css.json, run
#            tokenize.ts and then generate-design-md.ts
#   pass 2 — for each extractions/<domain>/ that has DESIGN.md, run
#            generate-showcase.ts
# Output of every step is captured under logs/postproc-<timestamp>/.
# This script does not update the catalog itself.

set -u

# Run from the project root so extractions/, scripts/ and logs/ resolve. The
# root defaults to the parent of this script's directory and can be overridden
# with PRISM_ROOT. Abort when it cannot be entered rather than post-processing
# whatever directory we started in.
PROJECT_ROOT="${PRISM_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)}"
if [ -z "$PROJECT_ROOT" ] || ! cd "$PROJECT_ROOT"; then
  echo "Cannot cd to project root: ${PROJECT_ROOT:-<unresolved>}" >&2
  exit 1
fi

# With nullglob an extractions/ directory without subdirectories produces zero
# iterations instead of one bogus iteration over the literal 'extractions/*/'.
shopt -s nullglob

LOGDIR="logs/postproc-$(date +%Y%m%d-%H%M)"
mkdir -p "$LOGDIR"

echo "🔧 Post-processing all extractions..."

SUCCESS=0
SKIPPED=0
FAILED=0

# Pass 1: tokenize, then generate DESIGN.md. A domain counts as OK only when
# both steps succeed; the second step is not attempted after a tokenize failure.
for dir in extractions/*/; do
  domain=$(basename "$dir")

  if [ ! -f "${dir}raw-css.json" ]; then
    SKIPPED=$((SKIPPED+1))
    echo " ⏭ $domain (no raw-css.json)"
    continue
  fi

  if ! npx tsx scripts/tokenize.ts "$domain" > "$LOGDIR/$domain.tok.log" 2>&1; then
    FAILED=$((FAILED+1))
    echo " ❌ $domain (tokenize failed)"
    continue
  fi

  if npx tsx scripts/generate-design-md.ts "$domain" > "$LOGDIR/$domain.dmd.log" 2>&1; then
    SUCCESS=$((SUCCESS+1))
    echo " ✅ $domain"
  else
    FAILED=$((FAILED+1))
    echo " ❌ $domain (design-md failed)"
  fi
done

echo ""
echo "📊 Tokenize + DESIGN.md: $SUCCESS OK · $FAILED FAIL · $SKIPPED SKIP"
echo ""
echo "🎨 Generating showcases..."

# Pass 2: showcases for every domain that has a DESIGN.md, including one left
# over from an earlier run. Only the totals are printed.
SHOW_OK=0
SHOW_FAIL=0
for dir in extractions/*/; do
  domain=$(basename "$dir")
  [ -f "${dir}DESIGN.md" ] || continue

  if npx tsx scripts/generate-showcase.ts "$domain" > "$LOGDIR/$domain.show.log" 2>&1; then
    SHOW_OK=$((SHOW_OK+1))
  else
    SHOW_FAIL=$((SHOW_FAIL+1))
  fi
done
echo "📊 Showcases: $SHOW_OK OK · $SHOW_FAIL FAIL"
@@ -0,0 +1,203 @@
1
+ /**
2
+ * Prism — Batch DESIGN.md Regeneration
3
+ *
4
+ * Regenerates DESIGN.md for all extractions that have both raw-css.json + tokens.json.
5
+ * Defensive: skips extractions with missing files (never process.exit on partial failure).
6
+ * Reports a final summary with scores.
7
+ *
8
+ * Usage: npx tsx scripts/regen-catalog.ts [--dry-run] [--domain <domain>]
9
+ * --dry-run Show what would be regenerated without writing files
10
+ * --domain Regenerate a single domain only
11
+ */
12
+
13
+ import { readdir, readFile, stat } from 'fs/promises';
14
+ import { existsSync } from 'fs';
15
+ import { join } from 'path';
16
+ import { spawnSync } from 'child_process';
17
+
18
/** Project root; every path below is resolved against the current working directory. */
const ROOT = process.cwd();
const EXTRACTIONS_DIR = join(ROOT, 'extractions');
const GENERATE_SCRIPT = join(ROOT, 'scripts', 'generate-design-md.ts');

/** True when `--dry-run` appears anywhere on the command line. */
const isDryRun = process.argv.includes('--dry-run');

/**
 * Extracts the value of `--domain <domain>` from an argv array.
 *
 * @param argv - Full argument vector (typically `process.argv`).
 * @returns The domain following `--domain`, or `null` when the flag is absent.
 * @throws Error when `--domain` is present but is last, is followed by an empty
 *   string, or is followed by another `--flag`. Without this check a forgotten
 *   value silently turned a single-domain run into a full regeneration.
 */
function parseDomainFilter(argv: readonly string[]): string | null {
  const flagIndex = argv.indexOf('--domain');
  if (flagIndex < 0) return null;

  const value = argv[flagIndex + 1];
  if (!value || value.startsWith('--')) {
    throw new Error('--domain requires a value, e.g. --domain stripe.com');
  }
  return value;
}

/** Domain selected with `--domain`, or `null` to process every extraction. */
const domainFilter = (() => {
  try {
    return parseDomainFilter(process.argv);
  } catch (err) {
    // Usage error: report it and stop before anything is regenerated.
    console.error(err instanceof Error ? err.message : String(err));
    process.exit(1);
  }
})();
27
+
28
/** Outcome category of one extraction directory. */
type RegenStatus = 'ok' | 'skipped' | 'error';

/** One row of the regeneration report, collected per extraction directory. */
interface RegenResult {
  /** Name of the directory under `extractions/`. */
  domain: string;
  /** Whether the domain was regenerated, skipped, or failed. */
  status: RegenStatus;
  /** Why the domain was skipped or failed; `'dry-run'` for dry-run rows. */
  reason?: string;
  /** Score parsed from the `completeness:` line of DESIGN.md, when one was found. */
  completeness?: number;
  /** Line count of the regenerated DESIGN.md. */
  lines?: number;
}
35
+
36
/**
 * Reads the completeness score recorded in a DESIGN.md file.
 *
 * Looks for the first line that starts with `completeness:` followed by digits.
 *
 * @param designMdPath - Path of the DESIGN.md file to inspect.
 * @returns The score as an integer, or `null` when the file cannot be read or
 *   has no such line.
 */
async function getCompleteness(designMdPath: string): Promise<number | null> {
  let text: string;
  try {
    text = await readFile(designMdPath, 'utf-8');
  } catch {
    // Missing or unreadable file is reported as "no score", not as an error.
    return null;
  }

  const scoreMatch = /^completeness:\s*(\d+)/m.exec(text);
  if (scoreMatch === null) return null;
  return parseInt(scoreMatch[1]);
}
45
+
46
/**
 * Counts the lines of a text file.
 *
 * A trailing newline terminates the last line rather than starting a new one,
 * so `"a\nb\n"` and `"a\nb"` both count as 2 lines, and an empty file counts
 * as 0. (Counting `split('\n')` segments directly over-reports every
 * newline-terminated file by one.)
 *
 * @param designMdPath - Path of the file to measure.
 * @returns The number of lines, or 0 when the file cannot be read.
 */
async function getLineCount(designMdPath: string): Promise<number> {
  let content: string;
  try {
    content = await readFile(designMdPath, 'utf-8');
  } catch {
    return 0;
  }

  if (content === '') return 0;
  const segments = content.split('\n').length;
  // The empty segment after a final '\n' is not a line of its own.
  return content.endsWith('\n') ? segments - 1 : segments;
}
54
+
55
/** Upper bound, in milliseconds, for regenerating a single domain. */
const REGEN_TIMEOUT_MS = 30000;

/**
 * Condenses a failed `spawnSync` result into a one-line reason.
 *
 * Prefers an explicit timeout message, then the first non-blank stderr line
 * (stderr often starts with an empty line), then the spawn error message.
 */
function describeSpawnFailure(stderr: string | null | undefined, error: Error | undefined): string {
  // spawnSync reports an exceeded `timeout` option as an error with code ETIMEDOUT.
  if ((error as { code?: string } | undefined)?.code === 'ETIMEDOUT') {
    return `timed out after ${REGEN_TIMEOUT_MS / 1000}s`;
  }
  const firstLine = stderr?.split('\n').find(line => line.trim() !== '');
  return firstLine || error?.message || 'unknown error';
}

/**
 * Regenerates DESIGN.md for every extraction directory (or only the one chosen
 * with `--domain`), prints a per-domain status and a summary, then runs catalog
 * enrichment.
 *
 * A domain is skipped when raw-css.json or tokens.json is missing. With
 * `--dry-run` nothing is spawned or written and eligible domains are counted
 * as OK. A failing domain does not stop the batch.
 *
 * Always terminates the process: exit code 1 when the extractions directory is
 * unreadable, the requested domain does not exist, or at least one domain
 * failed; 0 otherwise.
 */
async function main(): Promise<void> {
  console.log(`\n🔄 Prism — Batch DESIGN.md Regeneration`);
  if (isDryRun) console.log(` DRY RUN — no files will be written`);
  if (domainFilter) console.log(` Filter: ${domainFilter} only`);
  console.log('');

  // Discover all extraction directories (plain files are ignored).
  let allDomains: string[] = [];
  try {
    const entries = await readdir(EXTRACTIONS_DIR);
    for (const entry of entries) {
      try {
        const info = await stat(join(EXTRACTIONS_DIR, entry));
        if (info.isDirectory()) allDomains.push(entry);
      } catch { /* entry vanished or is unreadable — skip it */ }
    }
  } catch (err) {
    console.error('Cannot read extractions directory:', err);
    process.exit(1);
  }

  if (domainFilter) {
    allDomains = allDomains.filter(d => d === domainFilter);
    if (allDomains.length === 0) {
      console.error(`Domain "${domainFilter}" not found in extractions/`);
      process.exit(1);
    }
  }

  allDomains.sort();
  console.log(`Found ${allDomains.length} extraction(s) to process...\n`);

  // Resolve the tsx launcher once: prefer the project-local binary, otherwise
  // go through `npx tsx`. The answer cannot change while the batch runs.
  const localTsx = join(ROOT, 'node_modules', '.bin', 'tsx');
  const hasLocalTsx = existsSync(localTsx);
  const tsxCmd = hasLocalTsx ? localTsx : 'npx';
  const tsxArgsFor = (script: string, ...scriptArgs: string[]): string[] =>
    hasLocalTsx ? [script, ...scriptArgs] : ['tsx', script, ...scriptArgs];

  const results: RegenResult[] = [];
  let okCount = 0;
  let skipCount = 0;
  let errorCount = 0;

  for (const domain of allDomains) {
    const extractionDir = join(EXTRACTIONS_DIR, domain);
    const designMdPath = join(extractionDir, 'DESIGN.md');

    // Skip if a required input file is missing (raw-css.json is checked first).
    const missingInput = ['raw-css.json', 'tokens.json'].find(
      file => !existsSync(join(extractionDir, file)),
    );
    if (missingInput !== undefined) {
      results.push({ domain, status: 'skipped', reason: `no ${missingInput}` });
      skipCount++;
      console.log(` ⏭️ ${domain} — skipped (no ${missingInput})`);
      continue;
    }

    if (isDryRun) {
      console.log(` 🔲 ${domain} — would regenerate`);
      results.push({ domain, status: 'ok', reason: 'dry-run' });
      okCount++;
      continue;
    }

    // No newline here: the outcome is appended to the same console line below.
    process.stdout.write(` ⚙️ ${domain}... `);

    const result = spawnSync(tsxCmd, tsxArgsFor(GENERATE_SCRIPT, domain), {
      cwd: ROOT,
      encoding: 'utf-8',
      timeout: REGEN_TIMEOUT_MS,
    });

    if (result.error || result.status !== 0) {
      const errMsg = describeSpawnFailure(result.stderr, result.error);
      console.log(`❌ ${errMsg.slice(0, 80)}`);
      results.push({ domain, status: 'error', reason: errMsg });
      errorCount++;
      continue;
    }

    // Read back the score and size of the file the generator just wrote.
    const [completeness, lines] = await Promise.all([
      getCompleteness(designMdPath),
      getLineCount(designMdPath),
    ]);

    const scoreDisplay = completeness !== null ? `${completeness}/100` : 'no score';
    console.log(`✅ ${lines}L | ${scoreDisplay}`);
    results.push({ domain, status: 'ok', completeness: completeness ?? undefined, lines });
    okCount++;
  }

  // ── Summary ──
  console.log('\n' + '═'.repeat(60));
  console.log(`📊 Regeneration Summary`);
  console.log(` ✅ OK: ${okCount} | ⏭️ Skipped: ${skipCount} | ❌ Errors: ${errorCount}`);

  if (!isDryRun) {
    // The type guard lets the statistics below use `completeness` without `!`.
    const withScores = results.filter(
      (r): r is RegenResult & { completeness: number } => r.completeness !== undefined,
    );
    if (withScores.length > 0) {
      const scores = withScores.map(r => r.completeness);
      const avgScore = Math.round(scores.reduce((sum, score) => sum + score, 0) / scores.length);
      const maxScore = Math.max(...scores);
      const minScore = Math.min(...scores);
      console.log(` Completeness: avg ${avgScore}/100 | best ${maxScore} | worst ${minScore}`);

      // Top 5 and bottom 5 (the bottom list is shown only with more than 5 scores).
      const sorted = [...withScores].sort((a, b) => b.completeness - a.completeness);
      console.log(`\n Top 5:`);
      for (const r of sorted.slice(0, 5)) {
        console.log(` ${r.domain}: ${r.completeness}/100`);
      }
      if (sorted.length > 5) {
        console.log(`\n Needs improvement:`);
        for (const r of sorted.slice(-5).reverse()) {
          console.log(` ${r.domain}: ${r.completeness}/100`);
        }
      }
    }

    // Run catalog enrichment to update catalog/index.json with new scores.
    console.log('\n🔧 Updating catalog/index.json with new scores...');
    const enrichScript = join(ROOT, 'scripts', 'enrich-catalog.ts');
    if (existsSync(enrichScript)) {
      const enrichResult = spawnSync(tsxCmd, tsxArgsFor(enrichScript), { stdio: 'inherit', cwd: ROOT });
      if (enrichResult.status !== 0) {
        // Enrichment failure is a warning only; it does not affect the exit code.
        console.warn('⚠️ catalog enrichment failed — run manually: npx tsx scripts/enrich-catalog.ts');
      }
    }
  }

  console.log('');
  process.exit(errorCount > 0 ? 1 : 0);
}

// Entry point: anything main() did not handle itself is fatal.
main().catch(err => {
  console.error('Fatal error in regen-catalog:', err);
  process.exit(1);
});
@@ -0,0 +1,149 @@
1
+ /**
2
+ * shared/cache.ts — Phase 2.3 re-extraction cache
3
+ *
4
+ * Skips Playwright extraction if the URL content hasn't materially changed since last run.
5
+ * Fingerprint = hash(URL + HTTP headers ETag/last-modified/content-length + algorithm version).
6
+ *
7
+ * Cache hit: ~2s (HEAD request only) vs full extraction ~140s = 99% time saved on re-runs.
8
+ *
9
+ * Safety:
10
+ * - Atomic writes (temp file + rename) prevent corruption on partial writes
11
+ * - Algorithm version baked into fingerprint = automatic bust when extractors change
12
+ * - Force flag bypasses cache for explicit re-extractions
13
+ * - TTL prevents stale fingerprints from blocking forever
14
+ */
15
+
16
+ import { createHash } from 'crypto';
17
+ import { existsSync, readFileSync, writeFileSync, renameSync, statSync } from 'fs';
18
+ import { join, dirname } from 'path';
19
+ import { mkdir } from 'fs/promises';
20
+
21
/**
 * Version tag of the extraction algorithm. It is hashed into every fingerprint,
 * so changing it makes all stored fingerprints stop matching. Bump it whenever
 * the extraction logic materially changes.
 */
export const CACHE_ALGORITHM_VERSION = '2026.05.26';

/** Verdict returned by the cache check. */
export interface CacheCheckResult {
  /** True when the extraction can be skipped. */
  skip: boolean;
  /** Human-readable explanation of the verdict. */
  reason: string;
  /** Fingerprint computed for the URL during the check, when one was computed. */
  fingerprint?: string;
}
29
+
30
/**
 * Decide whether the extraction for this URL can be skipped.
 *
 * Checks, in order:
 *  1. `force` → never skip (no fingerprint is computed).
 *  2. No raw-css.json or no stored fingerprint → extract.
 *  3. Stored fingerprint file older than the TTL (by mtime) → extract.
 *  4. Current fingerprint equals the stored one → skip; otherwise extract.
 *
 * Every non-skip verdict except `force` also carries the freshly computed
 * fingerprint whenever it could be computed. Previously the TTL-expired and
 * file-error paths returned none, leaving the caller nothing to store after the
 * re-extraction; NOTE(review): this assumes the caller persists
 * `result.fingerprint` after extracting — confirm against the extract script.
 * The cost is one extra HEAD request on those paths.
 *
 * @param url - Target URL to potentially re-extract
 * @param domain - Resolved domain (the `<cacheDir>/<domain>/` directory)
 * @param opts - Options
 *   - force: bypass the cache entirely (always extract)
 *   - ttlDays: max age of the stored fingerprint before re-extracting even on a match (default 7)
 *   - cacheDir: extractions root (default 'extractions')
 * @returns The verdict, its reason, and the current fingerprint when available.
 */
export async function shouldSkipExtraction(
  url: string,
  domain: string,
  opts: { force?: boolean; ttlDays?: number; cacheDir?: string } = {}
): Promise<CacheCheckResult> {
  if (opts.force) return { skip: false, reason: 'force flag set' };

  const cacheDir = opts.cacheDir || 'extractions';
  const ttlDays = opts.ttlDays ?? 7;
  const fpPath = join(cacheDir, domain, '.extraction-fingerprint');
  const rawCssPath = join(cacheDir, domain, 'raw-css.json');

  // Best-effort fingerprint for the paths that extract anyway: a failure here
  // only means the cache cannot be repopulated on this run.
  const fingerprintIfAvailable = async (): Promise<string | undefined> => {
    try {
      return await computeUrlFingerprint(url);
    } catch {
      return undefined;
    }
  };

  // No existing extraction → must run, but still hand back a fingerprint to store.
  if (!existsSync(rawCssPath) || !existsSync(fpPath)) {
    return {
      skip: false,
      reason: 'no prior extraction or fingerprint',
      fingerprint: await fingerprintIfAvailable(),
    };
  }

  // TTL check, based on when the fingerprint file was last written.
  let ageMs: number;
  try {
    ageMs = Date.now() - statSync(fpPath).mtimeMs;
  } catch {
    return {
      skip: false,
      reason: 'cannot stat fingerprint file',
      fingerprint: await fingerprintIfAvailable(),
    };
  }
  if (ageMs > ttlDays * 86400 * 1000) {
    return {
      skip: false,
      reason: `fingerprint older than ${ttlDays}d TTL`,
      fingerprint: await fingerprintIfAvailable(),
    };
  }

  // Compute the current fingerprint from a HEAD request on the URL.
  let newFp: string;
  try {
    newFp = await computeUrlFingerprint(url);
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    return { skip: false, reason: `HEAD request failed: ${message}` };
  }

  // Compare with the stored fingerprint.
  let storedFp: string;
  try {
    storedFp = readFileSync(fpPath, 'utf-8').trim();
  } catch {
    return { skip: false, reason: 'cannot read stored fingerprint', fingerprint: newFp };
  }

  if (storedFp === newFp) {
    return { skip: true, reason: 'fingerprint match (unchanged)', fingerprint: newFp };
  }
  return { skip: false, reason: 'fingerprint differs (content or version changed)', fingerprint: newFp };
}
96
+
97
/**
 * Stores the fingerprint for a domain at `<cacheDir>/<domain>/.extraction-fingerprint`.
 *
 * The value is first written to a sibling temp file (suffixed with the process
 * id and a timestamp) and then renamed over the final path, so a reader never
 * sees a half-written fingerprint. The domain directory is created if needed.
 *
 * @param domain - Domain whose directory receives the fingerprint file
 * @param fingerprint - Fingerprint text to store
 * @param opts - `cacheDir`: extractions root (default 'extractions')
 */
export async function writeFingerprint(
  domain: string,
  fingerprint: string,
  opts: { cacheDir?: string } = {}
): Promise<void> {
  const root = opts.cacheDir || 'extractions';
  const target = join(root, domain, '.extraction-fingerprint');
  const staging = `${target}.${process.pid}.${Date.now()}.tmp`;

  await mkdir(dirname(target), { recursive: true });

  writeFileSync(staging, fingerprint, 'utf-8');
  // Rename is atomic on POSIX filesystems.
  renameSync(staging, target);
}
113
+
114
/**
 * Derive a fingerprint for a URL from its HEAD response headers.
 *
 * The fingerprint is the SHA-1 hex digest (40 chars) of
 * `url | etag | last-modified | content-length | content-type | CACHE_ALGORITHM_VERSION`.
 * Including the algorithm version means a version bump changes every
 * fingerprint even when the remote content is identical.
 *
 * Headers are read only from an OK response or a 405 (some sites reject HEAD
 * but may still send informative headers). On any other status, or when the
 * request fails or exceeds 8s, all four header values are empty. This function
 * does not reject on request failure.
 *
 * @param url - URL to probe
 * @returns The hex fingerprint
 */
async function computeUrlFingerprint(url: string): Promise<string> {
  const headerNames = ['etag', 'last-modified', 'content-length', 'content-type'];
  let headerValues = headerNames.map(() => '');

  try {
    const res = await fetch(url, {
      method: 'HEAD',
      redirect: 'follow',
      signal: AbortSignal.timeout(8000),
    });
    const usable = res.ok || res.status === 405;
    if (usable) {
      headerValues = headerNames.map(name => res.headers.get(name) || '');
    }
  } catch {
    // HEAD failed (CORS, timeout, network): keep the empty header values.
  }

  const material = [url, ...headerValues, CACHE_ALGORITHM_VERSION].join('|');
  return createHash('sha1').update(material).digest('hex');
}