fullstackgtm 0.31.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,405 @@
1
+ /**
2
+ * Market sourcing — find the *right* page to capture for each vendor, detect
3
+ * acquired/redirected vendors, and extract brand logos. These raise the quality
4
+ * of a cold-start map and are useful to every consumer (CLI, MCP, or a hosted
5
+ * service), with zero coupling to any transport.
6
+ *
7
+ * Design:
8
+ * - **Pure functions** (`pickCategoryPage`, `extractLogoUrl`, `categoryKeywords`,
9
+ * `registrableDomain`) operate on already-fetched HTML / strings. The caller
10
+ * fetches the page with whatever it likes — the hosted service injects a
11
+ * browser-rendering fetch for JS-walled homepages; the CLI uses the default.
12
+ * - **Fetching helpers** (`resolveFinalUrl`, `detectDrift`,
13
+ * `findCategoryPageInSitemap`, `fetchLogoDataUri`) default to the package's
14
+ * SSRF-guarded `assertPublicUrl` + global `fetch`, but each accepts an
15
+ * injectable fetcher so they stay testable offline and transport-agnostic.
16
+ * Sitemaps / redirects / logo bytes are plain resources — they never need a
17
+ * headless browser, so the default fetch is sufficient even in the hosted layer.
18
+ */
19
+ import { assertPublicUrl } from "./market.ts";
20
+
21
+ const USER_AGENT = "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)";
22
+ const FETCH_TIMEOUT_MS = 15_000;
23
+ const MAX_REDIRECTS = 5;
24
+
25
+ /** Fetch a text resource (sitemap, robots.txt); null on any failure. */
26
+ export type FetchText = (url: string) => Promise<string | null>;
27
+ /** Fetch raw bytes (a logo image); null on any failure. */
28
+ export type FetchBytes = (url: string) => Promise<{ contentType: string; bytes: Uint8Array } | null>;
29
+ /** Resolve a URL's final destination after redirects. */
30
+ export type ResolveUrl = (url: string) => Promise<{ finalUrl: string; status: number }>;
31
+
32
/**
 * Hostname of `url` with one leading `www.` stripped.
 *
 * @param url - Absolute URL to parse.
 * @returns The hostname, or null when `url` cannot be parsed.
 */
function hostOf(url: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  return parsed.hostname.replace(/^www\./, "");
}
39
+
40
+ // ── Pure helpers ────────────────────────────────────────────────────────────
41
+
42
// A few common multi-label public suffixes so "bbc.co.uk" → "bbc.co.uk", not "co.uk".
const MULTI_LABEL_TLDS = new Set([
  "co.uk", "com.au", "co.nz", "co.jp", "com.br", "co.in", "com.sg", "co.za", "com.mx", "com.cn",
]);

/**
 * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
 * redirect. Heuristic (no full public-suffix list): last two labels, or three for
 * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
 *
 * The input is lower-cased and a single trailing dot is dropped. Hosts with at
 * most two labels, and dotted-quad IPv4 literals, are returned whole: trimming an
 * IP address to its last two octets would make unrelated addresses ("10.0.0.1",
 * "192.168.0.1") compare as the same "domain".
 *
 * @param host - Hostname (not a full URL).
 * @returns The normalized registrable domain, or "" for an empty host.
 */
export function registrableDomain(host: string): string {
  const h = String(host || "").toLowerCase().replace(/\.$/, "");
  const labels = h.split(".");
  if (labels.length <= 2) return h;
  // An IPv4 literal has no registrable domain; keep the whole address.
  if (labels.length === 4 && labels.every((label) => /^\d{1,3}$/.test(label))) return h;
  const lastTwo = labels.slice(-2).join(".");
  return MULTI_LABEL_TLDS.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
59
+
60
// Generic filler words that carry no category signal.
const CATEGORY_STOPWORDS = new Set([
  "software", "platform", "platforms", "tool", "tools", "system", "systems", "management",
  "solution", "solutions", "app", "apps", "service", "services", "suite", "the", "for", "and", "of",
]);

/**
 * Distinct, significant words of a category name, used to match pages and links.
 * The name is lower-cased and split on runs of non-alphanumeric characters;
 * words shorter than three characters and generic stopwords are discarded.
 *
 * @param category - Free-text category, e.g. "Sales Commission Software".
 * @returns Keywords in first-occurrence order, without duplicates.
 */
export function categoryKeywords(category: string): string[] {
  const words = String(category || "").toLowerCase().split(/[^a-z0-9]+/);
  const unique = new Set<string>();
  for (const word of words) {
    if (word.length < 3 || CATEGORY_STOPWORDS.has(word)) continue;
    unique.add(word);
  }
  return Array.from(unique);
}
76
+
77
/**
 * Conglomerate page-selection: when a vendor's homepage isn't about the category,
 * follow its own nav to the category page. Scans same-registrable-domain internal
 * links and returns the one whose anchor text / path best matches the category
 * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
 * Pure — operates on the given HTML.
 *
 * @param html - Already-fetched homepage markup.
 * @param baseUrl - URL the markup was fetched from; relative hrefs resolve against it.
 * @param keywords - Lower-case category keywords to look for.
 * @returns Absolute URL of the best-scoring link, or null if the inputs are
 *   empty, `baseUrl` is unparseable, or no link scores at least 2.
 */
export function pickCategoryPage(html: string, baseUrl: string, keywords: string[]): string | null {
  if (!html || !keywords.length) return null;
  let baseHost: string;
  try {
    baseHost = registrableDomain(new URL(baseUrl).hostname);
  } catch {
    return null;
  }
  const re = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  let m: RegExpExecArray | null;
  while ((m = re.exec(html))) {
    // Attribute values are HTML-escaped in markup: "?a=1&amp;b=2" means "?a=1&b=2".
    // Decode before resolving so the returned URL keeps its real query string.
    const href = m[1].replace(/&amp;/g, "&");
    let u: URL;
    try {
      u = new URL(href, baseUrl);
    } catch {
      continue;
    }
    if (u.protocol !== "http:" && u.protocol !== "https:") continue;
    if (registrableDomain(u.hostname) !== baseHost) continue; // internal links only
    // Score each page once, ignoring query/fragment variants of the same path.
    const key = u.origin + u.pathname;
    if (seen.has(key)) continue;
    seen.add(key);
    // Visible anchor text: strip nested tags and collapse whitespace.
    const text = m[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
    const path = u.pathname.toLowerCase();
    let score = 0;
    for (const kw of keywords) {
      if (text.includes(kw)) score += 2;
      if (path.includes(kw)) score += 1;
    }
    // Strictly greater: on a tie the earliest link in document order wins.
    if (score > bestScore) {
      bestScore = score;
      best = u.toString();
    }
  }
  return bestScore >= 2 ? best : null;
}
123
+
124
/**
 * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
 * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
 *
 * Attribute values are HTML-escaped in markup, so `&amp;` is decoded to `&`
 * before resolving; otherwise CDN image URLs with several query parameters
 * would come back with a literal "&amp;" in them.
 *
 * @param html - Already-fetched homepage markup.
 * @param baseUrl - URL the markup was fetched from; relative hrefs resolve against it.
 * @returns Absolute logo URL, or null when no candidate tag yields a usable URL.
 */
export function extractLogoUrl(html: string, baseUrl: string): string | null {
  if (!html) return null;
  const abs = (raw: string): string | null => {
    try {
      return new URL(raw.replace(/&amp;/g, "&"), baseUrl).toString();
    } catch {
      return null;
    }
  };
  // <link> carries the URL in href, <meta> in content; href wins if both appear.
  const hrefOf = (tag: string): string | null => {
    const mm = tag.match(/href=["']([^"']+)["']/i) || tag.match(/content=["']([^"']+)["']/i);
    return mm ? abs(mm[1]) : null;
  };
  // Candidate tags in priority order; only the first match of each pattern is tried.
  const tagPatterns = [
    /<link[^>]+rel=["'][^"']*apple-touch-icon[^"']*["'][^>]*>/i,
    /<meta[^>]+property=["']og:image["'][^>]*>/i,
    /<link[^>]+rel=["'](?:shortcut icon|icon)["'][^>]*>/i,
  ];
  for (const pattern of tagPatterns) {
    const tag = html.match(pattern);
    if (!tag) continue;
    const resolved = hrefOf(tag[0]);
    if (resolved) return resolved;
  }
  return null;
}
158
+
159
+ // ── Default SSRF-guarded fetchers ────────────────────────────────────────────
160
+
161
/**
 * Fetch `url` while following redirects by hand, so that every hop is checked
 * with `assertPublicUrl` before it is requested.
 *
 * @returns The first response that is not a 3xx carrying a `Location` header,
 *   or null once more than `MAX_REDIRECTS` redirects have been followed.
 *   Failures from `assertPublicUrl`, `fetch` (including the per-request
 *   timeout) or resolving `Location` propagate to the caller.
 */
async function guardedFetch(url: string): Promise<Response | null> {
  let target = url;
  let requestsLeft = MAX_REDIRECTS + 1;
  while (requestsLeft-- > 0) {
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });
    const next = response.headers.get("location");
    const isRedirect = response.status >= 300 && response.status < 400;
    if (!isRedirect || !next) return response;
    // Discard the redirect body before moving on; a failed cancel is harmless.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }
    target = new URL(next, target).toString();
  }
  return null;
}
185
+
186
/**
 * Default `FetchText`: body text of a successful (`ok`) guarded fetch.
 * Any other outcome — no response, non-ok status, or a thrown error — is null.
 */
const defaultFetchText: FetchText = async (url) => {
  try {
    const response = await guardedFetch(url);
    if (!response || !response.ok) return null;
    return await response.text();
  } catch {
    return null;
  }
};
194
+
195
/**
 * Default `FetchBytes`: raw body of a successful (`ok`) guarded fetch together
 * with its media type (the `Content-Type` header without parameters, trimmed
 * and lower-cased; "" when the header is absent). Any failure yields null.
 */
const defaultFetchBytes: FetchBytes = async (url) => {
  try {
    const response = await guardedFetch(url);
    if (!(response && response.ok)) return null;
    const rawType = response.headers.get("content-type") || "";
    const contentType = rawType.split(";")[0].trim().toLowerCase();
    const buffer = await response.arrayBuffer();
    return { contentType, bytes: new Uint8Array(buffer) };
  } catch {
    return null;
  }
};
206
+
207
/**
 * Walk a redirect chain hop by hop — each hop validated with `assertPublicUrl`
 * before it is requested — and report where it ends, cancelling every response
 * body instead of reading it. Used to detect identity drift.
 *
 * @returns The last URL requested and its HTTP status. If the chain is still
 *   redirecting after `MAX_REDIRECTS` redirects have been followed, the status
 *   is 0 and `finalUrl` is the next, unrequested, target. Errors from the guard
 *   or from `fetch` are not caught.
 */
export const resolveFinalUrl: ResolveUrl = async (rawUrl) => {
  let target = rawUrl;
  let hop = 0;
  while (hop <= MAX_REDIRECTS) {
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });
    // Only status and headers matter here; drop the body right away.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }
    const next = response.headers.get("location");
    const isRedirect = response.status >= 300 && response.status < 400;
    if (!isRedirect || !next) {
      return { finalUrl: target, status: response.status };
    }
    target = new URL(next, target).toString();
    hop += 1;
  }
  return { finalUrl: target, status: 0 };
};
234
+
235
+ // ── Fetching helpers ─────────────────────────────────────────────────────────
236
+
237
/**
 * Detect vendor identity drift: report when `url`, or its www/apex sibling,
 * lands on a registrable domain other than that of `srcHost`. Catches
 * acquired/defunct products (e.g. www.spiff.com → salesforce.com) even when the
 * apex itself errors.
 *
 * Only a resolution with a status in 200–399 is considered. Any other status,
 * or a throw from `resolve`, is not treated as drift (real sites block bare
 * requests), and the next candidate is tried.
 *
 * @param url - Vendor URL to probe.
 * @param srcHost - Host the vendor is expected to live on; an empty value disables the check.
 * @param resolve - Redirect resolver; injectable for offline tests.
 * @returns The drifted-to host (leading `www.` stripped), or null when no drift is seen.
 */
export async function detectDrift(
  url: string,
  srcHost: string,
  resolve: ResolveUrl = resolveFinalUrl,
): Promise<string | null> {
  if (!srcHost) return null;

  // Probe the URL as given, then its www <-> apex counterpart when one can be derived.
  const candidates = [url];
  try {
    const { hostname } = new URL(url);
    let sibling: string;
    if (hostname.startsWith("www.")) {
      sibling = url.replace("://www.", "://");
    } else {
      sibling = url.replace(`://${hostname}`, `://www.${hostname}`);
    }
    if (sibling !== url) candidates.push(sibling);
  } catch {
    /* ignore */
  }

  const expectedDomain = registrableDomain(srcHost);
  for (const candidate of candidates) {
    try {
      const landing = await resolve(candidate);
      const landed = landing.status >= 200 && landing.status < 400;
      if (!landed) continue;
      const landedHost = hostOf(landing.finalUrl);
      if (landedHost && registrableDomain(landedHost) !== expectedDomain) return landedHost;
    } catch {
      /* a throw isn't a drift signal */
    }
  }
  return null;
}
272
+
273
/**
 * Fallback for JS-nav conglomerates whose product links aren't in the rendered
 * homepage: find the category page from the vendor's sitemap.
 *
 * Starts from `/sitemap.xml`, `/sitemap_index.xml` and any `Sitemap:` lines in
 * robots.txt, and follows sitemap indexes (top 5 children each, media/news
 * sitemaps filtered out). Only same-registrable-domain, uncompressed sitemaps
 * are read. Page URLs are scored by keyword hits in the path, with a bonus for
 * /product(s)/ or /solution(s)/ paths and a penalty for a leading two-letter
 * locale segment other than "en"/"us".
 *
 * Bounds: at most 6 sitemaps successfully read and at most 12 fetch attempts in
 * total, so a robots.txt listing many dead sitemap URLs cannot trigger an
 * open-ended series of requests. The 20k-URL budget is checked between
 * sitemaps, so the sitemap that crosses it is still scanned to the end.
 *
 * @param rootUrl - Any URL on the vendor's site; only its origin is used.
 * @param keywords - Lower-case category keywords matched against URL paths.
 * @param fetchText - Text fetcher; injectable for offline tests.
 * @returns The best-scoring page URL, or null when `rootUrl` is unparseable,
 *   `keywords` is empty, or no URL reaches a score of min(2, keywords.length).
 */
export async function findCategoryPageInSitemap(
  rootUrl: string,
  keywords: string[],
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  let root: URL;
  try {
    root = new URL(rootUrl);
  } catch {
    return null;
  }
  if (!keywords.length) return null;
  const rootDom = registrableDomain(root.hostname);
  const sameDomain = (u: string) => registrableDomain(hostOf(u) || "") === rootDom;
  const pathScore = (u: string): number => {
    let path: string;
    try {
      path = new URL(u).pathname.toLowerCase();
    } catch {
      return 0;
    }
    let s = 0;
    for (const kw of keywords) if (path.includes(kw)) s += 1;
    // Adjustments apply only to paths that already hit a keyword.
    if (s > 0) {
      if (/\/(product|solution)s?\//.test(path)) s += 0.5; // prefer product pages
      const loc = path.match(/^\/([a-z]{2})(?:-[a-z]{2})?\//); // de-prioritize non-English locales
      if (loc && !["en", "us"].includes(loc[1])) s -= 0.6;
    }
    return s;
  };

  const candidates = new Set([`${root.origin}/sitemap.xml`, `${root.origin}/sitemap_index.xml`]);
  const robots = await fetchText(`${root.origin}/robots.txt`);
  if (robots) {
    for (const mm of robots.slice(0, 100_000).matchAll(/^\s*sitemap:\s*(\S+)/gim)) {
      try {
        candidates.add(new URL(mm[1].trim(), root.origin).toString());
      } catch {
        /* skip */
      }
    }
  }

  // Hard ceiling on network requests, counting failed fetches as well as successes.
  const maxFetchAttempts = 12;
  const queue = [...candidates];
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  let fetched = 0;
  let attempts = 0;
  let scanned = 0;
  while (queue.length && fetched < 6 && attempts < maxFetchAttempts && scanned < 20_000) {
    const sm = queue.shift() as string;
    if (seen.has(sm) || !sameDomain(sm)) continue;
    seen.add(sm);
    if (sm.endsWith(".gz")) continue; // skip compressed sitemaps
    attempts++;
    const xml = await fetchText(sm);
    if (!xml) continue;
    fetched++;
    const body = xml.slice(0, 5_000_000);
    const locs = [...body.matchAll(/<loc>\s*([^<\s]+?)\s*<\/loc>/gi)].map((mm) => mm[1].replace(/&amp;/g, "&"));
    if (/<sitemapindex/i.test(body)) {
      // Index file: queue the most promising child sitemaps.
      const ranked = locs
        .filter(sameDomain)
        .filter((l) => !/(pdf|video|image|img|news|siteimprove)/i.test(l))
        .map((l): [number, string] => [pathScore(l) + (/product|solution/i.test(l) ? 1 : 0), l])
        .sort((a, b) => b[0] - a[0]);
      for (const [, l] of ranked.slice(0, 5)) queue.push(l);
    } else {
      // URL set: keep the single best-scoring same-domain page.
      for (const l of locs) {
        scanned++;
        if (!sameDomain(l)) continue;
        const s = pathScore(l);
        if (s > bestScore) {
          bestScore = s;
          best = l;
        }
      }
    }
  }
  return bestScore >= Math.min(2, keywords.length) ? best : null;
}
360
+
361
/**
 * Locate a vendor's category-specific page. The already-fetched homepage is
 * searched for a matching nav link first; only if none is found is the
 * vendor's sitemap consulted.
 *
 * @param homepageHtml - Markup of the vendor homepage.
 * @param homepageUrl - URL that markup came from.
 * @param category - Free-text category name; reduced to keywords internally.
 * @param fetchText - Text fetcher for the sitemap fallback; injectable for tests.
 * @returns The category page URL, or null to keep the homepage (including when
 *   the category yields no usable keywords).
 */
export async function findCategoryPage(
  homepageHtml: string,
  homepageUrl: string,
  category: string,
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  const keywords = categoryKeywords(category);
  if (keywords.length === 0) return null;
  const fromNav = pickCategoryPage(homepageHtml, homepageUrl, keywords);
  return fromNav ? fromNav : findCategoryPageInSitemap(homepageUrl, keywords, fetchText);
}
377
+
378
/**
 * A vendor logo as a self-contained `data:` URI — the form `MarketVendor.logo`
 * renders and the report serves under a strict `img-src data:` CSP.
 *
 * Sources are tried in order: the logo declared by the homepage markup (when
 * `html` is given and declares one), then Google's favicon service for the
 * host. A source is accepted only if it returns an `image/*` content type and
 * between 1 and 50,000 bytes.
 *
 * @param homepageUrl - Vendor homepage URL; must parse, or null is returned.
 * @param html - Optional already-fetched homepage markup.
 * @param fetchBytes - Byte fetcher; injectable for offline tests.
 * @returns A base64 `data:` URI, or null when no source qualifies.
 */
export async function fetchLogoDataUri(
  homepageUrl: string,
  html?: string,
  fetchBytes: FetchBytes = defaultFetchBytes,
): Promise<string | null> {
  const host = hostOf(homepageUrl);
  if (!host) return null;

  const declared = html ? extractLogoUrl(html, homepageUrl) : null;
  const faviconService = `https://www.google.com/s2/favicons?domain=${host}&sz=64`;
  const sources = declared ? [declared, faviconService] : [faviconService];

  for (const source of sources) {
    const image = await fetchBytes(source);
    if (!image) continue;
    const { contentType, bytes } = image;
    const acceptable = contentType.startsWith("image/") && bytes.length > 0 && bytes.length <= 50_000;
    if (!acceptable) continue;
    return `data:${contentType};base64,${Buffer.from(bytes).toString("base64")}`;
  }
  return null;
}