fullstackgtm 0.32.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,29 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
5
5
  and the project adheres to [Semantic Versioning](https://semver.org/).
6
6
  The path to 1.0 is planned in [docs/roadmap-to-1.0.md](./docs/roadmap-to-1.0.md).
7
7
 
8
+ ## [0.33.0] — 2026-06-18
9
+
10
+ ### Added
11
+
12
+ - **Market sourcing helpers (`marketSourcing.ts`)** — find the right page to
13
+ capture per vendor, detect acquired/redirected vendors, and extract brand logos,
14
+ with zero coupling to any transport:
15
+ - `pickCategoryPage(html, baseUrl, keywords)` — follow a vendor's own nav to its
16
+ category page (so multi-product companies like SAP/Salesforce are captured on
17
+ the product page, not the corporate homepage). Pure.
18
+ - `findCategoryPageInSitemap(rootUrl, keywords, fetchText?)` — sitemap fallback
19
+ for JS-mega-menu sites whose product links aren't in the rendered homepage.
20
+ - `findCategoryPage(...)` — nav-scan then sitemap, combined.
21
+ - `detectDrift(url, srcHost, resolve?)` / `resolveFinalUrl(url)` — skip vendors
22
+ whose site redirects to a different company (an acquired/defunct product).
23
+ - `extractLogoUrl(html, baseUrl)` + `fetchLogoDataUri(homepageUrl, html?, …)` —
24
+ a vendor logo as a self-contained `data:` URI for `MarketVendor.logo`.
25
+ - `categoryKeywords()` + `registrableDomain()` utilities.
26
+
27
+ Pure functions operate on already-fetched HTML; fetching helpers default to the
28
+ package's SSRF-guarded `assertPublicUrl` + `fetch` but accept an injectable
29
+ fetcher (testable offline, browser-render-friendly). 8 tests.
30
+
8
31
  ## [0.32.0] — 2026-06-18
9
32
 
10
33
  ### Added
package/dist/index.d.ts CHANGED
@@ -34,6 +34,7 @@ export { buildWorksheet, classifyMarket, type ClassifyMarketOptions, type Classi
34
34
  export { computeDirectives, computeOverlayStats, directivesToPlan, overlayToMarkdown, type CallDocument, type ClaimMentionStats, type DirectiveStat, type DirectiveType, type MarketDirective, type OverlayOptions, type OverlayStats, type VendorMentionStats, } from "./marketOverlay.ts";
35
35
  export { computeScaleIndex, dimensionForMetric, scaleReportToText, type ScaleDimension, type ScaleReport, type SignalEstimate, type VendorScale, } from "./marketScale.ts";
36
36
  export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.ts";
37
+ export { registrableDomain, categoryKeywords, pickCategoryPage, extractLogoUrl, resolveFinalUrl, detectDrift, findCategoryPageInSitemap, findCategoryPage, fetchLogoDataUri, type FetchText, type FetchBytes, type ResolveUrl, } from "./marketSourcing.ts";
37
38
  export { computeMissedFirings, createFileScheduleRunStore, createFileScheduleStore, cronMatches, crontabSentinels, expectedFirings, nextCronFiring, parseCron, renderManagedBlock, replaceManagedBlock, scheduleId, scheduleRunsDir, schedulesPath, systemCrontabIo, tokenizeCommand, validateSchedulableArgv, type CronExpression, type CrontabIo, type ScheduleEntry, type ScheduleProvider, type ScheduleRunRecord, type ScheduleRunStore, type ScheduleRunTrigger, type ScheduleStore, } from "./schedule.ts";
38
39
  export { suggestValues, type SuggestionConfidence, type ValueSuggestion } from "./suggest.ts";
39
40
  export type { ApprovalStatus, AuditFinding, AuditFindingSeverity, CanonicalAccount, CanonicalActivity, CanonicalContact, CanonicalDeal, CanonicalGtmSnapshot, CanonicalUser, CrmProvider, GtmAuditRule, GtmConnector, GtmEvidence, GtmEvidenceSourceSystem, GtmObjectType, GtmPolicy, GtmRuleContext, GtmRuleResult, GtmSnapshotIndex, PatchOperation, PatchOperationResult, PatchOperationType, PatchPlan, PatchPlanRun, PatchPlanRunStatus, PatchVerification, PipelineFinding, PipelineFindingStatus, PipelineFindingType, ProviderIdentity, RiskLevel, SourceFreshness, } from "./types.ts";
package/dist/index.js CHANGED
@@ -34,5 +34,6 @@ export { buildWorksheet, classifyMarket, } from "./marketClassify.js";
34
34
  export { computeDirectives, computeOverlayStats, directivesToPlan, overlayToMarkdown, } from "./marketOverlay.js";
35
35
  export { computeScaleIndex, dimensionForMetric, scaleReportToText, } from "./marketScale.js";
36
36
  export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.js";
37
+ export { registrableDomain, categoryKeywords, pickCategoryPage, extractLogoUrl, resolveFinalUrl, detectDrift, findCategoryPageInSitemap, findCategoryPage, fetchLogoDataUri, } from "./marketSourcing.js";
37
38
  export { computeMissedFirings, createFileScheduleRunStore, createFileScheduleStore, cronMatches, crontabSentinels, expectedFirings, nextCronFiring, parseCron, renderManagedBlock, replaceManagedBlock, scheduleId, scheduleRunsDir, schedulesPath, systemCrontabIo, tokenizeCommand, validateSchedulableArgv, } from "./schedule.js";
38
39
  export { suggestValues } from "./suggest.js";
@@ -0,0 +1,66 @@
1
+ /** Fetch a text resource (sitemap, robots.txt); null on any failure. */
2
+ export type FetchText = (url: string) => Promise<string | null>;
3
+ /** Fetch raw bytes (a logo image); null on any failure. */
4
+ export type FetchBytes = (url: string) => Promise<{
5
+ contentType: string;
6
+ bytes: Uint8Array;
7
+ } | null>;
8
+ /** Resolve a URL's final destination after redirects. */
9
+ export type ResolveUrl = (url: string) => Promise<{
10
+ finalUrl: string;
11
+ status: number;
12
+ }>;
13
+ /**
14
+ * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
15
+ * redirect. Heuristic (no full public-suffix list): last two labels, or three for
16
+ * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
17
+ */
18
+ export declare function registrableDomain(host: string): string;
19
+ /** Significant category words for matching pages/links (drops generic filler). */
20
+ export declare function categoryKeywords(category: string): string[];
21
+ /**
22
+ * Conglomerate page-selection: when a vendor's homepage isn't about the category,
23
+ * follow its own nav to the category page. Scans same-registrable-domain internal
24
+ * links and returns the one whose anchor text / path best matches the category
25
+ * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
26
+ * Returns null if no link is a clear match. Pure — operates on the given HTML.
27
+ */
28
+ export declare function pickCategoryPage(html: string, baseUrl: string, keywords: string[]): string | null;
29
+ /**
30
+ * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
31
+ * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
32
+ */
33
+ export declare function extractLogoUrl(html: string, baseUrl: string): string | null;
34
+ /**
35
+ * Follow redirects (SSRF-guarded, re-validated each hop) and return the final URL
36
+ * + status WITHOUT downloading the body. Used to detect identity drift.
37
+ */
38
+ export declare const resolveFinalUrl: ResolveUrl;
39
+ /**
40
+ * Detect vendor identity drift: does this URL (or its www/apex sibling) redirect
41
+ * to a DIFFERENT registrable domain? Returns the drifted-to host, else null.
42
+ * Catches acquired/defunct products (e.g. www.spiff.com → salesforce.com) even
43
+ * when the apex itself errors. Only a 2xx/3xx landing on another domain counts —
44
+ * a status code or a throw is NOT drift (real sites block bare requests).
45
+ */
46
+ export declare function detectDrift(url: string, srcHost: string, resolve?: ResolveUrl): Promise<string | null>;
47
+ /**
48
+ * Fallback for JS-nav conglomerates whose product links aren't in the rendered
49
+ * homepage: find the category page from the vendor's sitemap. Bounded (≤6 sitemap
50
+ * fetches, ≤20k URLs), same-registrable-domain, plain XML only; media sitemaps
51
+ * skipped, non-English locales de-prioritized, /products/ preferred; requires the
52
+ * path to hit the keywords so locale/blog false-positives are rejected.
53
+ */
54
+ export declare function findCategoryPageInSitemap(rootUrl: string, keywords: string[], fetchText?: FetchText): Promise<string | null>;
55
+ /**
56
+ * Find a vendor's category-specific page: scan its (already-fetched) homepage nav,
57
+ * then fall back to the sitemap. Returns the page URL, or null to keep the homepage.
58
+ */
59
+ export declare function findCategoryPage(homepageHtml: string, homepageUrl: string, category: string, fetchText?: FetchText): Promise<string | null>;
60
+ /**
61
+ * A vendor logo as a self-contained `data:` URI — the form `MarketVendor.logo`
62
+ * renders and the report serves under a strict `img-src data:` CSP. Prefers the
63
+ * page-declared logo (from the given homepage HTML), then a favicon service.
64
+ * Bounded to small raster/SVG (≤50KB).
65
+ */
66
+ export declare function fetchLogoDataUri(homepageUrl: string, html?: string, fetchBytes?: FetchBytes): Promise<string | null>;
@@ -0,0 +1,405 @@
1
+ /**
2
+ * Market sourcing — find the *right* page to capture for each vendor, detect
3
+ * acquired/redirected vendors, and extract brand logos. These raise the quality
4
+ * of a cold-start map and are useful to every consumer (CLI, MCP, or a hosted
5
+ * service), with zero coupling to any transport.
6
+ *
7
+ * Design:
8
+ * - **Pure functions** (`pickCategoryPage`, `extractLogoUrl`, `categoryKeywords`,
9
+ * `registrableDomain`) operate on already-fetched HTML / strings. The caller
10
+ * fetches the page with whatever it likes — the hosted service injects a
11
+ * browser-rendering fetch for JS-walled homepages; the CLI uses the default.
12
+ * - **Fetching helpers** (`resolveFinalUrl`, `detectDrift`,
13
+ * `findCategoryPageInSitemap`, `fetchLogoDataUri`) default to the package's
14
+ * SSRF-guarded `assertPublicUrl` + global `fetch`, but each accepts an
15
+ * injectable fetcher so they stay testable offline and transport-agnostic.
16
+ * Sitemaps / redirects / logo bytes are plain resources — they never need a
17
+ * headless browser, so the default fetch is sufficient even in the hosted layer.
18
+ */
19
+ import { assertPublicUrl } from "./market.js";
20
+ const USER_AGENT = "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)";
21
+ const FETCH_TIMEOUT_MS = 15_000;
22
+ const MAX_REDIRECTS = 5;
23
/**
 * Hostname of `url` with one leading "www." removed, or null when the string
 * cannot be parsed as an absolute URL.
 */
function hostOf(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return null;
    }
    const { hostname } = parsed;
    return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
}
31
+ // ── Pure helpers ────────────────────────────────────────────────────────────
32
// A few common multi-label public suffixes so "bbc.co.uk" → "bbc.co.uk", not "co.uk".
const MULTI_LABEL_TLDS = new Set([
    "co.uk", "com.au", "co.nz", "co.jp", "com.br", "co.in", "com.sg", "co.za", "com.mx", "com.cn",
]);
/**
 * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
 * redirect. Heuristic (no full public-suffix list): last two labels, or three for
 * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
 *
 * The host is lower-cased and one trailing dot is dropped. A dotted-quad IPv4
 * literal is returned whole: it has no registrable domain, and truncating it to
 * its last two octets would make unrelated addresses compare equal.
 *
 * @param host Bare hostname (no scheme, port or path).
 * @returns The normalized registrable domain, or the normalized host itself when
 *   it has two labels or fewer or is an IPv4 literal.
 */
export function registrableDomain(host) {
    const h = String(host || "").toLowerCase().replace(/\.$/, "");
    // IPv4 literal: keep all four octets so "10.0.0.1" never equals "20.0.0.1".
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(h))
        return h;
    const labels = h.split(".");
    if (labels.length <= 2)
        return h;
    const lastTwo = labels.slice(-2).join(".");
    return MULTI_LABEL_TLDS.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
49
const CATEGORY_STOPWORDS = new Set([
    "software", "platform", "platforms", "tool", "tools", "system", "systems", "management",
    "solution", "solutions", "app", "apps", "service", "services", "suite", "the", "for", "and", "of",
]);
/**
 * Reduce a category label to its distinctive words: lower-cased, split on any
 * run of characters outside a-z / 0-9, keeping words of three or more characters
 * that are not generic filler. Duplicates are removed; first-seen order is kept.
 */
export function categoryKeywords(category) {
    const kept = new Set();
    const words = String(category || "").toLowerCase().split(/[^a-z0-9]+/);
    for (const word of words) {
        if (word.length < 3 || CATEGORY_STOPWORDS.has(word))
            continue;
        kept.add(word);
    }
    return Array.from(kept);
}
62
/**
 * Conglomerate page-selection: when a vendor's homepage isn't about the category,
 * follow its own nav to the category page. Scans same-registrable-domain internal
 * links and returns the one whose anchor text / path best matches the category
 * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
 * Returns null if no link is a clear match. Pure — operates on the given HTML.
 *
 * Links are de-duplicated on (origin + path, anchor text), not on the page alone:
 * an image-only or empty-text link to a page must not hide a later, properly
 * labelled link to the same page. On equal scores the earliest link wins.
 *
 * @param html Already-fetched homepage HTML.
 * @param baseUrl URL the HTML was fetched from; relative hrefs resolve against it.
 * @param keywords Category keywords; matched case-insensitively, empty strings ignored.
 * @returns Absolute URL of the best-matching internal link, or null.
 */
export function pickCategoryPage(html, baseUrl, keywords) {
    if (!html || !keywords.length)
        return null;
    // Anchor text and path are lower-cased below, so lower-case the keywords too.
    // Drop empty keywords: "" is a substring of everything and would score every link.
    const kws = keywords.map((kw) => String(kw).toLowerCase()).filter((kw) => kw.length > 0);
    if (!kws.length)
        return null;
    let baseHost;
    try {
        baseHost = registrableDomain(new URL(baseUrl).hostname);
    }
    catch {
        return null;
    }
    const re = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    const seen = new Set();
    let best = null;
    let bestScore = 0;
    for (const m of html.matchAll(re)) {
        let u;
        try {
            u = new URL(m[1], baseUrl);
        }
        catch {
            continue;
        }
        if (u.protocol !== "http:" && u.protocol !== "https:")
            continue;
        if (registrableDomain(u.hostname) !== baseHost)
            continue; // internal links only
        // Strip nested tags and collapse whitespace to get the visible anchor text.
        const text = m[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
        const key = `${u.origin}${u.pathname}\n${text}`;
        if (seen.has(key))
            continue;
        seen.add(key);
        const path = u.pathname.toLowerCase();
        let score = 0;
        for (const kw of kws) {
            if (text.includes(kw))
                score += 2;
            if (path.includes(kw))
                score += 1;
        }
        if (score > bestScore) {
            bestScore = score;
            best = u.toString();
        }
    }
    return bestScore >= 2 ? best : null;
}
116
/**
 * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
 * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
 *
 * The attribute value is read from `href` (falling back to `content`, which is
 * what `<meta>` uses). `&amp;` in the value is decoded to `&` before resolving,
 * because HTML attribute values escape ampersands and an undecoded value would
 * corrupt any query string in the logo URL.
 *
 * @param html Already-fetched homepage HTML.
 * @param baseUrl URL the HTML was fetched from; relative values resolve against it.
 * @returns The first candidate that resolves to a URL, or null when none does.
 */
export function extractLogoUrl(html, baseUrl) {
    if (!html)
        return null;
    const hrefOf = (tag) => {
        const mm = tag.match(/href=["']([^"']+)["']/i) || tag.match(/content=["']([^"']+)["']/i);
        if (!mm)
            return null;
        try {
            return new URL(mm[1].replace(/&amp;/g, "&"), baseUrl).toString();
        }
        catch {
            return null;
        }
    };
    // Tried in priority order; the first tag that yields a resolvable URL wins.
    const tagPatterns = [
        /<link[^>]+rel=["'][^"']*apple-touch-icon[^"']*["'][^>]*>/i,
        /<meta[^>]+property=["']og:image["'][^>]*>/i,
        /<link[^>]+rel=["'](?:shortcut icon|icon)["'][^>]*>/i,
    ];
    for (const pattern of tagPatterns) {
        const tag = html.match(pattern);
        if (!tag)
            continue;
        const u = hrefOf(tag[0]);
        if (u)
            return u;
    }
    return null;
}
155
+ // ── Default SSRF-guarded fetchers ────────────────────────────────────────────
156
/**
 * Fetch `url` while following redirects by hand, so that every hop is checked
 * with `assertPublicUrl` before it is requested. Makes at most MAX_REDIRECTS + 1
 * requests; returns the first response that is not a 3xx carrying a Location
 * header, or null when the request budget is used up while still redirecting.
 */
async function guardedFetch(url) {
    let target = url;
    let requestsLeft = MAX_REDIRECTS + 1;
    while (requestsLeft > 0) {
        requestsLeft -= 1;
        await assertPublicUrl(target);
        const response = await fetch(target, {
            redirect: "manual",
            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
            headers: { "User-Agent": USER_AGENT },
        });
        const next = response.headers.get("location");
        if (!(response.status >= 300 && response.status < 400 && next)) {
            return response;
        }
        // The redirect body is not needed; release it before the next hop.
        try {
            await response.body?.cancel();
        }
        catch {
            /* ignore */
        }
        target = new URL(next, target).toString();
    }
    return null;
}
181
/** Default text fetcher: body text of a successful guarded fetch, otherwise null. */
const defaultFetchText = async (url) => {
    try {
        const response = await guardedFetch(url);
        if (!response || !response.ok) {
            return null;
        }
        return await response.text();
    }
    catch {
        return null;
    }
};
190
/**
 * Default byte fetcher: the body of a successful guarded fetch plus its media
 * type (Content-Type up to the first ";", trimmed and lower-cased). Null on a
 * non-ok response or any thrown error.
 */
const defaultFetchBytes = async (url) => {
    try {
        const response = await guardedFetch(url);
        if (!response || !response.ok) {
            return null;
        }
        const header = response.headers.get("content-type") || "";
        const semi = header.indexOf(";");
        const contentType = (semi === -1 ? header : header.slice(0, semi)).trim().toLowerCase();
        const buffer = await response.arrayBuffer();
        return { contentType, bytes: new Uint8Array(buffer) };
    }
    catch {
        return null;
    }
};
203
/**
 * Walk a redirect chain by hand (each hop checked with `assertPublicUrl`) and
 * report where it ends and with which status. Every response body is cancelled
 * rather than read. When the chain is still redirecting after MAX_REDIRECTS + 1
 * requests, the last Location target is returned with status 0.
 */
export const resolveFinalUrl = async (rawUrl) => {
    let target = rawUrl;
    let requestsLeft = MAX_REDIRECTS + 1;
    while (requestsLeft > 0) {
        requestsLeft -= 1;
        await assertPublicUrl(target);
        const response = await fetch(target, {
            redirect: "manual",
            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
            headers: { "User-Agent": USER_AGENT },
        });
        // Only headers and status matter here; drop the body straight away.
        try {
            await response.body?.cancel();
        }
        catch {
            /* ignore */
        }
        const next = response.headers.get("location");
        if (!(response.status >= 300 && response.status < 400 && next)) {
            return { finalUrl: target, status: response.status };
        }
        target = new URL(next, target).toString();
    }
    return { finalUrl: target, status: 0 };
};
231
+ // ── Fetching helpers ─────────────────────────────────────────────────────────
232
/**
 * Identity-drift check: resolve `url` and its www/apex sibling, and report the
 * first landing host whose registrable domain differs from that of `srcHost`.
 * A resolution only counts when its status is not below 200 and not 400 or
 * above; error statuses and thrown errors are skipped, never treated as drift.
 *
 * @returns The drifted-to host (leading "www." removed), or null when no drift
 *   is observed or `srcHost` is empty.
 */
export async function detectDrift(url, srcHost, resolve = resolveFinalUrl) {
    if (!srcHost) {
        return null;
    }
    const attempts = [url];
    try {
        const { hostname } = new URL(url);
        // Toggle the "www." prefix to get the sibling form of the same site.
        const sibling = hostname.startsWith("www.")
            ? url.replace("://www.", "://")
            : url.replace(`://${hostname}`, `://www.${hostname}`);
        if (sibling !== url) {
            attempts.push(sibling);
        }
    }
    catch {
        /* ignore */
    }
    for (const attempt of attempts) {
        try {
            const { finalUrl, status } = await resolve(attempt);
            const rejected = status < 200 || status >= 400;
            if (!rejected) {
                const landedOn = hostOf(finalUrl);
                if (landedOn && registrableDomain(landedOn) !== registrableDomain(srcHost)) {
                    return landedOn;
                }
            }
        }
        catch {
            /* a throw isn't a drift signal */
        }
    }
    return null;
}
269
/**
 * Fallback for JS-nav conglomerates whose product links aren't in the rendered
 * homepage: find the category page from the vendor's sitemap. Bounded (≤6 sitemaps
 * read, ≤12 fetch attempts including failures, ≤20k URLs), same-registrable-domain,
 * plain XML only; media sitemaps skipped, non-English locales de-prioritized,
 * /products/ preferred; requires the path to hit the keywords so locale/blog
 * false-positives are rejected.
 *
 * Failed fetches count toward the attempt budget. Without that, a robots.txt
 * listing many unreachable sitemaps would cause one fetch (and possibly one
 * timeout) per entry, since only successful fetches counted toward the limit.
 *
 * @param rootUrl Any URL on the vendor's site; only its origin is used.
 * @param keywords Category keywords matched against URL paths.
 * @param fetchText Injectable text fetcher; returns null on failure.
 * @returns The best-scoring page URL, or null when no path clears the threshold.
 */
export async function findCategoryPageInSitemap(rootUrl, keywords, fetchText = defaultFetchText) {
    const MAX_SITEMAPS = 6; // sitemaps successfully read
    const MAX_ATTEMPTS = 12; // fetch attempts, successful or not
    const MAX_URLS = 20_000; // <loc> entries scored
    let root;
    try {
        root = new URL(rootUrl);
    }
    catch {
        return null;
    }
    if (!keywords.length)
        return null;
    const rootDom = registrableDomain(root.hostname);
    const sameDomain = (u) => registrableDomain(hostOf(u) || "") === rootDom;
    const pathScore = (u) => {
        let path;
        try {
            path = new URL(u).pathname.toLowerCase();
        }
        catch {
            return 0;
        }
        let s = 0;
        for (const kw of keywords)
            if (path.includes(kw))
                s += 1;
        // Bonuses and penalties apply only to paths that already hit a keyword.
        if (s > 0) {
            if (/\/(product|solution)s?\//.test(path))
                s += 0.5; // prefer product pages
            const loc = path.match(/^\/([a-z]{2})(?:-[a-z]{2})?\//); // de-prioritize non-English locales
            if (loc && !["en", "us"].includes(loc[1]))
                s -= 0.6;
        }
        return s;
    };
    // Conventional locations first, then whatever robots.txt advertises.
    const candidates = new Set([`${root.origin}/sitemap.xml`, `${root.origin}/sitemap_index.xml`]);
    const robots = await fetchText(`${root.origin}/robots.txt`);
    if (robots) {
        for (const mm of robots.slice(0, 100_000).matchAll(/^\s*sitemap:\s*(\S+)/gim)) {
            try {
                candidates.add(new URL(mm[1].trim(), root.origin).toString());
            }
            catch {
                /* skip */
            }
        }
    }
    const queue = [...candidates];
    const seen = new Set();
    let best = null;
    let bestScore = 0;
    let fetched = 0;
    let attempts = 0;
    let scanned = 0;
    while (queue.length && fetched < MAX_SITEMAPS && attempts < MAX_ATTEMPTS && scanned < MAX_URLS) {
        const sm = queue.shift();
        if (seen.has(sm) || !sameDomain(sm))
            continue;
        seen.add(sm);
        if (sm.endsWith(".gz"))
            continue; // skip compressed sitemaps
        attempts++;
        const xml = await fetchText(sm);
        if (!xml)
            continue;
        fetched++;
        const body = xml.slice(0, 5_000_000);
        const locs = [...body.matchAll(/<loc>\s*([^<\s]+?)\s*<\/loc>/gi)].map((mm) => mm[1].replace(/&amp;/g, "&"));
        if (/<sitemapindex/i.test(body)) {
            // Sitemap index: queue only the five most promising child sitemaps.
            const ranked = locs
                .filter(sameDomain)
                .filter((l) => !/(pdf|video|image|img|news|siteimprove)/i.test(l))
                .map((l) => [pathScore(l) + (/product|solution/i.test(l) ? 1 : 0), l])
                .sort((a, b) => b[0] - a[0]);
            for (const [, l] of ranked.slice(0, 5))
                queue.push(l);
        }
        else {
            for (const l of locs) {
                if (scanned >= MAX_URLS)
                    break; // honor the URL bound inside a single large sitemap too
                scanned++;
                if (!sameDomain(l))
                    continue;
                const s = pathScore(l);
                if (s > bestScore) {
                    bestScore = s;
                    best = l;
                }
            }
        }
    }
    return bestScore >= Math.min(2, keywords.length) ? best : null;
}
364
/**
 * Locate the vendor page specific to `category`: first try links found in the
 * already-fetched homepage HTML, then fall back to the sitemap search. Resolves
 * to the page URL, or null when the category yields no keywords or nothing
 * matches (the caller keeps the homepage).
 */
export async function findCategoryPage(homepageHtml, homepageUrl, category, fetchText = defaultFetchText) {
    const keywords = categoryKeywords(category);
    if (keywords.length === 0) {
        return null;
    }
    const fromNav = pickCategoryPage(homepageHtml, homepageUrl, keywords);
    return fromNav ? fromNav : findCategoryPageInSitemap(homepageUrl, keywords, fetchText);
}
377
/**
 * Build a self-contained `data:` URI for a vendor logo. The logo declared in the
 * supplied homepage HTML (if any) is tried first, then Google's favicon service
 * for the host. A candidate is accepted only when its content type starts with
 * "image/" and it is between 1 and 50,000 bytes. Resolves to null when the URL
 * has no parseable host or no candidate qualifies.
 */
export async function fetchLogoDataUri(homepageUrl, html, fetchBytes = defaultFetchBytes) {
    const host = hostOf(homepageUrl);
    if (!host) {
        return null;
    }
    const declared = html ? extractLogoUrl(html, homepageUrl) : null;
    const favicon = `https://www.google.com/s2/favicons?domain=${host}&sz=64`;
    const sources = declared ? [declared, favicon] : [favicon];
    for (const source of sources) {
        const image = await fetchBytes(source);
        if (!image) {
            continue;
        }
        if (!image.contentType.startsWith("image/")) {
            continue;
        }
        const size = image.bytes.length;
        if (size === 0 || size > 50_000) {
            continue;
        }
        const encoded = Buffer.from(image.bytes).toString("base64");
        return `data:${image.contentType};base64,${encoded}`;
    }
    return null;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "fullstackgtm",
3
- "version": "0.32.0",
3
+ "version": "0.33.0",
4
4
  "description": "Open-source agentic GTM ops framework: canonical GTM data model, pluggable deterministic audits, reviewable dry-run patch plans, approval-gated write-back with conflict detection, and cross-system entity resolution. HubSpot, Salesforce, and Stripe connectors included.",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Full Stack GTM LLC <ryan@fullstackgtm.com> (https://fullstackgtm.com)",
package/src/index.ts CHANGED
@@ -291,6 +291,20 @@ export {
291
291
  type VendorScale,
292
292
  } from "./marketScale.ts";
293
293
  export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.ts";
294
+ export {
295
+ registrableDomain,
296
+ categoryKeywords,
297
+ pickCategoryPage,
298
+ extractLogoUrl,
299
+ resolveFinalUrl,
300
+ detectDrift,
301
+ findCategoryPageInSitemap,
302
+ findCategoryPage,
303
+ fetchLogoDataUri,
304
+ type FetchText,
305
+ type FetchBytes,
306
+ type ResolveUrl,
307
+ } from "./marketSourcing.ts";
294
308
  export {
295
309
  computeMissedFirings,
296
310
  createFileScheduleRunStore,
@@ -0,0 +1,405 @@
1
+ /**
2
+ * Market sourcing — find the *right* page to capture for each vendor, detect
3
+ * acquired/redirected vendors, and extract brand logos. These raise the quality
4
+ * of a cold-start map and are useful to every consumer (CLI, MCP, or a hosted
5
+ * service), with zero coupling to any transport.
6
+ *
7
+ * Design:
8
+ * - **Pure functions** (`pickCategoryPage`, `extractLogoUrl`, `categoryKeywords`,
9
+ * `registrableDomain`) operate on already-fetched HTML / strings. The caller
10
+ * fetches the page with whatever it likes — the hosted service injects a
11
+ * browser-rendering fetch for JS-walled homepages; the CLI uses the default.
12
+ * - **Fetching helpers** (`resolveFinalUrl`, `detectDrift`,
13
+ * `findCategoryPageInSitemap`, `fetchLogoDataUri`) default to the package's
14
+ * SSRF-guarded `assertPublicUrl` + global `fetch`, but each accepts an
15
+ * injectable fetcher so they stay testable offline and transport-agnostic.
16
+ * Sitemaps / redirects / logo bytes are plain resources — they never need a
17
+ * headless browser, so the default fetch is sufficient even in the hosted layer.
18
+ */
19
+ import { assertPublicUrl } from "./market.ts";
20
+
21
+ const USER_AGENT = "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)";
22
+ const FETCH_TIMEOUT_MS = 15_000;
23
+ const MAX_REDIRECTS = 5;
24
+
25
+ /** Fetch a text resource (sitemap, robots.txt); null on any failure. */
26
+ export type FetchText = (url: string) => Promise<string | null>;
27
+ /** Fetch raw bytes (a logo image); null on any failure. */
28
+ export type FetchBytes = (url: string) => Promise<{ contentType: string; bytes: Uint8Array } | null>;
29
+ /** Resolve a URL's final destination after redirects. */
30
+ export type ResolveUrl = (url: string) => Promise<{ finalUrl: string; status: number }>;
31
+
32
/**
 * Hostname of `url` with one leading "www." removed, or null when the string
 * cannot be parsed as an absolute URL.
 */
function hostOf(url: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  const { hostname } = parsed;
  return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
}
39
+
40
+ // ── Pure helpers ────────────────────────────────────────────────────────────
41
+
42
// A few common multi-label public suffixes so "bbc.co.uk" → "bbc.co.uk", not "co.uk".
const MULTI_LABEL_TLDS = new Set([
  "co.uk", "com.au", "co.nz", "co.jp", "com.br", "co.in", "com.sg", "co.za", "com.mx", "com.cn",
]);

/**
 * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
 * redirect. Heuristic (no full public-suffix list): last two labels, or three for
 * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
 *
 * The host is lower-cased and one trailing dot is dropped. A dotted-quad IPv4
 * literal is returned whole: it has no registrable domain, and truncating it to
 * its last two octets would make unrelated addresses compare equal.
 *
 * @param host Bare hostname (no scheme, port or path).
 * @returns The normalized registrable domain, or the normalized host itself when
 *   it has two labels or fewer or is an IPv4 literal.
 */
export function registrableDomain(host: string): string {
  const h = String(host || "").toLowerCase().replace(/\.$/, "");
  // IPv4 literal: keep all four octets so "10.0.0.1" never equals "20.0.0.1".
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(h)) return h;
  const labels = h.split(".");
  if (labels.length <= 2) return h;
  const lastTwo = labels.slice(-2).join(".");
  return MULTI_LABEL_TLDS.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
59
+
60
const CATEGORY_STOPWORDS = new Set([
  "software", "platform", "platforms", "tool", "tools", "system", "systems", "management",
  "solution", "solutions", "app", "apps", "service", "services", "suite", "the", "for", "and", "of",
]);

/**
 * Reduce a category label to its distinctive words: lower-cased, split on any
 * run of characters outside a-z / 0-9, keeping words of three or more characters
 * that are not generic filler. Duplicates are removed; first-seen order is kept.
 */
export function categoryKeywords(category: string): string[] {
  const kept = new Set<string>();
  const words = String(category || "").toLowerCase().split(/[^a-z0-9]+/);
  for (const word of words) {
    if (word.length < 3 || CATEGORY_STOPWORDS.has(word)) continue;
    kept.add(word);
  }
  return Array.from(kept);
}
76
+
77
/**
 * Conglomerate page-selection: when a vendor's homepage isn't about the category,
 * follow its own nav to the category page. Scans same-registrable-domain internal
 * links and returns the one whose anchor text / path best matches the category
 * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
 * Returns null if no link is a clear match. Pure — operates on the given HTML.
 *
 * Links are de-duplicated on (origin + path, anchor text), not on the page alone:
 * an image-only or empty-text link to a page must not hide a later, properly
 * labelled link to the same page. On equal scores the earliest link wins.
 *
 * @param html Already-fetched homepage HTML.
 * @param baseUrl URL the HTML was fetched from; relative hrefs resolve against it.
 * @param keywords Category keywords; matched case-insensitively, empty strings ignored.
 * @returns Absolute URL of the best-matching internal link, or null.
 */
export function pickCategoryPage(html: string, baseUrl: string, keywords: string[]): string | null {
  if (!html || !keywords.length) return null;
  // Anchor text and path are lower-cased below, so lower-case the keywords too.
  // Drop empty keywords: "" is a substring of everything and would score every link.
  const kws = keywords.map((kw) => String(kw).toLowerCase()).filter((kw) => kw.length > 0);
  if (!kws.length) return null;
  let baseHost: string;
  try {
    baseHost = registrableDomain(new URL(baseUrl).hostname);
  } catch {
    return null;
  }
  const re = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  for (const m of html.matchAll(re)) {
    let u: URL;
    try {
      u = new URL(m[1], baseUrl);
    } catch {
      continue;
    }
    if (u.protocol !== "http:" && u.protocol !== "https:") continue;
    if (registrableDomain(u.hostname) !== baseHost) continue; // internal links only
    // Strip nested tags and collapse whitespace to get the visible anchor text.
    const text = m[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
    const key = `${u.origin}${u.pathname}\n${text}`;
    if (seen.has(key)) continue;
    seen.add(key);
    const path = u.pathname.toLowerCase();
    let score = 0;
    for (const kw of kws) {
      if (text.includes(kw)) score += 2;
      if (path.includes(kw)) score += 1;
    }
    if (score > bestScore) {
      bestScore = score;
      best = u.toString();
    }
  }
  return bestScore >= 2 ? best : null;
}
123
+
124
/**
 * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
 * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
 *
 * The attribute value is read from `href` (falling back to `content`, which is
 * what `<meta>` uses). `&amp;` in the value is decoded to `&` before resolving,
 * because HTML attribute values escape ampersands and an undecoded value would
 * corrupt any query string in the logo URL.
 *
 * @param html Already-fetched homepage HTML.
 * @param baseUrl URL the HTML was fetched from; relative values resolve against it.
 * @returns The first candidate that resolves to a URL, or null when none does.
 */
export function extractLogoUrl(html: string, baseUrl: string): string | null {
  if (!html) return null;
  const hrefOf = (tag: string): string | null => {
    const mm = tag.match(/href=["']([^"']+)["']/i) || tag.match(/content=["']([^"']+)["']/i);
    if (!mm) return null;
    try {
      return new URL(mm[1].replace(/&amp;/g, "&"), baseUrl).toString();
    } catch {
      return null;
    }
  };
  // Tried in priority order; the first tag that yields a resolvable URL wins.
  const tagPatterns = [
    /<link[^>]+rel=["'][^"']*apple-touch-icon[^"']*["'][^>]*>/i,
    /<meta[^>]+property=["']og:image["'][^>]*>/i,
    /<link[^>]+rel=["'](?:shortcut icon|icon)["'][^>]*>/i,
  ];
  for (const pattern of tagPatterns) {
    const tag = html.match(pattern);
    if (!tag) continue;
    const u = hrefOf(tag[0]);
    if (u) return u;
  }
  return null;
}
158
+
159
+ // ── Default SSRF-guarded fetchers ────────────────────────────────────────────
160
+
161
/**
 * Fetch `url` while following redirects by hand, so that every hop is checked
 * with `assertPublicUrl` before it is requested. Makes at most MAX_REDIRECTS + 1
 * requests; returns the first response that is not a 3xx carrying a Location
 * header, or null when the request budget is used up while still redirecting.
 */
async function guardedFetch(url: string): Promise<Response | null> {
  let target = url;
  let requestsLeft = MAX_REDIRECTS + 1;
  while (requestsLeft > 0) {
    requestsLeft -= 1;
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });
    const next = response.headers.get("location");
    if (!(response.status >= 300 && response.status < 400 && next)) {
      return response;
    }
    // The redirect body is not needed; release it before the next hop.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }
    target = new URL(next, target).toString();
  }
  return null;
}
185
+
186
/**
 * Default `FetchText`: request `url` through `guardedFetch` and return the
 * response body as text.
 *
 * Best-effort: returns `null` when `guardedFetch` yields no response, when the
 * response status is not OK, or when anything along the way throws.
 */
const defaultFetchText: FetchText = async (url) => {
  try {
    const res = await guardedFetch(url);
    if (!res) return null;
    if (!res.ok) {
      // Discard the unread error body rather than leaving it pending, the same
      // way guardedFetch discards redirect bodies.
      await res.body?.cancel();
      return null;
    }
    return await res.text();
  } catch {
    return null;
  }
};
194
+
195
/**
 * Default `FetchBytes`: request `url` through `guardedFetch` and return the raw
 * body together with its media type.
 *
 * The returned `contentType` is the `Content-Type` header with any parameters
 * (everything from the first `;`) removed, trimmed and lower-cased; it is the
 * empty string when the header is absent.
 *
 * Best-effort: returns `null` when `guardedFetch` yields no response, when the
 * response status is not OK, or when anything along the way throws.
 */
const defaultFetchBytes: FetchBytes = async (url) => {
  try {
    const res = await guardedFetch(url);
    if (!res) return null;
    if (!res.ok) {
      // Discard the unread error body rather than leaving it pending, the same
      // way guardedFetch discards redirect bodies.
      await res.body?.cancel();
      return null;
    }
    const contentType = (res.headers.get("content-type") || "").split(";")[0].trim().toLowerCase();
    const bytes = new Uint8Array(await res.arrayBuffer());
    return { contentType, bytes };
  } catch {
    return null;
  }
};
206
+
207
/**
 * Walk a redirect chain by hand and report where it ends, without reading any
 * response body. Every hop is checked with `assertPublicUrl` before it is
 * requested. Used to detect identity drift.
 *
 * @returns `{ finalUrl, status }` for the first response that is not a
 *          3xx-with-`Location`. If the chain is still redirecting after
 *          `MAX_REDIRECTS` redirects, returns the last computed target with
 *          `status: 0`. Guard, network and timeout errors propagate.
 */
export const resolveFinalUrl: ResolveUrl = async (rawUrl) => {
  let target = rawUrl;
  for (let remaining = MAX_REDIRECTS; remaining >= 0; remaining--) {
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });

    // Only the status line and headers matter here; drop the body straight away.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }

    const next = response.headers.get("location");
    if (!next || response.status < 300 || response.status >= 400) {
      return { finalUrl: target, status: response.status };
    }
    // `Location` may be relative — resolve it against the URL that produced it.
    target = new URL(next, target).toString();
  }
  // Redirect budget exhausted: status 0 marks "never reached a final response".
  return { finalUrl: target, status: 0 };
};
234
+
235
+ // ── Fetching helpers ─────────────────────────────────────────────────────────
236
+
237
/**
 * Detect vendor identity drift: does this URL (or its www/apex sibling) redirect
 * to a DIFFERENT registrable domain? Returns the drifted-to host, else null.
 * Catches acquired/defunct products (e.g. www.spiff.com → salesforce.com) even
 * when the apex itself errors.
 *
 * Only a resolution that ends with a 2xx/3xx status on another registrable
 * domain counts as drift. An error or unknown status (<200 or ≥400) and a throw
 * from `resolve` are NOT drift signals (real sites block bare requests); the
 * next candidate is tried instead.
 *
 * @param url     Vendor URL to probe. Its www/apex sibling is probed second.
 * @param srcHost Host the vendor is expected to live on; empty → `null`.
 * @param resolve Redirect resolver; injectable so callers can avoid the network.
 * @returns The host the vendor drifted to, or `null` when no drift was observed.
 */
export async function detectDrift(
  url: string,
  srcHost: string,
  resolve: ResolveUrl = resolveFinalUrl,
): Promise<string | null> {
  if (!srcHost) return null;

  const tries = [url];
  try {
    // URL.hostname is lower-cased by the parser, so locate the host in an
    // ASCII-lower-cased copy of the input (same length, so indices carry over)
    // and splice the ORIGINAL string. A case-sensitive search would silently
    // find nothing for inputs like "https://Example.com".
    const host = new URL(url).hostname;
    const hasWww = host.startsWith("www.");
    const lowered = url.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
    const at = lowered.indexOf(hasWww ? "://www." : `://${host}`);
    if (at !== -1) {
      const hostStart = at + "://".length;
      const sibling = hasWww
        ? url.slice(0, hostStart) + url.slice(hostStart + "www.".length) // www → apex
        : `${url.slice(0, hostStart)}www.${url.slice(hostStart)}`; // apex → www
      if (sibling !== url) tries.push(sibling);
    }
  } catch {
    /* unparsable URL: probe it as given, without a sibling */
  }

  for (const candidate of tries) {
    try {
      const { finalUrl, status } = await resolve(candidate);
      if (status < 200 || status >= 400) continue;
      const finalHost = hostOf(finalUrl);
      if (finalHost && registrableDomain(finalHost) !== registrableDomain(srcHost)) return finalHost;
    } catch {
      /* a throw isn't a drift signal */
    }
  }
  return null;
}
272
+
273
/**
 * Fallback for JS-nav conglomerates whose product links aren't in the rendered
 * homepage: find the category page from the vendor's sitemap.
 *
 * Sources: `/sitemap.xml`, `/sitemap_index.xml` and every `Sitemap:` line in
 * `/robots.txt` (first 100,000 characters). Only sitemaps and page URLs on the
 * root's registrable domain are considered; `.gz` sitemaps are skipped.
 *
 * Bounds: the walk stops once 6 sitemaps have been fetched successfully or
 * 20,000 page URLs have been scanned (both checked between sitemaps); each
 * sitemap body is truncated to 5,000,000 characters; a sitemap index
 * contributes at most its 5 best-ranked children.
 *
 * Scoring: +1 per keyword contained in the path; if any keyword hit, +0.5 for a
 * `/product(s)/` or `/solution(s)/` segment and −0.6 for a leading two-letter
 * locale segment other than `en`/`us`. The path must hit the keywords — the best
 * score has to reach `min(2, keywords.length)` — so locale/blog false positives
 * are rejected.
 *
 * @param rootUrl   Any URL on the vendor's site; only its origin is used.
 * @param keywords  Keywords matched as substrings of the lower-cased URL path.
 * @param fetchText Text fetcher returning `null` on failure; injectable.
 * @returns The best-scoring page URL, or `null` if the inputs are unusable or
 *          nothing scores high enough.
 */
export async function findCategoryPageInSitemap(
  rootUrl: string,
  keywords: string[],
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  let root: URL;
  try {
    root = new URL(rootUrl);
  } catch {
    return null;
  }
  if (!keywords.length) return null;
  const rootDom = registrableDomain(root.hostname);
  const sameDomain = (u: string) => registrableDomain(hostOf(u) || "") === rootDom;
  const pathScore = (u: string): number => {
    let path: string;
    try {
      path = new URL(u).pathname.toLowerCase();
    } catch {
      return 0;
    }
    let s = 0;
    for (const kw of keywords) if (path.includes(kw)) s += 1;
    if (s > 0) {
      if (/\/(product|solution)s?\//.test(path)) s += 0.5; // prefer product pages
      const loc = path.match(/^\/([a-z]{2})(?:-[a-z]{2})?\//); // de-prioritize non-English locales
      if (loc && !["en", "us"].includes(loc[1])) s -= 0.6;
    }
    return s;
  };
  // The portion of a sitemap URL that describes the sitemap rather than the
  // vendor: subdomain labels + path + query, with the vendor's own registrable
  // domain removed. Falls back to the full host (or the raw string) when the
  // domain cannot be stripped.
  const sansVendorDomain = (u: string): string => {
    try {
      const parsed = new URL(u);
      const host = parsed.hostname.toLowerCase();
      const sub =
        typeof rootDom === "string" && rootDom !== "" && host.endsWith(rootDom)
          ? host.slice(0, host.length - rootDom.length)
          : host;
      return `${sub}${parsed.pathname}${parsed.search}`;
    } catch {
      return u;
    }
  };

  const candidates = new Set([`${root.origin}/sitemap.xml`, `${root.origin}/sitemap_index.xml`]);
  const robots = await fetchText(`${root.origin}/robots.txt`);
  if (robots) {
    for (const mm of robots.slice(0, 100_000).matchAll(/^\s*sitemap:\s*(\S+)/gim)) {
      try {
        candidates.add(new URL(mm[1].trim(), root.origin).toString());
      } catch {
        /* skip */
      }
    }
  }

  const queue = [...candidates];
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  let fetched = 0;
  let scanned = 0;
  while (queue.length && fetched < 6 && scanned < 20_000) {
    const sm = queue.shift() as string;
    if (seen.has(sm) || !sameDomain(sm)) continue;
    seen.add(sm);
    if (sm.endsWith(".gz")) continue; // skip compressed sitemaps
    const xml = await fetchText(sm);
    if (!xml) continue;
    fetched++; // only successful fetches count against the budget
    const body = xml.slice(0, 5_000_000);
    // <loc> values may be plain or wrapped in <![CDATA[ … ]]>; accept both and
    // undo the XML escaping of "&".
    const locs = [...body.matchAll(/<loc>\s*(?:<!\[CDATA\[\s*)?([^<\s]+?)\s*(?:\]\]>\s*)?<\/loc>/gi)].map((mm) =>
      mm[1].replace(/&amp;/g, "&"),
    );
    if (/<sitemapindex/i.test(body)) {
      const ranked = locs
        .filter(sameDomain)
        // Skip media/news sitemaps. Tested with the vendor's own domain removed,
        // so a vendor whose name contains one of these words (imagekit.io,
        // newsela.com, …) does not lose every child sitemap.
        .filter((l) => !/(pdf|video|image|img|news|siteimprove)/i.test(sansVendorDomain(l)))
        .map((l): [number, string] => [pathScore(l) + (/product|solution/i.test(l) ? 1 : 0), l])
        .sort((a, b) => b[0] - a[0]);
      for (const [, l] of ranked.slice(0, 5)) queue.push(l);
    } else {
      for (const l of locs) {
        scanned++;
        if (!sameDomain(l)) continue;
        const s = pathScore(l);
        if (s > bestScore) {
          bestScore = s;
          best = l;
        }
      }
    }
  }
  return bestScore >= Math.min(2, keywords.length) ? best : null;
}
360
+
361
/**
 * Locate the page on a vendor's site that is specific to `category`.
 *
 * Strategy: derive keywords from the category, look for a matching link in the
 * already-fetched homepage HTML, and only if that finds nothing consult the
 * site's sitemap.
 *
 * @param homepageHtml Homepage markup that has already been fetched.
 * @param homepageUrl  URL of that homepage.
 * @param category     Category name the keywords are derived from.
 * @param fetchText    Text fetcher handed to the sitemap fallback.
 * @returns The category page URL, or `null` to keep using the homepage (also
 *          when the category yields no keywords).
 */
export async function findCategoryPage(
  homepageHtml: string,
  homepageUrl: string,
  category: string,
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  const keywords = categoryKeywords(category);
  if (keywords.length === 0) return null;

  const fromNav = pickCategoryPage(homepageHtml, homepageUrl, keywords);
  // Nav scan first; the sitemap is only fetched when the nav had no match.
  return fromNav || (await findCategoryPageInSitemap(homepageUrl, keywords, fetchText));
}
377
+
378
/**
 * A vendor logo as a self-contained `data:` URI — the form `MarketVendor.logo`
 * renders and the report serves under a strict `img-src data:` CSP. Prefers the
 * page-declared logo (from the given homepage HTML), then a favicon service.
 * Bounded to small raster/SVG (≤50KB).
 *
 * The media type reported by the fetcher comes from a remote server, so it is
 * normalised (parameters stripped, trimmed, lower-cased) and must look like a
 * plain `image/<subtype>` token before it is embedded in the URI. Candidates
 * with any other content type, an empty body, or more than 50,000 bytes are
 * skipped.
 *
 * @param homepageUrl Vendor homepage; its host is used for the favicon lookup.
 * @param html        Optional homepage markup to extract a declared logo from.
 * @param fetchBytes  Byte fetcher returning `null` on failure; injectable.
 * @returns A base64 `data:` URI for the first acceptable candidate, or `null`
 *          when the URL has no host or no candidate qualifies.
 */
export async function fetchLogoDataUri(
  homepageUrl: string,
  html?: string,
  fetchBytes: FetchBytes = defaultFetchBytes,
): Promise<string | null> {
  const host = hostOf(homepageUrl);
  if (!host) return null;

  const candidates: string[] = [];
  if (html) {
    const fromPage = extractLogoUrl(html, homepageUrl);
    if (fromPage) candidates.push(fromPage);
  }
  // Always-available fallback after the page-declared logo.
  candidates.push(`https://www.google.com/s2/favicons?domain=${host}&sz=64`);

  for (const url of candidates) {
    const got = await fetchBytes(url);
    if (!got) continue;
    // Keep only the bare media type; reject anything that is not a simple
    // image/<subtype> token so quotes, spaces or parameters from a remote
    // header can never end up inside the generated URI.
    const mime = (got.contentType.split(";")[0] ?? "").trim().toLowerCase();
    if (!/^image\/[a-z0-9][a-z0-9._+-]*$/.test(mime)) continue;
    if (got.bytes.length === 0 || got.bytes.length > 50_000) continue;
    return `data:${mime};base64,${Buffer.from(got.bytes).toString("base64")}`;
  }
  return null;
}