fullstackgtm 0.32.0 → 0.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/marketSourcing.d.ts +66 -0
- package/dist/marketSourcing.js +405 -0
- package/package.json +1 -1
- package/src/index.ts +14 -0
- package/src/marketSourcing.ts +405 -0
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,29 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
|
5
5
|
and the project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
The path to 1.0 is planned in [docs/roadmap-to-1.0.md](./docs/roadmap-to-1.0.md).
|
|
7
7
|
|
|
8
|
+
## [0.33.0] — 2026-06-18
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **Market sourcing helpers (`marketSourcing.ts`)** — find the right page to
|
|
13
|
+
capture per vendor, detect acquired/redirected vendors, and extract brand logos,
|
|
14
|
+
with zero coupling to any transport:
|
|
15
|
+
- `pickCategoryPage(html, baseUrl, keywords)` — follow a vendor's own nav to its
|
|
16
|
+
category page (so multi-product companies like SAP/Salesforce are captured on
|
|
17
|
+
the product page, not the corporate homepage). Pure.
|
|
18
|
+
- `findCategoryPageInSitemap(rootUrl, keywords, fetchText?)` — sitemap fallback
|
|
19
|
+
for JS-mega-menu sites whose product links aren't in the rendered homepage.
|
|
20
|
+
- `findCategoryPage(...)` — nav-scan then sitemap, combined.
|
|
21
|
+
- `detectDrift(url, srcHost, resolve?)` / `resolveFinalUrl(url)` — skip vendors
|
|
22
|
+
whose site redirects to a different company (an acquired/defunct product).
|
|
23
|
+
- `extractLogoUrl(html, baseUrl)` + `fetchLogoDataUri(homepageUrl, html?, …)` —
|
|
24
|
+
a vendor logo as a self-contained `data:` URI for `MarketVendor.logo`.
|
|
25
|
+
- `categoryKeywords()` + `registrableDomain()` utilities.
|
|
26
|
+
|
|
27
|
+
Pure functions operate on already-fetched HTML; fetching helpers default to the
|
|
28
|
+
package's SSRF-guarded `assertPublicUrl` + `fetch` but accept an injectable
|
|
29
|
+
fetcher (testable offline, browser-render-friendly). 8 tests.
|
|
30
|
+
|
|
8
31
|
## [0.32.0] — 2026-06-18
|
|
9
32
|
|
|
10
33
|
### Added
|
package/dist/index.d.ts
CHANGED
|
@@ -34,6 +34,7 @@ export { buildWorksheet, classifyMarket, type ClassifyMarketOptions, type Classi
|
|
|
34
34
|
export { computeDirectives, computeOverlayStats, directivesToPlan, overlayToMarkdown, type CallDocument, type ClaimMentionStats, type DirectiveStat, type DirectiveType, type MarketDirective, type OverlayOptions, type OverlayStats, type VendorMentionStats, } from "./marketOverlay.ts";
|
|
35
35
|
export { computeScaleIndex, dimensionForMetric, scaleReportToText, type ScaleDimension, type ScaleReport, type SignalEstimate, type VendorScale, } from "./marketScale.ts";
|
|
36
36
|
export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.ts";
|
|
37
|
+
export { registrableDomain, categoryKeywords, pickCategoryPage, extractLogoUrl, resolveFinalUrl, detectDrift, findCategoryPageInSitemap, findCategoryPage, fetchLogoDataUri, type FetchText, type FetchBytes, type ResolveUrl, } from "./marketSourcing.ts";
|
|
37
38
|
export { computeMissedFirings, createFileScheduleRunStore, createFileScheduleStore, cronMatches, crontabSentinels, expectedFirings, nextCronFiring, parseCron, renderManagedBlock, replaceManagedBlock, scheduleId, scheduleRunsDir, schedulesPath, systemCrontabIo, tokenizeCommand, validateSchedulableArgv, type CronExpression, type CrontabIo, type ScheduleEntry, type ScheduleProvider, type ScheduleRunRecord, type ScheduleRunStore, type ScheduleRunTrigger, type ScheduleStore, } from "./schedule.ts";
|
|
38
39
|
export { suggestValues, type SuggestionConfidence, type ValueSuggestion } from "./suggest.ts";
|
|
39
40
|
export type { ApprovalStatus, AuditFinding, AuditFindingSeverity, CanonicalAccount, CanonicalActivity, CanonicalContact, CanonicalDeal, CanonicalGtmSnapshot, CanonicalUser, CrmProvider, GtmAuditRule, GtmConnector, GtmEvidence, GtmEvidenceSourceSystem, GtmObjectType, GtmPolicy, GtmRuleContext, GtmRuleResult, GtmSnapshotIndex, PatchOperation, PatchOperationResult, PatchOperationType, PatchPlan, PatchPlanRun, PatchPlanRunStatus, PatchVerification, PipelineFinding, PipelineFindingStatus, PipelineFindingType, ProviderIdentity, RiskLevel, SourceFreshness, } from "./types.ts";
|
package/dist/index.js
CHANGED
|
@@ -34,5 +34,6 @@ export { buildWorksheet, classifyMarket, } from "./marketClassify.js";
|
|
|
34
34
|
export { computeDirectives, computeOverlayStats, directivesToPlan, overlayToMarkdown, } from "./marketOverlay.js";
|
|
35
35
|
export { computeScaleIndex, dimensionForMetric, scaleReportToText, } from "./marketScale.js";
|
|
36
36
|
export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.js";
|
|
37
|
+
export { registrableDomain, categoryKeywords, pickCategoryPage, extractLogoUrl, resolveFinalUrl, detectDrift, findCategoryPageInSitemap, findCategoryPage, fetchLogoDataUri, } from "./marketSourcing.js";
|
|
37
38
|
export { computeMissedFirings, createFileScheduleRunStore, createFileScheduleStore, cronMatches, crontabSentinels, expectedFirings, nextCronFiring, parseCron, renderManagedBlock, replaceManagedBlock, scheduleId, scheduleRunsDir, schedulesPath, systemCrontabIo, tokenizeCommand, validateSchedulableArgv, } from "./schedule.js";
|
|
38
39
|
export { suggestValues } from "./suggest.js";
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/** Fetch a text resource (sitemap, robots.txt); null on any failure. */
|
|
2
|
+
export type FetchText = (url: string) => Promise<string | null>;
|
|
3
|
+
/** Fetch raw bytes (a logo image); null on any failure. */
|
|
4
|
+
export type FetchBytes = (url: string) => Promise<{
|
|
5
|
+
contentType: string;
|
|
6
|
+
bytes: Uint8Array;
|
|
7
|
+
} | null>;
|
|
8
|
+
/** Resolve a URL's final destination after redirects. */
|
|
9
|
+
export type ResolveUrl = (url: string) => Promise<{
|
|
10
|
+
finalUrl: string;
|
|
11
|
+
status: number;
|
|
12
|
+
}>;
|
|
13
|
+
/**
|
|
14
|
+
* Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
|
|
15
|
+
* redirect. Heuristic (no full public-suffix list): last two labels, or three for
|
|
16
|
+
* a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
|
|
17
|
+
*/
|
|
18
|
+
export declare function registrableDomain(host: string): string;
|
|
19
|
+
/** Significant category words for matching pages/links (drops generic filler). */
|
|
20
|
+
export declare function categoryKeywords(category: string): string[];
|
|
21
|
+
/**
|
|
22
|
+
* Conglomerate page-selection: when a vendor's homepage isn't about the category,
|
|
23
|
+
* follow its own nav to the category page. Scans same-registrable-domain internal
|
|
24
|
+
* links and returns the one whose anchor text / path best matches the category
|
|
25
|
+
* keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
|
|
26
|
+
* Returns null if no link is a clear match. Pure — operates on the given HTML.
|
|
27
|
+
*/
|
|
28
|
+
export declare function pickCategoryPage(html: string, baseUrl: string, keywords: string[]): string | null;
|
|
29
|
+
/**
|
|
30
|
+
* Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
|
|
31
|
+
* rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
|
|
32
|
+
*/
|
|
33
|
+
export declare function extractLogoUrl(html: string, baseUrl: string): string | null;
|
|
34
|
+
/**
|
|
35
|
+
* Follow redirects (SSRF-guarded, re-validated each hop) and return the final URL
|
|
36
|
+
* + status WITHOUT downloading the body. Used to detect identity drift.
|
|
37
|
+
*/
|
|
38
|
+
export declare const resolveFinalUrl: ResolveUrl;
|
|
39
|
+
/**
|
|
40
|
+
* Detect vendor identity drift: does this URL (or its www/apex sibling) redirect
|
|
41
|
+
* to a DIFFERENT registrable domain? Returns the drifted-to host, else null.
|
|
42
|
+
* Catches acquired/defunct products (e.g. www.spiff.com → salesforce.com) even
|
|
43
|
+
* when the apex itself errors. Only a 2xx/3xx landing on another domain counts —
|
|
44
|
+
* a status code or a throw is NOT drift (real sites block bare requests).
|
|
45
|
+
*/
|
|
46
|
+
export declare function detectDrift(url: string, srcHost: string, resolve?: ResolveUrl): Promise<string | null>;
|
|
47
|
+
/**
|
|
48
|
+
* Fallback for JS-nav conglomerates whose product links aren't in the rendered
|
|
49
|
+
* homepage: find the category page from the vendor's sitemap. Bounded (≤6 sitemap
|
|
50
|
+
* fetches, ≤20k URLs), same-registrable-domain, plain XML only; media sitemaps
|
|
51
|
+
* skipped, non-English locales de-prioritized, /products/ preferred; requires the
|
|
52
|
+
* path to hit the keywords so locale/blog false-positives are rejected.
|
|
53
|
+
*/
|
|
54
|
+
export declare function findCategoryPageInSitemap(rootUrl: string, keywords: string[], fetchText?: FetchText): Promise<string | null>;
|
|
55
|
+
/**
|
|
56
|
+
* Find a vendor's category-specific page: scan its (already-fetched) homepage nav,
|
|
57
|
+
* then fall back to the sitemap. Returns the page URL, or null to keep the homepage.
|
|
58
|
+
*/
|
|
59
|
+
export declare function findCategoryPage(homepageHtml: string, homepageUrl: string, category: string, fetchText?: FetchText): Promise<string | null>;
|
|
60
|
+
/**
|
|
61
|
+
* A vendor logo as a self-contained `data:` URI — the form `MarketVendor.logo`
|
|
62
|
+
* renders and the report serves under a strict `img-src data:` CSP. Prefers the
|
|
63
|
+
* page-declared logo (from the given homepage HTML), then a favicon service.
|
|
64
|
+
* Bounded to small raster/SVG (≤50KB).
|
|
65
|
+
*/
|
|
66
|
+
export declare function fetchLogoDataUri(homepageUrl: string, html?: string, fetchBytes?: FetchBytes): Promise<string | null>;
|
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Market sourcing — find the *right* page to capture for each vendor, detect
|
|
3
|
+
* acquired/redirected vendors, and extract brand logos. These raise the quality
|
|
4
|
+
* of a cold-start map and are useful to every consumer (CLI, MCP, or a hosted
|
|
5
|
+
* service), with zero coupling to any transport.
|
|
6
|
+
*
|
|
7
|
+
* Design:
|
|
8
|
+
* - **Pure functions** (`pickCategoryPage`, `extractLogoUrl`, `categoryKeywords`,
|
|
9
|
+
* `registrableDomain`) operate on already-fetched HTML / strings. The caller
|
|
10
|
+
* fetches the page with whatever it likes — the hosted service injects a
|
|
11
|
+
* browser-rendering fetch for JS-walled homepages; the CLI uses the default.
|
|
12
|
+
* - **Fetching helpers** (`resolveFinalUrl`, `detectDrift`,
|
|
13
|
+
* `findCategoryPageInSitemap`, `fetchLogoDataUri`) default to the package's
|
|
14
|
+
* SSRF-guarded `assertPublicUrl` + global `fetch`, but each accepts an
|
|
15
|
+
* injectable fetcher so they stay testable offline and transport-agnostic.
|
|
16
|
+
* Sitemaps / redirects / logo bytes are plain resources — they never need a
|
|
17
|
+
* headless browser, so the default fetch is sufficient even in the hosted layer.
|
|
18
|
+
*/
|
|
19
|
+
import { assertPublicUrl } from "./market.js";
|
|
20
|
+
const USER_AGENT = "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)";
|
|
21
|
+
const FETCH_TIMEOUT_MS = 15_000;
|
|
22
|
+
const MAX_REDIRECTS = 5;
|
|
23
|
+
/**
 * Hostname of `url` with one leading "www." removed.
 * Returns null when `url` cannot be parsed as an absolute URL.
 */
function hostOf(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return null;
    }
    const { hostname } = parsed;
    return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
}
|
|
31
|
+
// ── Pure helpers ────────────────────────────────────────────────────────────
|
|
32
|
+
// A few common multi-label public suffixes so "bbc.co.uk" → "bbc.co.uk", not "co.uk".
const MULTI_LABEL_TLDS = new Set([
    "co.uk", "com.au", "co.nz", "co.jp", "com.br", "co.in", "com.sg", "co.za", "com.mx", "com.cn",
]);
// Dotted-quad IPv4 literal. An IP address has no registrable domain, so it is compared whole.
const IPV4_LITERAL = /^\d{1,3}(?:\.\d{1,3}){3}$/;
/**
 * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
 * redirect. Heuristic (no full public-suffix list): last two labels, or three for
 * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
 *
 * The input is lower-cased and one trailing dot is dropped. IPv4 literals are
 * returned unchanged: truncating "10.0.0.1" to "0.1" would make unrelated
 * addresses look like the same vendor.
 *
 * @param host Hostname (no scheme, no port). Falsy input is treated as "".
 * @returns The registrable domain, or the normalized host when it has at most
 *   two labels or is an IPv4 literal.
 */
export function registrableDomain(host) {
    const h = String(host || "").toLowerCase().replace(/\.$/, "");
    if (IPV4_LITERAL.test(h))
        return h;
    const labels = h.split(".");
    if (labels.length <= 2)
        return h;
    const lastTwo = labels.slice(-2).join(".");
    return MULTI_LABEL_TLDS.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
|
|
49
|
+
const CATEGORY_STOPWORDS = new Set([
    "software", "platform", "platforms", "tool", "tools", "system", "systems", "management",
    "solution", "solutions", "app", "apps", "service", "services", "suite", "the", "for", "and", "of",
]);
/**
 * Reduce a category label to the words worth matching against links and paths.
 * The label is lower-cased and split on anything that is not a-z or 0-9; words
 * shorter than three characters and generic filler (the stopword list above) are
 * dropped. Each remaining word appears once, in first-seen order.
 */
export function categoryKeywords(category) {
    const picked = new Set();
    for (const word of String(category || "").toLowerCase().split(/[^a-z0-9]+/)) {
        if (word.length < 3 || CATEGORY_STOPWORDS.has(word))
            continue;
        picked.add(word);
    }
    return Array.from(picked);
}
|
|
62
|
+
/**
 * Conglomerate page-selection: when a vendor's homepage isn't about the category,
 * follow its own nav to the category page. Scans same-registrable-domain internal
 * links and returns the one whose anchor text / path best matches the category
 * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
 * Returns null if no link is a clear match. Pure — operates on the given HTML.
 *
 * Every anchor is scored, so when the same URL is linked several times (for
 * example an image-only logo link followed by a labelled nav link) the best
 * label counts rather than the first one. Links that resolve back to the base
 * page itself (same origin + path, e.g. "#section" or "/") are ignored, because
 * following them would not move off the page already in hand.
 *
 * @param html Already-fetched HTML of the page at `baseUrl`.
 * @param baseUrl Absolute URL the HTML came from; relative hrefs resolve against it.
 * @param keywords Lower-case category keywords (see `categoryKeywords`).
 * @returns Absolute URL of the best-matching internal link, or null.
 */
export function pickCategoryPage(html, baseUrl, keywords) {
    if (!html || !keywords.length)
        return null;
    let base;
    try {
        base = new URL(baseUrl);
    }
    catch {
        return null;
    }
    const baseHost = registrableDomain(base.hostname);
    const basePage = base.origin + base.pathname;
    const anchorRe = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    let best = null;
    let bestScore = 0;
    for (const m of html.matchAll(anchorRe)) {
        let u;
        try {
            u = new URL(m[1], baseUrl);
        }
        catch {
            continue;
        }
        if (u.protocol !== "http:" && u.protocol !== "https:")
            continue;
        if (registrableDomain(u.hostname) !== baseHost)
            continue; // internal links only
        if (u.origin + u.pathname === basePage)
            continue; // a link back to the page we already have
        // Strip nested markup and collapse whitespace to get the visible anchor text.
        const text = m[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
        const path = u.pathname.toLowerCase();
        let score = 0;
        for (const kw of keywords) {
            if (text.includes(kw))
                score += 2;
            if (path.includes(kw))
                score += 1;
        }
        // Strictly greater: on a tie the earliest link in document order wins.
        if (score > bestScore) {
            bestScore = score;
            best = u.toString();
        }
    }
    return bestScore >= 2 ? best : null;
}
|
|
116
|
+
/**
 * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
 * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
 *
 * The URL is read from the tag's own `href` (or, failing that, `content`)
 * attribute; look-alike attributes such as `data-href` are not matched. Escaped
 * ampersands in the attribute value are decoded before the URL is resolved, so
 * query strings survive intact.
 *
 * @param html Already-fetched homepage HTML.
 * @param baseUrl Absolute URL the HTML came from; relative hrefs resolve against it.
 * @returns Absolute logo URL, or null when no candidate tag yields a valid URL.
 */
export function extractLogoUrl(html, baseUrl) {
    if (!html)
        return null;
    const abs = (raw) => {
        // HTML attribute values escape "&"; undo that before building the URL.
        const href = raw.replace(/&(?:amp|#0*38|#x0*26);/gi, "&");
        try {
            return new URL(href, baseUrl).toString();
        }
        catch {
            return null;
        }
    };
    const hrefOf = (tag) => {
        // Require whitespace before the attribute name so "data-href=" is not mistaken for "href=".
        const mm = tag.match(/\shref=["']([^"']+)["']/i) || tag.match(/\scontent=["']([^"']+)["']/i);
        return mm ? abs(mm[1]) : null;
    };
    // Candidate tags in priority order; the first one that yields a URL wins.
    const tagPatterns = [
        /<link[^>]+rel=["'][^"']*apple-touch-icon[^"']*["'][^>]*>/i,
        /<meta[^>]+property=["']og:image["'][^>]*>/i,
        /<link[^>]+rel=["'](?:shortcut icon|icon)["'][^>]*>/i,
    ];
    for (const pattern of tagPatterns) {
        const mm = html.match(pattern);
        const u = mm ? hrefOf(mm[0]) : null;
        if (u)
            return u;
    }
    return null;
}
|
|
155
|
+
// ── Default SSRF-guarded fetchers ────────────────────────────────────────────
|
|
156
|
+
/**
 * GET `url` with redirects handled by hand so that every hop is checked by the
 * SSRF guard (`assertPublicUrl`) before it is requested.
 *
 * Resolves to the first response that is not a followable redirect (a 3xx
 * without a Location header counts as final), or to null once more than
 * MAX_REDIRECTS redirects have been followed. Guard rejections, network errors
 * and timeouts propagate to the caller.
 */
async function guardedFetch(url) {
    let target = url;
    let requestsLeft = MAX_REDIRECTS + 1;
    while (requestsLeft-- > 0) {
        await assertPublicUrl(target);
        const response = await fetch(target, {
            redirect: "manual",
            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
            headers: { "User-Agent": USER_AGENT },
        });
        const next = response.headers.get("location");
        const isRedirect = response.status >= 300 && response.status < 400;
        if (!isRedirect || !next)
            return response;
        // Release the redirect response's body before moving to the next hop.
        try {
            await response.body?.cancel();
        }
        catch {
            /* ignore */
        }
        target = new URL(next, target).toString();
    }
    return null;
}
|
|
181
|
+
/**
 * Default `FetchText`: SSRF-guarded GET that resolves to the response text, or
 * to null on a non-2xx status, too many redirects, or any thrown error.
 */
const defaultFetchText = async (url) => {
    let response;
    try {
        response = await guardedFetch(url);
    }
    catch {
        return null;
    }
    if (!response || !response.ok)
        return null;
    try {
        return await response.text();
    }
    catch {
        return null;
    }
};
|
|
190
|
+
// Upper bound on what defaultFetchBytes will buffer. Far above any logo this
// module accepts, but small enough that a hostile server cannot exhaust memory.
const MAX_DOWNLOAD_BYTES = 1_000_000;
/**
 * Default `FetchBytes`: SSRF-guarded GET that resolves to the body bytes plus
 * the lower-cased media type (Content-Type without parameters).
 *
 * Resolves to null on a non-2xx status, too many redirects, any thrown error,
 * or a body larger than MAX_DOWNLOAD_BYTES. The size limit is enforced while
 * streaming (and up front when Content-Length is declared), so an oversized
 * response is abandoned instead of being buffered in full.
 */
const defaultFetchBytes = async (url) => {
    try {
        const res = await guardedFetch(url);
        if (!res || !res.ok)
            return null;
        const contentType = (res.headers.get("content-type") || "").split(";")[0].trim().toLowerCase();
        // A missing header gives Number(null) === 0, which passes this check.
        const declared = Number(res.headers.get("content-length"));
        if (Number.isFinite(declared) && declared > MAX_DOWNLOAD_BYTES) {
            try {
                await res.body?.cancel();
            }
            catch {
                /* ignore */
            }
            return null;
        }
        if (!res.body)
            return { contentType, bytes: new Uint8Array(await res.arrayBuffer()) };
        const reader = res.body.getReader();
        const chunks = [];
        let total = 0;
        for (;;) {
            const { done, value } = await reader.read();
            if (done)
                break;
            total += value.byteLength;
            if (total > MAX_DOWNLOAD_BYTES) {
                await reader.cancel();
                return null;
            }
            chunks.push(value);
        }
        const bytes = new Uint8Array(total);
        let offset = 0;
        for (const chunk of chunks) {
            bytes.set(chunk, offset);
            offset += chunk.byteLength;
        }
        return { contentType, bytes };
    }
    catch {
        return null;
    }
};
|
|
203
|
+
/**
 * Walk a redirect chain by hand, running the SSRF guard on every hop, and report
 * where it ends: `{ finalUrl, status }`. Each response body is cancelled rather
 * than read. When more than MAX_REDIRECTS redirects are followed the result
 * carries `status: 0` and the last URL reached. Used to detect identity drift.
 */
export const resolveFinalUrl = async (rawUrl) => {
    let finalUrl = rawUrl;
    let requestsLeft = MAX_REDIRECTS + 1;
    while (requestsLeft-- > 0) {
        await assertPublicUrl(finalUrl);
        const response = await fetch(finalUrl, {
            redirect: "manual",
            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
            headers: { "User-Agent": USER_AGENT },
        });
        // Only the status and headers matter here; drop the body straight away.
        try {
            await response.body?.cancel();
        }
        catch {
            /* ignore */
        }
        const { status } = response;
        const next = response.headers.get("location");
        const followable = status >= 300 && status < 400 && Boolean(next);
        if (!followable)
            return { finalUrl, status };
        finalUrl = new URL(next, finalUrl).toString();
    }
    return { finalUrl, status: 0 };
};
|
|
231
|
+
// ── Fetching helpers ─────────────────────────────────────────────────────────
|
|
232
|
+
/**
 * Detect vendor identity drift: does this URL (or its www/apex sibling) redirect
 * to a DIFFERENT registrable domain? Returns the drifted-to host, else null.
 * Catches acquired/defunct products (e.g. www.spiff.com → salesforce.com) even
 * when the apex itself errors. Only a 2xx/3xx landing on another domain counts —
 * a status code or a throw is NOT drift (real sites block bare requests).
 *
 * The sibling is derived by a textual host swap so the string handed to
 * `resolve` stays as close to the input as possible. When that swap cannot
 * apply because the URL text differs from its parsed hostname (upper-case host,
 * embedded credentials), the sibling is rebuilt from the parsed URL instead of
 * being silently skipped.
 *
 * @param url Vendor URL to probe.
 * @param srcHost Host the vendor is expected to live on; falsy disables the check.
 * @param resolve Redirect resolver; defaults to the SSRF-guarded `resolveFinalUrl`.
 * @returns The host (without leading "www.") the vendor drifted to, or null.
 */
export async function detectDrift(url, srcHost, resolve = resolveFinalUrl) {
    if (!srcHost)
        return null;
    const tries = [url];
    try {
        const u = new URL(url);
        const hasWww = u.hostname.startsWith("www.");
        let sibling = hasWww
            ? url.replace("://www.", "://")
            : url.replace(`://${u.hostname}`, `://www.${u.hostname}`);
        if (sibling === url) {
            // The textual swap found nothing to replace; go through the parsed URL.
            const alt = new URL(url);
            alt.hostname = hasWww ? u.hostname.slice(4) : `www.${u.hostname}`;
            // The hostname setter ignores invalid values; only use it if it took effect.
            if (alt.hostname !== u.hostname)
                sibling = alt.toString();
        }
        if (sibling !== url)
            tries.push(sibling);
    }
    catch {
        /* unparseable URL: probe it as given and let the resolver decide */
    }
    const srcDomain = registrableDomain(srcHost);
    for (const t of tries) {
        try {
            const { finalUrl, status } = await resolve(t);
            if (status < 200 || status >= 400)
                continue;
            const finalHost = hostOf(finalUrl);
            if (finalHost && registrableDomain(finalHost) !== srcDomain)
                return finalHost;
        }
        catch {
            /* a throw isn't a drift signal */
        }
    }
    return null;
}
|
|
269
|
+
/**
 * Fallback for JS-nav conglomerates whose product links aren't in the rendered
 * homepage: find the category page from the vendor's sitemap. Bounded (≤6 sitemap
 * documents read, ≤12 fetch attempts including failures, ≤20k URLs scanned),
 * same-registrable-domain, plain XML only; media sitemaps skipped, non-English
 * locales de-prioritized, /products/ preferred; requires the path to hit the
 * keywords so locale/blog false-positives are rejected.
 *
 * Sitemap sources are `/sitemap.xml`, `/sitemap_index.xml` and every `Sitemap:`
 * line in robots.txt. Failed fetches count toward the attempt limit, so a
 * robots.txt listing many dead sitemaps cannot trigger an unbounded number of
 * requests.
 *
 * @param rootUrl Any absolute URL on the vendor's site; only its origin is used.
 * @param keywords Lower-case category keywords (see `categoryKeywords`).
 * @param fetchText Text fetcher that resolves to null on failure.
 * @returns The best-scoring page URL, or null when nothing clears the threshold.
 */
export async function findCategoryPageInSitemap(rootUrl, keywords, fetchText = defaultFetchText) {
    const MAX_SITEMAPS = 6; // sitemap documents actually read
    const MAX_ATTEMPTS = 12; // fetch attempts, successful or not
    const MAX_URLS = 20_000; // page URLs scored
    let root;
    try {
        root = new URL(rootUrl);
    }
    catch {
        return null;
    }
    if (!keywords.length)
        return null;
    const rootDom = registrableDomain(root.hostname);
    const sameDomain = (u) => registrableDomain(hostOf(u) || "") === rootDom;
    // Path + query of a URL, so filters don't trip over words inside the hostname.
    const pathOf = (u) => {
        try {
            const parsed = new URL(u);
            return parsed.pathname + parsed.search;
        }
        catch {
            return u;
        }
    };
    // One point per keyword found in the path; tie-break nudges apply only to hits.
    const pathScore = (u) => {
        let path;
        try {
            path = new URL(u).pathname.toLowerCase();
        }
        catch {
            return 0;
        }
        let s = 0;
        for (const kw of keywords)
            if (path.includes(kw))
                s += 1;
        if (s > 0) {
            if (/\/(product|solution)s?\//.test(path))
                s += 0.5; // prefer product pages
            const loc = path.match(/^\/([a-z]{2})(?:-[a-z]{2})?\//); // de-prioritize non-English locales
            if (loc && !["en", "us"].includes(loc[1]))
                s -= 0.6;
        }
        return s;
    };
    const candidates = new Set([`${root.origin}/sitemap.xml`, `${root.origin}/sitemap_index.xml`]);
    const robots = await fetchText(`${root.origin}/robots.txt`);
    if (robots) {
        for (const mm of robots.slice(0, 100_000).matchAll(/^\s*sitemap:\s*(\S+)/gim)) {
            try {
                candidates.add(new URL(mm[1].trim(), root.origin).toString());
            }
            catch {
                /* skip */
            }
        }
    }
    const queue = [...candidates];
    const seen = new Set();
    let best = null;
    let bestScore = 0;
    let fetched = 0;
    let attempts = 0;
    let scanned = 0;
    while (queue.length && fetched < MAX_SITEMAPS && attempts < MAX_ATTEMPTS && scanned < MAX_URLS) {
        const sm = queue.shift();
        if (seen.has(sm) || !sameDomain(sm))
            continue;
        seen.add(sm);
        if (sm.endsWith(".gz"))
            continue; // skip compressed sitemaps
        attempts++;
        const xml = await fetchText(sm);
        if (!xml)
            continue;
        fetched++;
        const body = xml.slice(0, 5_000_000);
        // <loc> values are XML-escaped; restore literal ampersands in query strings.
        const locs = [...body.matchAll(/<loc>\s*([^<\s]+?)\s*<\/loc>/gi)].map((mm) => mm[1].replace(/&(?:amp|#0*38|#x0*26);/gi, "&"));
        if (/<sitemapindex/i.test(body)) {
            // An index: queue the five most promising child sitemaps.
            const ranked = locs
                .filter(sameDomain)
                .filter((l) => !/(pdf|video|image|img|news|siteimprove)/i.test(pathOf(l)))
                .map((l) => [pathScore(l) + (/product|solution/i.test(l) ? 1 : 0), l])
                .sort((a, b) => b[0] - a[0]);
            for (const [, l] of ranked.slice(0, 5))
                queue.push(l);
        }
        else {
            for (const l of locs) {
                if (scanned >= MAX_URLS)
                    break; // enforce the URL budget inside a single large sitemap too
                scanned++;
                if (!sameDomain(l))
                    continue;
                const s = pathScore(l);
                if (s > bestScore) {
                    bestScore = s;
                    best = l;
                }
            }
        }
    }
    return bestScore >= Math.min(2, keywords.length) ? best : null;
}
|
|
364
|
+
/**
 * Locate the page on a vendor's site that is actually about `category`.
 * First tries the links in the already-fetched homepage HTML; if none is a clear
 * match, consults the site's sitemap via `fetchText`. Resolves to the page URL,
 * or to null (keep the homepage) when the category yields no usable keywords or
 * neither lookup finds a match.
 */
export async function findCategoryPage(homepageHtml, homepageUrl, category, fetchText = defaultFetchText) {
    const keywords = categoryKeywords(category);
    if (keywords.length === 0)
        return null;
    const fromNav = pickCategoryPage(homepageHtml, homepageUrl, keywords);
    return fromNav || findCategoryPageInSitemap(homepageUrl, keywords, fetchText);
}
|
|
377
|
+
/**
 * Produce a vendor logo as a self-contained `data:` URI — the form
 * `MarketVendor.logo` renders and the report serves under a strict
 * `img-src data:` CSP.
 *
 * Candidates are tried in order: the logo declared by the homepage HTML (when
 * HTML is supplied and declares one), then a favicon service keyed on the host.
 * A candidate is accepted only if it downloads, reports an `image/*` content
 * type, and is between 1 and 50,000 bytes. Resolves to null when the homepage
 * URL has no parseable host or no candidate qualifies.
 */
export async function fetchLogoDataUri(homepageUrl, html, fetchBytes = defaultFetchBytes) {
    const host = hostOf(homepageUrl);
    if (!host)
        return null;
    const declared = html ? extractLogoUrl(html, homepageUrl) : null;
    const faviconService = `https://www.google.com/s2/favicons?domain=${host}&sz=64`;
    const candidates = declared ? [declared, faviconService] : [faviconService];
    for (const candidate of candidates) {
        const got = await fetchBytes(candidate);
        const usable = Boolean(got) &&
            got.contentType.startsWith("image/") &&
            got.bytes.length > 0 &&
            got.bytes.length <= 50_000;
        if (!usable)
            continue;
        return `data:${got.contentType};base64,${Buffer.from(got.bytes).toString("base64")}`;
    }
    return null;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "fullstackgtm",
|
|
3
|
-
"version": "0.32.0",
|
|
3
|
+
"version": "0.33.0",
|
|
4
4
|
"description": "Open-source agentic GTM ops framework: canonical GTM data model, pluggable deterministic audits, reviewable dry-run patch plans, approval-gated write-back with conflict detection, and cross-system entity resolution. HubSpot, Salesforce, and Stripe connectors included.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "Full Stack GTM LLC <ryan@fullstackgtm.com> (https://fullstackgtm.com)",
|
package/src/index.ts
CHANGED
|
@@ -291,6 +291,20 @@ export {
|
|
|
291
291
|
type VendorScale,
|
|
292
292
|
} from "./marketScale.ts";
|
|
293
293
|
export { marketMapToHtml, marketMapToMarkdown } from "./marketReport.ts";
|
|
294
|
+
export {
|
|
295
|
+
registrableDomain,
|
|
296
|
+
categoryKeywords,
|
|
297
|
+
pickCategoryPage,
|
|
298
|
+
extractLogoUrl,
|
|
299
|
+
resolveFinalUrl,
|
|
300
|
+
detectDrift,
|
|
301
|
+
findCategoryPageInSitemap,
|
|
302
|
+
findCategoryPage,
|
|
303
|
+
fetchLogoDataUri,
|
|
304
|
+
type FetchText,
|
|
305
|
+
type FetchBytes,
|
|
306
|
+
type ResolveUrl,
|
|
307
|
+
} from "./marketSourcing.ts";
|
|
294
308
|
export {
|
|
295
309
|
computeMissedFirings,
|
|
296
310
|
createFileScheduleRunStore,
|
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Market sourcing — find the *right* page to capture for each vendor, detect
|
|
3
|
+
* acquired/redirected vendors, and extract brand logos. These raise the quality
|
|
4
|
+
* of a cold-start map and are useful to every consumer (CLI, MCP, or a hosted
|
|
5
|
+
* service), with zero coupling to any transport.
|
|
6
|
+
*
|
|
7
|
+
* Design:
|
|
8
|
+
* - **Pure functions** (`pickCategoryPage`, `extractLogoUrl`, `categoryKeywords`,
|
|
9
|
+
* `registrableDomain`) operate on already-fetched HTML / strings. The caller
|
|
10
|
+
* fetches the page with whatever it likes — the hosted service injects a
|
|
11
|
+
* browser-rendering fetch for JS-walled homepages; the CLI uses the default.
|
|
12
|
+
* - **Fetching helpers** (`resolveFinalUrl`, `detectDrift`,
|
|
13
|
+
* `findCategoryPageInSitemap`, `fetchLogoDataUri`) default to the package's
|
|
14
|
+
* SSRF-guarded `assertPublicUrl` + global `fetch`, but each accepts an
|
|
15
|
+
* injectable fetcher so they stay testable offline and transport-agnostic.
|
|
16
|
+
* Sitemaps / redirects / logo bytes are plain resources — they never need a
|
|
17
|
+
* headless browser, so the default fetch is sufficient even in the hosted layer.
|
|
18
|
+
*/
|
|
19
|
+
import { assertPublicUrl } from "./market.ts";
|
|
20
|
+
|
|
21
|
+
const USER_AGENT = "fullstackgtm-market/0 (+https://github.com/fullstackgtm/core)";
|
|
22
|
+
const FETCH_TIMEOUT_MS = 15_000;
|
|
23
|
+
const MAX_REDIRECTS = 5;
|
|
24
|
+
|
|
25
|
+
/** Fetch a text resource (sitemap, robots.txt); null on any failure. */
|
|
26
|
+
export type FetchText = (url: string) => Promise<string | null>;
|
|
27
|
+
/** Fetch raw bytes (a logo image); null on any failure. */
|
|
28
|
+
export type FetchBytes = (url: string) => Promise<{ contentType: string; bytes: Uint8Array } | null>;
|
|
29
|
+
/** Resolve a URL's final destination after redirects. */
|
|
30
|
+
export type ResolveUrl = (url: string) => Promise<{ finalUrl: string; status: number }>;
|
|
31
|
+
|
|
32
|
+
/**
 * Hostname of `url` with one leading "www." removed.
 * Returns null when `url` cannot be parsed as an absolute URL.
 */
function hostOf(url: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  const { hostname } = parsed;
  return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
}
|
|
39
|
+
|
|
40
|
+
// ── Pure helpers ────────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
// A few common multi-label public suffixes so "bbc.co.uk" → "bbc.co.uk", not "co.uk".
const MULTI_LABEL_TLDS = new Set([
  "co.uk", "com.au", "co.nz", "co.jp", "com.br", "co.in", "com.sg", "co.za", "com.mx", "com.cn",
]);

// Dotted-quad IPv4 literal. An IP address has no registrable domain, so it is compared whole.
const IPV4_LITERAL = /^\d{1,3}(?:\.\d{1,3}){3}$/;

/**
 * Best-effort registrable domain (eTLD+1) for comparing vendor identity across a
 * redirect. Heuristic (no full public-suffix list): last two labels, or three for
 * a known multi-label suffix. Good enough to catch "spiff.com → salesforce.com".
 *
 * The input is lower-cased and one trailing dot is dropped. IPv4 literals are
 * returned unchanged: truncating "10.0.0.1" to "0.1" would make unrelated
 * addresses look like the same vendor.
 *
 * @param host Hostname (no scheme, no port). Falsy input is treated as "".
 * @returns The registrable domain, or the normalized host when it has at most
 *   two labels or is an IPv4 literal.
 */
export function registrableDomain(host: string): string {
  const h = String(host || "").toLowerCase().replace(/\.$/, "");
  if (IPV4_LITERAL.test(h)) return h;
  const labels = h.split(".");
  if (labels.length <= 2) return h;
  const lastTwo = labels.slice(-2).join(".");
  return MULTI_LABEL_TLDS.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
|
|
59
|
+
|
|
60
|
+
const CATEGORY_STOPWORDS = new Set([
  "software", "platform", "platforms", "tool", "tools", "system", "systems", "management",
  "solution", "solutions", "app", "apps", "service", "services", "suite", "the", "for", "and", "of",
]);

/**
 * Reduce a category label to the words worth matching against links and paths.
 * The label is lower-cased and split on anything that is not a-z or 0-9; words
 * shorter than three characters and generic filler (the stopword list above) are
 * dropped. Each remaining word appears once, in first-seen order.
 */
export function categoryKeywords(category: string): string[] {
  const picked = new Set<string>();
  for (const word of String(category || "").toLowerCase().split(/[^a-z0-9]+/)) {
    if (word.length < 3 || CATEGORY_STOPWORDS.has(word)) continue;
    picked.add(word);
  }
  return Array.from(picked);
}
|
|
76
|
+
|
|
77
|
+
/**
 * Conglomerate page-selection: when a vendor's homepage isn't about the category,
 * follow its own nav to the category page. Scans same-registrable-domain internal
 * links and returns the one whose anchor text / path best matches the category
 * keywords (anchor text weighted 2×, path 1×; requires ≥2 to avoid false hits).
 * Returns null if no link is a clear match. Pure — operates on the given HTML.
 *
 * @param html Homepage markup to scan for `<a href>` links.
 * @param baseUrl URL the markup was fetched from; relative links resolve against
 *   it and its registrable domain defines what counts as "internal".
 * @param keywords Lower-case keywords matched as substrings of the anchor text
 *   and the URL path.
 * @returns The absolute URL of the best-scoring link (the first one wins a tie),
 *   or null when `html`/`keywords` is empty, `baseUrl` is unparseable, or no link
 *   scores at least 2.
 */
export function pickCategoryPage(html: string, baseUrl: string, keywords: string[]): string | null {
  if (!html || !keywords.length) return null;
  let baseHost: string;
  try {
    baseHost = registrableDomain(new URL(baseUrl).hostname);
  } catch {
    return null;
  }
  const re = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  let m: RegExpExecArray | null;
  while ((m = re.exec(html))) {
    let u: URL;
    try {
      // Attribute values are HTML-escaped: decode "&amp;" so a query string such
      // as "?a=1&amp;b=2" does not leak a literal "amp;" into the returned URL.
      u = new URL(m[1].replace(/&amp;/g, "&"), baseUrl);
    } catch {
      continue;
    }
    if (u.protocol !== "http:" && u.protocol !== "https:") continue;
    if (registrableDomain(u.hostname) !== baseHost) continue; // internal links only
    // De-duplicate on origin + path so query/fragment variants are scored once.
    const key = u.origin + u.pathname;
    if (seen.has(key)) continue;
    seen.add(key);
    // Anchor text with nested tags removed and whitespace collapsed.
    const text = m[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
    const path = u.pathname.toLowerCase();
    let score = 0;
    for (const kw of keywords) {
      if (text.includes(kw)) score += 2;
      if (path.includes(kw)) score += 1;
    }
    // Strictly greater: on a tie the earlier link is kept.
    if (score > bestScore) {
      bestScore = score;
      best = u.toString();
    }
  }
  return bestScore >= 2 ? best : null;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Pull a canonical logo URL out of a homepage: apple-touch-icon → og:image →
 * rel=icon, resolved to an absolute URL. Pure — operates on the given HTML.
 *
 * @param html Homepage markup to search.
 * @param baseUrl URL the markup came from; relative references resolve against it.
 * @returns The first candidate (in the priority order above) whose `href` or
 *   `content` value resolves to a URL, or null when there is none.
 */
export function extractLogoUrl(html: string, baseUrl: string): string | null {
  if (!html) return null;
  const abs = (href: string): string | null => {
    try {
      // Attribute values are HTML-escaped; decode "&amp;" so query strings
      // (common on og:image CDN URLs) survive intact.
      return new URL(href.replace(/&amp;/g, "&"), baseUrl).toString();
    } catch {
      return null;
    }
  };
  // <link> tags carry the URL in href, <meta> tags in content.
  const hrefOf = (tag: string): string | null => {
    const mm = tag.match(/href=["']([^"']+)["']/i) || tag.match(/content=["']([^"']+)["']/i);
    return mm ? abs(mm[1]) : null;
  };
  // Tag patterns in priority order; the first one that yields a URL wins.
  const tagPatterns = [
    /<link[^>]+rel=["'][^"']*apple-touch-icon[^"']*["'][^>]*>/i,
    /<meta[^>]+property=["']og:image["'][^>]*>/i,
    /<link[^>]+rel=["'](?:shortcut icon|icon)["'][^>]*>/i,
  ];
  for (const pattern of tagPatterns) {
    const tag = html.match(pattern);
    if (!tag) continue;
    const u = hrefOf(tag[0]);
    if (u) return u;
  }
  return null;
}
|
|
158
|
+
|
|
159
|
+
// ── Default SSRF-guarded fetchers ────────────────────────────────────────────
|
|
160
|
+
|
|
161
|
+
/**
 * Fetch `url` while following redirects by hand, so that every hop is checked by
 * `assertPublicUrl` before it is requested. Resolves with the first response that
 * is not a followable redirect, or with `null` once `MAX_REDIRECTS` redirects
 * have been followed without reaching one.
 */
async function guardedFetch(url: string): Promise<Response | null> {
  let target = url;
  for (let attempt = 0; attempt <= MAX_REDIRECTS; attempt += 1) {
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });
    const redirectTo = response.headers.get("location");
    // Anything other than a 3xx carrying a Location header is the final answer.
    if (!(response.status >= 300 && response.status < 400 && redirectTo)) {
      return response;
    }
    // The redirect's own body is of no use; discard it before moving on.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }
    target = new URL(redirectTo, target).toString();
  }
  return null;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Default text fetcher built on `guardedFetch`: resolves with the response body
 * as text for an OK (2xx) response and with `null` otherwise — no response, a
 * non-OK status, or any thrown error.
 */
const defaultFetchText: FetchText = async (url) => {
  try {
    const res = await guardedFetch(url);
    if (!res) return null;
    if (!res.ok) {
      // Release the connection instead of leaving the unread body to the GC.
      await res.body?.cancel();
      return null;
    }
    return await res.text();
  } catch {
    return null;
  }
};
|
|
194
|
+
|
|
195
|
+
/**
 * Default byte fetcher built on `guardedFetch`: for an OK (2xx) response resolves
 * with the body bytes and the media type taken from `Content-Type` (parameters
 * stripped, lower-cased; "" when the header is absent). Resolves with `null` for
 * no response, a non-OK status, or any thrown error.
 */
const defaultFetchBytes: FetchBytes = async (url) => {
  try {
    const res = await guardedFetch(url);
    if (!res) return null;
    if (!res.ok) {
      // Release the connection instead of leaving the unread body to the GC.
      await res.body?.cancel();
      return null;
    }
    const contentType = (res.headers.get("content-type") || "").split(";")[0].trim().toLowerCase();
    const bytes = new Uint8Array(await res.arrayBuffer());
    return { contentType, bytes };
  } catch {
    return null;
  }
};
|
|
206
|
+
|
|
207
|
+
/**
 * Walk a URL's redirect chain by hand — validating each hop with
 * `assertPublicUrl` — and report the URL it ends on plus that response's status.
 * Every response body is cancelled as soon as the headers arrive, so nothing is
 * downloaded. Used to detect identity drift.
 *
 * Errors from the guard or from `fetch` propagate to the caller. If the chain is
 * still redirecting after `MAX_REDIRECTS` hops, the last target is returned with
 * status 0.
 */
export const resolveFinalUrl: ResolveUrl = async (rawUrl) => {
  let target = rawUrl;
  for (let attempt = 0; attempt <= MAX_REDIRECTS; attempt += 1) {
    await assertPublicUrl(target);
    const response = await fetch(target, {
      redirect: "manual",
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      headers: { "User-Agent": USER_AGENT },
    });
    // Only the status line and headers matter here.
    try {
      await response.body?.cancel();
    } catch {
      /* ignore */
    }
    const redirectTo = response.headers.get("location");
    if (!(response.status >= 300 && response.status < 400 && redirectTo)) {
      return { finalUrl: target, status: response.status };
    }
    target = new URL(redirectTo, target).toString();
  }
  return { finalUrl: target, status: 0 };
};
|
|
234
|
+
|
|
235
|
+
// ── Fetching helpers ─────────────────────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
/**
 * Vendor identity drift check: does `url`, or its www/apex sibling, end up on a
 * registrable domain other than `srcHost`'s? Resolves with the host it drifted
 * to, or null when there is no drift. This catches acquired/defunct products
 * (e.g. www.spiff.com → salesforce.com) even when the apex itself errors.
 *
 * Only a landing with status 200–399 is considered. An error status or a thrown
 * resolver error is never treated as drift, because real sites often block bare
 * requests.
 *
 * @param url URL to probe.
 * @param srcHost Host the vendor is expected to live on; empty yields null.
 * @param resolve Redirect resolver (defaults to `resolveFinalUrl`).
 */
export async function detectDrift(
  url: string,
  srcHost: string,
  resolve: ResolveUrl = resolveFinalUrl,
): Promise<string | null> {
  if (!srcHost) return null;

  // Probe the URL as given, then its www <-> apex counterpart.
  const probes: string[] = [url];
  try {
    const { hostname } = new URL(url);
    let sibling: string;
    if (hostname.startsWith("www.")) {
      sibling = url.replace("://www.", "://");
    } else {
      sibling = url.replace(`://${hostname}`, `://www.${hostname}`);
    }
    if (sibling !== url) probes.push(sibling);
  } catch {
    /* ignore */
  }

  const expectedDomain = registrableDomain(srcHost);
  for (const probe of probes) {
    try {
      const landing = await resolve(probe);
      if (landing.status < 200 || landing.status >= 400) continue;
      const landedHost = hostOf(landing.finalUrl);
      if (landedHost && registrableDomain(landedHost) !== expectedDomain) return landedHost;
    } catch {
      /* a throw isn't a drift signal */
    }
  }
  return null;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Fallback for JS-nav conglomerates whose product links aren't in the rendered
 * homepage: find the category page from the vendor's sitemap.
 *
 * Sitemap candidates are `/sitemap.xml`, `/sitemap_index.xml` and any `Sitemap:`
 * lines in `/robots.txt`. The walk stops once 6 sitemaps have been fetched
 * successfully or 20k URLs have been scanned (both checked between sitemaps).
 * Only same-registrable-domain, non-`.gz` sitemaps are read. From a sitemap
 * index, media-looking children are skipped and at most the 5 best-ranked
 * children are queued. Page URLs are scored on keyword hits in the path, with
 * /product(s)/ and /solution(s)/ preferred and non-English locale prefixes
 * de-prioritized.
 *
 * @param rootUrl Any URL on the vendor's site; its origin locates the sitemaps.
 * @param keywords Lower-case keywords matched as substrings of each URL path.
 * @param fetchText Text fetcher (defaults to the SSRF-guarded one).
 * @returns The best-scoring page URL, or null when `rootUrl` is unparseable,
 *   `keywords` is empty, or the best score is below `min(2, keywords.length)`.
 */
export async function findCategoryPageInSitemap(
  rootUrl: string,
  keywords: string[],
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  let root: URL;
  try {
    root = new URL(rootUrl);
  } catch {
    return null;
  }
  if (!keywords.length) return null;
  const rootDom = registrableDomain(root.hostname);
  const sameDomain = (u: string) => registrableDomain(hostOf(u) || "") === rootDom;
  // One point per keyword found in the path; bonuses/penalties apply only on a hit.
  const pathScore = (u: string): number => {
    let path: string;
    try {
      path = new URL(u).pathname.toLowerCase();
    } catch {
      return 0;
    }
    let s = 0;
    for (const kw of keywords) if (path.includes(kw)) s += 1;
    if (s > 0) {
      if (/\/(product|solution)s?\//.test(path)) s += 0.5; // prefer product pages
      const loc = path.match(/^\/([a-z]{2})(?:-[a-z]{2})?\//); // de-prioritize non-English locales
      if (loc && !["en", "us"].includes(loc[1])) s -= 0.6;
    }
    return s;
  };

  const candidates = new Set([`${root.origin}/sitemap.xml`, `${root.origin}/sitemap_index.xml`]);
  const robots = await fetchText(`${root.origin}/robots.txt`);
  if (robots) {
    for (const mm of robots.slice(0, 100_000).matchAll(/^\s*sitemap:\s*(\S+)/gim)) {
      try {
        candidates.add(new URL(mm[1].trim(), root.origin).toString());
      } catch {
        /* skip */
      }
    }
  }

  const queue = [...candidates];
  const seen = new Set<string>();
  let best: string | null = null;
  let bestScore = 0;
  let fetched = 0;
  let scanned = 0;
  while (queue.length && fetched < 6 && scanned < 20_000) {
    const sm = queue.shift() as string;
    if (seen.has(sm) || !sameDomain(sm)) continue;
    seen.add(sm);
    if (sm.endsWith(".gz")) continue; // skip compressed sitemaps
    const xml = await fetchText(sm);
    if (!xml) continue;
    fetched++;
    const body = xml.slice(0, 5_000_000);
    // Sitemap URLs are XML-escaped, so "&amp;" must be decoded back to "&".
    const locs = [...body.matchAll(/<loc>\s*([^<\s]+?)\s*<\/loc>/gi)].map((mm) => mm[1].replace(/&amp;/g, "&"));
    if (/<sitemapindex/i.test(body)) {
      // Index file: queue the most promising child sitemaps only.
      const ranked = locs
        .filter(sameDomain)
        .filter((l) => !/(pdf|video|image|img|news|siteimprove)/i.test(l))
        .map((l): [number, string] => [pathScore(l) + (/product|solution/i.test(l) ? 1 : 0), l])
        .sort((a, b) => b[0] - a[0]);
      for (const [, l] of ranked.slice(0, 5)) queue.push(l);
    } else {
      // URL set: keep the best-scoring page (the first one wins a tie).
      for (const l of locs) {
        scanned++;
        if (!sameDomain(l)) continue;
        const s = pathScore(l);
        if (s > bestScore) {
          bestScore = s;
          best = l;
        }
      }
    }
  }
  return bestScore >= Math.min(2, keywords.length) ? best : null;
}
|
|
360
|
+
|
|
361
|
+
/**
 * Locate a vendor's category-specific page. The already-fetched homepage markup
 * is scanned for a matching nav link first; only when that finds nothing is the
 * vendor's sitemap consulted. Resolves with the page URL, or with null (meaning
 * "keep the homepage") when the category has no significant keywords or neither
 * source yields a match.
 */
export async function findCategoryPage(
  homepageHtml: string,
  homepageUrl: string,
  category: string,
  fetchText: FetchText = defaultFetchText,
): Promise<string | null> {
  const keywords = categoryKeywords(category);
  if (keywords.length === 0) return null;
  const fromNav = pickCategoryPage(homepageHtml, homepageUrl, keywords);
  return fromNav || findCategoryPageInSitemap(homepageUrl, keywords, fetchText);
}
|
|
377
|
+
|
|
378
|
+
/**
 * Produce a vendor logo as a self-contained `data:` URI — the form
 * `MarketVendor.logo` renders and the report serves under a strict
 * `img-src data:` CSP. The logo declared by the page (when homepage HTML is
 * supplied) is tried first, then a favicon service keyed on the host. A
 * candidate is accepted only if its content type starts with `image/` and it is
 * between 1 and 50,000 bytes.
 *
 * @param homepageUrl Vendor homepage; an unparseable URL yields null.
 * @param html Optional homepage markup to read a declared logo from.
 * @param fetchBytes Byte fetcher (defaults to the SSRF-guarded one).
 * @returns A base64 `data:` URI, or null when no candidate qualifies.
 */
export async function fetchLogoDataUri(
  homepageUrl: string,
  html?: string,
  fetchBytes: FetchBytes = defaultFetchBytes,
): Promise<string | null> {
  const host = hostOf(homepageUrl);
  if (!host) return null;

  const declared = html ? extractLogoUrl(html, homepageUrl) : null;
  const sources: string[] = declared ? [declared] : [];
  sources.push(`https://www.google.com/s2/favicons?domain=${host}&sz=64`);

  for (const source of sources) {
    const image = await fetchBytes(source);
    if (!image) continue;
    const { contentType, bytes } = image;
    if (!contentType.startsWith("image/")) continue;
    if (bytes.length === 0 || bytes.length > 50_000) continue;
    const encoded = Buffer.from(bytes).toString("base64");
    return `data:${contentType};base64,${encoded}`;
  }
  return null;
}
|