company-dossier 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +117 -0
- package/dist/cli.js +1527 -0
- package/dist/index.d.ts +209 -0
- package/dist/index.js +1379 -0
- package/dist/mcp.js +1394 -0
- package/package.json +65 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,1527 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/collectors/dns.ts
|
|
4
|
+
import { promises as dns } from "dns";
|
|
5
|
+
|
|
6
|
+
// src/utils.ts
|
|
7
|
+
import * as fs from "fs";
|
|
8
|
+
import * as path from "path";
|
|
9
|
+
// Value of the "User-Agent" header sent by the fetch-based helpers in this file.
var USER_AGENT = "company-dossier/0.1 (+https://companydossier.lol)";
|
|
10
|
+
/**
 * Make sure a directory exists, creating missing parent directories as needed.
 *
 * @param {string} dirPath - Directory to create when it is not present yet.
 * @returns {void}
 */
function mkdirp(dirPath) {
  const alreadyPresent = fs.existsSync(dirPath);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dirPath, { recursive: true });
}
|
|
15
|
+
/**
 * Write text to a file as UTF-8, creating the parent directory first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} content - Text to write.
 * @returns {void}
 */
function writeFileSafe(filePath, content) {
  const parentDir = path.dirname(filePath);
  mkdirp(parentDir);
  fs.writeFileSync(filePath, content, "utf8");
}
|
|
19
|
+
/**
 * Return the current date as the part of the ISO-8601 timestamp before "T".
 *
 * @returns {string} Date portion of `new Date().toISOString()`.
 */
function todayISO() {
  const isoStamp = new Date().toISOString();
  const [datePart] = isoStamp.split("T");
  return datePart;
}
|
|
22
|
+
/**
 * Turn an underscore-separated identifier into a title: underscores become
 * spaces and the first character of every word is upper-cased.
 *
 * @param {string} str - Text to convert.
 * @returns {string} Converted text.
 */
function titleCase(str) {
  const spaced = str.replace(/_/g, " ");
  return spaced.replace(/\b\w/g, (firstChar) => firstChar.toUpperCase());
}
|
|
25
|
+
/**
 * Fetch a URL and return the response body as text.
 *
 * The request is aborted once `timeoutMs` elapses, and the timeout timer is
 * always cleared afterwards.
 *
 * @param {string} url - URL to request.
 * @param {number} [timeoutMs=10000] - Milliseconds before the request is aborted.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `HTTP <status>` when the response is not OK; fetch/abort errors propagate.
 */
async function fetchText(url, timeoutMs = 1e4) {
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      headers: { "User-Agent": USER_AGENT },
      signal: abort.signal
    });
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status}`);
  } finally {
    clearTimeout(timeoutHandle);
  }
}
|
|
41
|
+
/**
 * Fetch a URL through `fetchText` and parse the body as JSON.
 *
 * @param {string} url - URL to request.
 * @param {number} [timeoutMs=15000] - Timeout forwarded to `fetchText`.
 * @returns {Promise<any>} Parsed JSON value.
 * @throws {Error} Errors from `fetchText`, or a SyntaxError when the body is not valid JSON.
 */
async function fetchJSON(url, timeoutMs = 15e3) {
  const body = await fetchText(url, timeoutMs);
  const parsed = JSON.parse(body);
  return parsed;
}
|
|
45
|
+
/**
 * Return a promise that resolves after a delay.
 *
 * @param {number} ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
48
|
+
/**
 * Reduce a URL or host-like string to a bare lower-case host name.
 *
 * Steps, in order: trim, drop an http/https scheme, drop everything from the
 * first "/", "?" or "#", drop a trailing ":port", drop a leading "www.",
 * lower-case.
 *
 * Query strings, fragments and ports are removed even when no path is present
 * (e.g. "example.com?x=1", "example.com:8080"), because the result is used as
 * a DNS name and inside CDX query URLs.
 *
 * @param {string} input - URL or domain as typed by the user.
 * @returns {string} Normalised host name.
 */
function toDomain(input) {
  return input
    .trim()
    .replace(/^https?:\/\//i, "")
    .replace(/[/?#].*$/, "")
    .replace(/:\d+$/, "")
    .replace(/^www\./i, "")
    .toLowerCase();
}
|
|
51
|
+
/**
 * Decide whether user input is a URL/domain rather than free text.
 *
 * Input starting with http:// or https:// always counts. Otherwise the trimmed
 * text must be dot-separated labels made only of letters, digits and hyphens.
 *
 * @param {string} input - Raw user input.
 * @returns {boolean} True when the input looks like a URL or domain.
 */
function looksLikeDomain(input) {
  const candidate = input.trim();
  const hasHttpScheme = /^https?:\/\//i.test(candidate);
  if (hasHttpScheme) {
    return true;
  }
  const isDottedHost = /^[a-z0-9-]+(\.[a-z0-9-]+)+$/i.test(candidate);
  const hasSpace = candidate.includes(" ");
  return isDottedHost && !hasSpace;
}
|
|
58
|
+
|
|
59
|
+
// src/collectors/dns.ts
|
|
60
|
+
/**
 * Gather DNS-derived facts about a domain: MX records and a guessed email
 * provider, SPF and DMARC TXT records, domain-verification TXT tokens, and
 * CNAME targets for a fixed list of common sub-domains.
 *
 * Every lookup is best-effort: a failing query leaves the corresponding
 * field at its default and never rejects the returned promise.
 *
 * @param {string} domainInput - Domain or URL; normalised with `toDomain`.
 * @returns {Promise<object>} Result with `domain`, `mxRecords`, `emailProvider`,
 *   `spfRecord`, `dmarcRecord`, `verificationTokens` and `subdomains`.
 */
async function collectDns(domainInput) {
  const domain = toDomain(domainInput);
  const result = {
    domain,
    mxRecords: [],
    emailProvider: "Unknown",
    spfRecord: "",
    dmarcRecord: "",
    verificationTokens: [],
    subdomains: []
  };

  // MX records plus provider guess; the first rule with a matching keyword wins.
  try {
    const mxAnswers = await dns.resolveMx(domain);
    result.mxRecords = mxAnswers.map((answer) => ({
      exchange: answer.exchange,
      priority: answer.priority
    }));
    const exchanges = mxAnswers.map((answer) => answer.exchange).join(" ").toLowerCase();
    const providerRules = [
      [["google", "gmail"], "Google Workspace"],
      [["outlook", "microsoft"], "Microsoft 365"],
      [["zoho"], "Zoho"],
      [["proton"], "ProtonMail"],
      [["amazonses", "amazon"], "Amazon SES"]
    ];
    const matchedRule = providerRules.find(
      ([keywords]) => keywords.some((keyword) => exchanges.includes(keyword))
    );
    if (matchedRule) {
      result.emailProvider = matchedRule[1];
    }
  } catch {
    // MX lookup failed: keep defaults.
  }

  // Apex TXT records: SPF (last one seen wins) and verification tokens.
  try {
    const txtAnswers = await dns.resolveTxt(domain);
    const verificationMarkers = [
      "google-site-verification",
      "MS=",
      "facebook-domain-verification",
      "apple-domain-verification",
      "atlassian-domain-verification"
    ];
    for (const chunks of txtAnswers) {
      // A TXT record arrives as an array of string chunks; join them back together.
      const value = chunks.join("");
      if (value.startsWith("v=spf1")) {
        result.spfRecord = value;
      }
      if (verificationMarkers.some((marker) => value.includes(marker))) {
        result.verificationTokens.push(value);
      }
    }
  } catch {
    // TXT lookup failed: keep defaults.
  }

  // DMARC policy lives on the _dmarc sub-domain.
  try {
    const dmarcAnswers = await dns.resolveTxt("_dmarc." + domain);
    const dmarcValues = dmarcAnswers.map((chunks) => chunks.join(""));
    result.dmarcRecord = dmarcValues.find((value) => value.startsWith("v=DMARC1")) || "";
  } catch {
    // DMARC lookup failed: keep default.
  }

  // CNAME probes, one at a time, for a handful of conventional host names.
  for (const label of ["www", "mail", "autodiscover", "blog", "shop", "app", "api"]) {
    const host = label + "." + domain;
    try {
      const targets = await dns.resolveCname(host);
      if (targets.length > 0) {
        result.subdomains.push(host + " -> " + targets[0]);
      }
    } catch {
      // No CNAME for this host.
    }
  }

  return result;
}
|
|
118
|
+
|
|
119
|
+
// src/collectors/website.ts
|
|
120
|
+
/**
 * Convert an HTML document to plain text.
 *
 * Removes <script>, <style>, <nav> and <footer> elements with their content,
 * replaces every remaining tag with a space, decodes a small set of common
 * character entities, and collapses whitespace.
 *
 * Entities are decoded in a single pass so that already-escaped text such as
 * "&amp;lt;" becomes "&lt;" and is not unescaped twice.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Trimmed plain text with single spaces between words.
 */
function stripHtml(html) {
  const ENTITY_TEXT = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    "#39": "'",
    nbsp: " "
  };
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<nav[\s\S]*?<\/nav>/gi, "")
    .replace(/<footer[\s\S]*?<\/footer>/gi, "")
    .replace(/<[^>]+>/g, " ")
    .replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_match, name) => ENTITY_TEXT[name])
    .replace(/\s+/g, " ")
    .trim();
}
|
|
130
|
+
/**
 * Return the trimmed text of the first <title> element.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Title text, or "" when no title element is found.
 */
function extractTitle(html) {
  const titlePattern = /<title[^>]*>(.*?)<\/title>/is;
  const found = html.match(titlePattern);
  if (!found) {
    return "";
  }
  const [, inner] = found;
  return inner.trim();
}
|
|
134
|
+
/**
 * Return the content of the <meta name="description"> tag.
 *
 * Both attribute orders are handled (name before content and content before
 * name); the name-first form is tried first.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Trimmed description, or "" when absent.
 */
function extractDescription(html) {
  const nameFirst = /<meta[^>]+name=["']description["'][^>]+content=["']([^"']*)["']/i;
  const contentFirst = /<meta[^>]+content=["']([^"']*)["'][^>]+name=["']description["']/i;
  const found = html.match(nameFirst) ?? html.match(contentFirst);
  if (found) {
    return found[1].trim();
  }
  return "";
}
|
|
138
|
+
/**
 * Collect the text of every <h1> and <h2> element in document order.
 *
 * Nested tags inside a heading are removed; headings that are empty after
 * trimming are skipped.
 *
 * @param {string} html - Raw HTML.
 * @returns {string[]} Heading texts.
 */
function extractHeadings(html) {
  const headingPattern = /<h[12][^>]*>(.*?)<\/h[12]>/gis;
  return [...html.matchAll(headingPattern)]
    .map(([, inner]) => inner.replace(/<[^>]+>/g, "").trim())
    .filter((text) => text);
}
|
|
150
|
+
/**
 * Find e-mail-address-like strings in text.
 *
 * @param {string} text - Text or HTML to scan.
 * @returns {string[]} Unique matches in order of first appearance.
 */
function extractEmails(text) {
  const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
  const found = text.match(emailPattern);
  if (found === null) {
    return [];
  }
  return Array.from(new Set(found));
}
|
|
154
|
+
/**
 * Find phone-number-like strings (3-3-4 digit groups with an optional "+1"
 * prefix and optional parentheses around the first group) in text.
 *
 * @param {string} text - Text or HTML to scan.
 * @returns {string[]} Unique matches in order of first appearance.
 */
function extractPhones(text) {
  const phonePattern = /(?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
  const found = text.match(phonePattern);
  if (found === null) {
    return [];
  }
  return Array.from(new Set(found));
}
|
|
158
|
+
/**
 * Collect social-media profile URLs that appear anywhere in an HTML string.
 *
 * Covered hosts: LinkedIn company pages, twitter.com, x.com, Facebook,
 * Instagram, YouTube (@handle, channel/, c/), TikTok (@handle) and GitHub.
 *
 * @param {string} html - Raw HTML.
 * @returns {string[]} Unique URLs, grouped by platform in the order listed
 *   above, then by position in the document.
 */
function extractSocialLinks(html) {
  const platformPatterns = [
    /https?:\/\/(?:www\.)?linkedin\.com\/company\/[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?twitter\.com\/[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?x\.com\/[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?facebook\.com\/[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?instagram\.com\/[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?youtube\.com\/(?:@|channel\/|c\/)[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?tiktok\.com\/@[^"'\s<>)]+/gi,
    /https?:\/\/(?:www\.)?github\.com\/[^"'\s<>)]+/gi
  ];
  const found = platformPatterns.flatMap((pattern) => html.match(pattern) ?? []);
  return Array.from(new Set(found));
}
|
|
180
|
+
/**
 * Extract same-origin links from the href attributes of an HTML string.
 *
 * Each href is resolved against `origin`; links whose origin differs are
 * dropped, and the fragment is removed before de-duplication. Hrefs that
 * carry a fragment (e.g. "/pricing#plans") are kept as their fragment-less
 * URL; pure in-page anchors ("#top") and mailto:/tel:/javascript:/data:
 * hrefs are ignored.
 *
 * @param {string} html - Raw HTML.
 * @param {string} origin - Site origin, e.g. "https://example.com".
 * @returns {string[]} Unique absolute URLs in order of first appearance.
 */
function parseInternalLinks(html, origin) {
  const urls = new Set();
  // The fragment is intentionally part of the capture; it is stripped after
  // URL resolution so that "/a#b" still yields "/a".
  const hrefPattern = /href=["']([^"']+)["']/gi;
  for (const [, rawHref] of html.matchAll(hrefPattern)) {
    const href = rawHref.trim();
    if (/^(mailto:|tel:|javascript:|data:|#)/i.test(href)) {
      continue;
    }
    try {
      const resolved = new URL(href, origin);
      if (resolved.origin === origin) {
        resolved.hash = "";
        urls.add(resolved.href);
      }
    } catch {
      // Unparsable href: ignore it.
    }
  }
  return [...urls];
}
|
|
200
|
+
/**
 * Extract the contents of every <loc> element from sitemap XML.
 *
 * Values may span several lines, may be wrapped in a CDATA section, and may
 * contain the five predefined XML entities (sitemap URLs must escape "&" as
 * "&amp;"); all of these are normalised so the returned strings are usable
 * as URLs.
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} Trimmed <loc> values in document order.
 */
function parseSitemapUrls(xml) {
  const XML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const urls = [];
  for (const [, rawLoc] of xml.matchAll(/<loc>([\s\S]*?)<\/loc>/g)) {
    const loc = rawLoc
      .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, "$1")
      .replace(/&(amp|lt|gt|quot|apos);/g, (_match, name) => XML_ENTITIES[name])
      .trim();
    urls.push(loc);
  }
  return urls;
}
|
|
209
|
+
/**
 * Download one page and summarise it.
 *
 * @param {string} url - Page URL; fetched with a 10-second timeout.
 * @returns {Promise<{page: object, html: string} | null>} The summary
 *   (`url`, `title`, `description`, `headings`, and the first 5000 characters
 *   of plain text) together with the raw HTML, or null when fetching or
 *   parsing throws.
 */
async function processPage(url) {
  try {
    const html = await fetchText(url, 1e4);
    const title = extractTitle(html);
    const description = extractDescription(html);
    const headings = extractHeadings(html);
    const textContent = stripHtml(html).slice(0, 5e3);
    return {
      page: { url, title, description, headings, textContent },
      html
    };
  } catch {
    return null;
  }
}
|
|
224
|
+
/**
 * Crawl a website starting at its homepage and collect page summaries plus
 * contact and social data.
 *
 * Flow: fetch the homepage (failure aborts with `result.error`), read
 * /sitemap.xml and /robots.txt (both optional), then crawl up to `maxPages`
 * same-origin pages one at a time with a 300 ms pause between requests.
 * Links found on crawled pages are appended while the queue is below
 * `maxPages`.
 *
 * URLs are de-duplicated on a key that ignores one trailing slash, so "/x"
 * and "/x/" count as the same page regardless of which is seen first. Links
 * to static assets (images, CSS, JS, fonts, media, archives, PDFs) are never
 * queued, so they do not use up the page budget.
 *
 * @param {string} baseUrl - Absolute homepage URL.
 * @param {object} [options]
 * @param {(message: string) => void} [options.progress] - Progress callback.
 * @param {number} [options.maxPages=25] - Maximum number of pages crawled in
 *   addition to the homepage.
 * @returns {Promise<object>} Result object; `error` is set when the URL is
 *   invalid or the homepage cannot be fetched.
 */
async function collectWebsite(baseUrl, options = {}) {
  const progress = options.progress || (() => {
  });
  const maxPages = options.maxPages ?? 25;
  const result = {
    url: baseUrl,
    title: "",
    description: "",
    keywords: [],
    socialLinks: [],
    emails: [],
    phones: [],
    schemaOrg: null,
    sitemapUrls: [],
    robotsTxt: "",
    rawHtml: "",
    pages: [],
    allEmails: [],
    allPhones: [],
    pageCount: 0
  };

  let origin;
  try {
    origin = new URL(baseUrl).origin;
  } catch {
    result.error = `Invalid URL: ${baseUrl}`;
    return result;
  }

  const ASSET_EXTENSION = /\.(jpg|jpeg|png|gif|svg|webp|ico|css|js|woff|woff2|ttf|eot|mp4|mp3|zip|gz|pdf)(\?|$)/i;
  const allEmailSet = new Set();
  const allPhoneSet = new Set();
  const allSocialSet = new Set();
  const seenUrlKeys = new Set();
  const urlsToCrawl = [];

  // Dedup key: the URL without a single trailing slash.
  const urlKey = (candidate) => candidate.replace(/\/$/, "");

  // Returns true exactly once per crawlable page URL; assets are never claimed.
  const claimUrl = (candidate) => {
    if (ASSET_EXTENSION.test(candidate)) {
      return false;
    }
    const key = urlKey(candidate);
    if (seenUrlKeys.has(key)) {
      return false;
    }
    seenUrlKeys.add(key);
    return true;
  };

  // Fold the emails, phones and social links of one document into the totals.
  const harvestContacts = (html) => {
    for (const email of extractEmails(html)) {
      allEmailSet.add(email);
    }
    for (const phone of extractPhones(html)) {
      allPhoneSet.add(phone);
    }
    for (const link of extractSocialLinks(html)) {
      allSocialSet.add(link);
    }
  };

  progress("Fetching homepage...");
  try {
    const html = await fetchText(baseUrl, 15e3);
    result.rawHtml = html;
    result.title = extractTitle(html);
    result.description = extractDescription(html);
    const kwMatch = html.match(
      /<meta[^>]+name=["']keywords["'][^>]+content=["']([^"']*)["']/i
    );
    if (kwMatch) {
      result.keywords = kwMatch[1].split(",").map((k) => k.trim()).filter(Boolean);
    }
    const schemaMatches = html.match(
      /<script[^>]+type=["']application\/ld\+json["'][^>]*>(.*?)<\/script>/gis
    );
    if (schemaMatches) {
      // Only the first JSON-LD block is parsed; invalid JSON leaves schemaOrg null.
      try {
        result.schemaOrg = JSON.parse(schemaMatches[0].replace(/<\/?script[^>]*>/gi, ""));
      } catch {
      }
    }
    harvestContacts(html);
    result.pages.push({
      url: baseUrl,
      title: result.title,
      description: result.description,
      headings: extractHeadings(html),
      textContent: stripHtml(html).slice(0, 5e3)
    });
    seenUrlKeys.add(urlKey(baseUrl));
    const homeLinks = parseInternalLinks(html, origin);
    for (const link of homeLinks) {
      if (claimUrl(link)) {
        urlsToCrawl.push(link);
      }
    }
    progress(`Homepage parsed. Found ${homeLinks.length} internal links.`);
  } catch (err) {
    result.error = err instanceof Error ? err.message : String(err);
    return result;
  }

  progress("Fetching sitemap.xml...");
  try {
    const sitemapXml = await fetchText(origin + "/sitemap.xml", 1e4);
    const sitemapUrls = parseSitemapUrls(sitemapXml);
    result.sitemapUrls = sitemapUrls.slice(0, 500);
    for (const sUrl of sitemapUrls) {
      try {
        const parsed = new URL(sUrl);
        if (parsed.origin === origin && claimUrl(sUrl)) {
          urlsToCrawl.push(sUrl);
        }
      } catch {
        // Not an absolute URL: skip this entry.
      }
    }
  } catch {
    progress("Sitemap: not found or inaccessible.");
  }

  await sleep(300);
  try {
    result.robotsTxt = await fetchText(origin + "/robots.txt", 5e3);
  } catch {
    // robots.txt is optional.
  }

  const pagesToCrawl = urlsToCrawl.slice(0, maxPages);
  progress(`Crawling up to ${pagesToCrawl.length} internal pages...`);
  for (let i = 0; i < pagesToCrawl.length; i++) {
    const pageUrl = pagesToCrawl[i];
    if (i > 0) {
      await sleep(300);
    }
    progress(`[${i + 1}/${pagesToCrawl.length}] Crawling: ${pageUrl}`);
    const fetched = await processPage(pageUrl);
    if (!fetched) {
      continue;
    }
    result.pages.push(fetched.page);
    harvestContacts(fetched.html);
    if (pagesToCrawl.length >= maxPages) {
      continue;
    }
    // The queue still has room: extend it with links discovered on this page.
    for (const link of parseInternalLinks(fetched.html, origin)) {
      if (pagesToCrawl.length >= maxPages) {
        break;
      }
      if (claimUrl(link)) {
        pagesToCrawl.push(link);
      }
    }
  }

  result.allEmails = [...allEmailSet];
  result.allPhones = [...allPhoneSet];
  result.socialLinks = [...allSocialSet];
  result.emails = result.allEmails;
  result.phones = result.allPhones;
  result.pageCount = result.pages.length;
  progress(
    `Website crawl complete: ${result.pageCount} pages, ${result.allEmails.length} emails, ${result.allPhones.length} phones, ${result.socialLinks.length} social profiles.`
  );
  return result;
}
|
|
369
|
+
|
|
370
|
+
// src/collectors/wayback.ts
|
|
371
|
+
/**
 * Convert a timestamp whose first six characters are YYYYMM into "YYYY-MM".
 *
 * @param {string} ts - Timestamp string.
 * @returns {string} Characters 0-3, a hyphen, then characters 4-5.
 */
function timestampToMonth(ts) {
  const year = ts.slice(0, 4);
  const month = ts.slice(4, 6);
  return `${year}-${month}`;
}
|
|
374
|
+
/**
 * Format a timestamp that starts with YYYYMMDD as "YYYY-MM-DD".
 *
 * @param {string} ts - Timestamp string.
 * @returns {string} Formatted date; input shorter than eight characters is
 *   returned unchanged.
 */
function formatTimestamp(ts) {
  const hasFullDate = ts.length >= 8;
  if (!hasFullDate) {
    return ts;
  }
  return [ts.slice(0, 4), ts.slice(4, 6), ts.slice(6, 8)].join("-");
}
|
|
380
|
+
/**
 * Build the web.archive.org URL of a capture.
 *
 * @param {string} timestamp - Capture timestamp.
 * @param {string} originalUrl - URL that was captured.
 * @returns {string} `https://web.archive.org/web/<timestamp>/<originalUrl>`.
 */
function waybackUrl(timestamp, originalUrl) {
  const archiveRoot = "https://web.archive.org/web/";
  return archiveRoot + `${timestamp}/${originalUrl}`;
}
|
|
383
|
+
/**
 * Summarise the Wayback Machine history of a domain using three CDX queries
 * issued 1.5 seconds apart:
 *   1. URLs under the domain (collapsed on `original`, limit 2000)
 *   2. PDF captures (limit 500)
 *   3. a timestamp-only listing collapsed to six timestamp digits (limit 5000)
 *
 * A failure or empty answer for query 1 ends the function early with
 * `result.error` set. Failures of queries 2 and 3 are only reported through
 * the progress callback.
 *
 * @param {string} domainInput - Domain or URL; normalised with `toDomain`.
 * @param {(message: string) => void} [progressCallback] - Progress reporter.
 * @returns {Promise<object>} Wayback summary.
 */
async function collectWayback(domainInput, progressCallback) {
  const progress = progressCallback || (() => {
  });
  const domain = toDomain(domainInput);
  const describeError = (err) => err instanceof Error ? err.message : String(err);
  const cdxQuery = (params) => `https://web.archive.org/cdx/search/cdx?url=${domain}/*&output=json&${params}`;
  const result = {
    domain,
    totalCaptures: 0,
    firstCapture: "",
    lastCapture: "",
    uniqueUrls: [],
    pdfUrls: [],
    pdfWaybackUrls: [],
    deletedPages: [],
    contentTypeDistribution: {},
    captureTimeline: [],
    capturesPerMonth: 0,
    siteGrowthSummary: ""
  };

  // ---- Query 1: URL inventory -------------------------------------------
  progress("Wayback: Querying all unique URLs...");
  try {
    const data = await fetchJSON(
      cdxQuery("fl=timestamp,original,statuscode,mimetype&collapse=original&limit=2000"),
      3e4
    );
    // The first element is skipped as a header row, so fewer than two entries means no data.
    if (!Array.isArray(data) || data.length < 2) {
      result.error = "No Wayback data found";
      return result;
    }
    const rows = data.slice(1);
    result.totalCaptures = rows.length;

    const orderedStamps = rows.map((row) => String(row[0])).sort();
    result.firstCapture = formatTimestamp(orderedStamps[0] || "");
    result.lastCapture = formatTimestamp(orderedStamps[orderedStamps.length - 1] || "");

    // First rule whose keyword occurs in the MIME type decides the category.
    const mimeRules = [
      ["html", "HTML"],
      ["pdf", "PDF"],
      ["image", "Image"],
      ["javascript", "JavaScript"],
      ["ecmascript", "JavaScript"],
      ["css", "CSS"],
      ["json", "JSON"],
      ["xml", "XML"]
    ];
    const seenUrls = new Set();
    const mimeCount = {};
    const historyByUrl = new Map();
    for (const row of rows) {
      const ts = String(row[0]);
      const url = String(row[1]);
      const status = String(row[2]);
      const mime = String(row[3] || "unknown").split(";")[0].trim().toLowerCase();

      seenUrls.add(url);

      const rule = mimeRules.find(([keyword]) => mime.includes(keyword));
      const category = rule ? rule[1] : "Other";
      mimeCount[category] = (mimeCount[category] || 0) + 1;

      let history = historyByUrl.get(url);
      if (!history) {
        history = { statuses: [], timestamps: [] };
        historyByUrl.set(url, history);
      }
      history.statuses.push(status);
      history.timestamps.push(ts);
    }
    result.uniqueUrls = [...seenUrls].slice(0, 500);
    result.contentTypeDistribution = mimeCount;

    // A URL counts as deleted when it returned 200 at some point but its last
    // listed status is not 200.
    // NOTE(review): with collapse=original each URL presumably appears only
    // once in this result, in which case the condition can never hold.
    // Confirm against the CDX API.
    for (const [url, history] of historyByUrl) {
      const lastStatus = history.statuses[history.statuses.length - 1];
      if (history.statuses.includes("200") && lastStatus !== "200") {
        result.deletedPages.push(url);
      }
    }
    progress(
      `Wayback Q1: ${result.uniqueUrls.length} unique URLs, first: ${result.firstCapture}, last: ${result.lastCapture}`
    );
  } catch (err) {
    result.error = "CDX query 1 failed: " + describeError(err);
    return result;
  }

  // ---- Query 2: PDFs ----------------------------------------------------
  await sleep(1500);
  progress("Wayback: Querying PDFs...");
  try {
    const pdfData = await fetchJSON(
      cdxQuery("fl=timestamp,original,statuscode,mimetype&mimetype=application/pdf&limit=500"),
      2e4
    );
    if (Array.isArray(pdfData) && pdfData.length > 1) {
      // Keep the greatest timestamp seen for each PDF URL.
      const latestByPdf = new Map();
      for (const row of pdfData.slice(1)) {
        const original = String(row[1]);
        const ts = String(row[0]);
        const known = latestByPdf.get(original);
        if (known === undefined || ts > known) {
          latestByPdf.set(original, ts);
        }
      }
      result.pdfUrls = [...latestByPdf.keys()];
      result.pdfWaybackUrls = [...latestByPdf.entries()]
        .slice(0, 10)
        .map(([original, ts]) => waybackUrl(ts, original));
      progress(`Wayback Q2: ${result.pdfUrls.length} unique PDFs found.`);
    } else {
      progress("Wayback Q2: No PDFs found.");
    }
  } catch (err) {
    progress("Wayback Q2 failed: " + describeError(err));
  }

  // ---- Query 3: capture timeline ----------------------------------------
  await sleep(1500);
  progress("Wayback: Querying capture timeline...");
  try {
    const timelineData = await fetchJSON(
      cdxQuery("fl=timestamp&collapse=timestamp:6&limit=5000"),
      2e4
    );
    if (Array.isArray(timelineData) && timelineData.length > 1) {
      const timelineRows = timelineData.slice(1);
      const perMonth = new Map();
      for (const row of timelineRows) {
        const monthKey = timestampToMonth(String(row[0]));
        perMonth.set(monthKey, (perMonth.get(monthKey) || 0) + 1);
      }
      const sortedMonths = [...perMonth.entries()].sort(
        (left, right) => left[0].localeCompare(right[0])
      );
      const sumCounts = (entries) => entries.reduce((sum, [, count]) => sum + count, 0);
      result.captureTimeline = sortedMonths.map(([month, count]) => ({ month, count }));

      if (sortedMonths.length > 0) {
        const totalMonthCaptures = sumCounts(sortedMonths);
        result.capturesPerMonth = Math.round(totalMonthCaptures / sortedMonths.length * 10) / 10;
      }
      if (sortedMonths.length >= 2) {
        const firstYear = sortedMonths[0][0].slice(0, 4);
        const lastYear = sortedMonths[sortedMonths.length - 1][0].slice(0, 4);
        const yearSpan = parseInt(lastYear) - parseInt(firstYear) + 1;
        // Compare average monthly activity of the earlier half with the later half.
        const mid = Math.floor(sortedMonths.length / 2);
        const firstHalfAvg = sumCounts(sortedMonths.slice(0, mid)) / Math.max(mid, 1);
        const secondHalfAvg = sumCounts(sortedMonths.slice(mid)) / (sortedMonths.length - mid);
        const growthPct = firstHalfAvg > 0 ? Math.round((secondHalfAvg - firstHalfAvg) / firstHalfAvg * 100) : 0;
        result.siteGrowthSummary = `${yearSpan} years of captures (${firstYear}-${lastYear}). ${timelineRows.length} total timeline entries across ${sortedMonths.length} months. Avg ${result.capturesPerMonth} captures/month. Second-half activity ${growthPct >= 0 ? "+" : ""}${growthPct}% vs first-half.`;
      }
      progress(`Wayback Q3: ${result.captureTimeline.length} months of activity.`);
    } else {
      progress("Wayback Q3: No timeline data.");
    }
  } catch (err) {
    progress("Wayback Q3 failed: " + describeError(err));
  }

  progress(
    `Wayback complete: ${result.totalCaptures} captures, ${result.pdfUrls.length} PDFs, ${result.deletedPages.length} deleted pages.`
  );
  return result;
}
|
|
516
|
+
|
|
517
|
+
// src/collectors/techstack.ts
|
|
518
|
+
/**
 * Guess the technology behind a page from marker strings in its HTML.
 *
 * Detects a CMS (first matching rule wins), the meta generator tag, Google
 * Analytics (G-… / UA-…) and Tag Manager (GTM-…) ids, advertising pixels,
 * front-end frameworks and a CDN (first matching rule wins).
 *
 * Framework names and the "ttd" marker must start at a word boundary, so
 * ordinary words that merely contain them ("revenue", "avenue",
 * "rectangular") no longer produce false positives.
 *
 * @param {string} html - Raw HTML of a page.
 * @returns {object} `{ cms, analyticsIds, gtmIds, adPixels, cdn, frameworks,
 *   metaGenerator }`; `error` is added when detection throws.
 */
function extractTechStack(html) {
  const result = {
    cms: "Unknown",
    analyticsIds: [],
    gtmIds: [],
    adPixels: [],
    cdn: "",
    frameworks: [],
    metaGenerator: ""
  };
  try {
    const hasAny = (...markers) => markers.some((marker) => html.includes(marker));

    // CMS: ordered rules, first hit wins.
    if (hasAny("wix.com", "X-Wix")) {
      result.cms = "Wix";
    } else if (hasAny("wp-content", "wordpress")) {
      result.cms = "WordPress";
    } else if (hasAny("Shopify.theme", "cdn.shopify")) {
      result.cms = "Shopify";
    } else if (hasAny("squarespace.com", "static1.squarespace")) {
      result.cms = "Squarespace";
    } else if (hasAny("webflow.com")) {
      result.cms = "Webflow";
    } else if (hasAny("ghost.io")) {
      result.cms = "Ghost";
    } else if (hasAny("hubspot")) {
      result.cms = "HubSpot";
    }

    const genMatch = html.match(
      /<meta[^>]+name=["']generator["'][^>]+content=["']([^"']*)["']/i
    );
    if (genMatch) {
      result.metaGenerator = genMatch[1];
    }

    // Analytics / tag-manager ids, de-duplicated per id family.
    const ga4Matches = html.match(/G-[A-Z0-9]{6,}/g);
    const uaMatches = html.match(/UA-\d{6,}-\d/g);
    if (ga4Matches) {
      result.analyticsIds.push(...new Set(ga4Matches));
    }
    if (uaMatches) {
      result.analyticsIds.push(...new Set(uaMatches));
    }
    const gtmMatches = html.match(/GTM-[A-Z0-9]{4,}/g);
    if (gtmMatches) {
      result.gtmIds.push(...new Set(gtmMatches));
    }

    // Advertising pixels.
    const fbPixel = html.match(/fbq\(['"]init['"],\s*['"](\d+)['"]/);
    if (fbPixel) {
      result.adPixels.push("Meta Pixel: " + fbPixel[1]);
    }
    if (/tiktok[^"']*pixel/i.test(html)) {
      result.adPixels.push("TikTok Pixel");
    }
    if (/linkedin\.com\/px|_linkedin_partner_id/i.test(html)) {
      result.adPixels.push("LinkedIn Insight Tag");
    }
    if (/thetradedesk|\bttd/i.test(html)) {
      result.adPixels.push("The Trade Desk");
    }

    // Frameworks: a lower-case token at a word boundary, or an explicit global marker.
    if (/\breact/.test(html) || html.includes("__NEXT_DATA__")) {
      result.frameworks.push("React/Next.js");
    }
    if (/\bvue/.test(html) || html.includes("__VUE__")) {
      result.frameworks.push("Vue.js");
    }
    if (/\bangular/.test(html)) {
      result.frameworks.push("Angular");
    }
    if (/\bsvelte/.test(html)) {
      result.frameworks.push("Svelte");
    }

    // CDN: ordered rules, first hit wins.
    if (hasAny("cloudflare")) {
      result.cdn = "Cloudflare";
    } else if (hasAny("fastly")) {
      result.cdn = "Fastly";
    } else if (hasAny("akamai")) {
      result.cdn = "Akamai";
    } else if (hasAny("cloudfront")) {
      result.cdn = "CloudFront (AWS)";
    }
  } catch (err) {
    result.error = err instanceof Error ? err.message : String(err);
  }
  return result;
}
|
|
601
|
+
|
|
602
|
+
// src/collectors/search.ts
|
|
603
|
+
/**
 * Probe a URL with a HEAD request, without following redirects.
 *
 * @param {string} url - URL to probe.
 * @param {number} [timeoutMs=8000] - Milliseconds before the request is aborted.
 * @returns {Promise<boolean>} True when the status is 200, 301 or 302; false
 *   for any other status and for network errors or timeouts.
 */
async function urlExists(url, timeoutMs = 8e3) {
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      method: "HEAD",
      redirect: "manual",
      headers: { "User-Agent": USER_AGENT },
      signal: abort.signal
    });
    return [200, 301, 302].includes(response.status);
  } catch {
    return false;
  } finally {
    clearTimeout(timeoutHandle);
  }
}
|
|
621
|
+
/**
 * Derive candidate profile handles from a company name.
 *
 * The lower-cased name is broken into runs of [a-z0-9]; the runs are joined
 * with nothing, "-" and "_". When removing corporate suffix words (inc, llc,
 * corp, ...) changes the name, the concatenated and hyphenated forms of the
 * shortened name are added as well.
 *
 * @param {string} companyName - Company name as entered.
 * @returns {string[]} Unique slugs longer than one character, in the order
 *   described above.
 */
function generateSlugs(companyName) {
  const base = companyName.toLowerCase().trim();
  // Runs of characters other than a-z / 0-9 act as word separators.
  const wordsOf = (text) => text.split(/[^a-z0-9]+/).filter((word) => word.length > 0);

  const baseWords = wordsOf(base);
  const candidates = [baseWords.join(""), baseWords.join("-"), baseWords.join("_")];

  const withoutSuffix = base
    .replace(/\b(inc|llc|corp|corporation|ltd|limited|co|company|group|holdings)\b/gi, "")
    .trim();
  if (withoutSuffix && withoutSuffix !== base) {
    const coreWords = wordsOf(withoutSuffix);
    candidates.push(coreWords.join(""), coreWords.join("-"));
  }

  return [...new Set(candidates)].filter((slug) => slug.length > 1);
}
|
|
640
|
+
/**
 * Tell whether a path segment from a social URL is a generic site page
 * (share, login, about, ...) rather than an account handle.
 *
 * @param {string} handle - Path segment; compared case-insensitively.
 * @returns {boolean} True when the segment is one of the generic names.
 */
function isGenericHandle(handle) {
  const genericNames = [
    "share", "sharer", "intent", "home", "search", "login",
    "signup", "about", "help", "support", "privacy", "terms",
    "policy", "hashtag", "explore", "settings", "legal"
  ];
  const normalized = handle.toLowerCase();
  return genericNames.includes(normalized);
}
|
|
662
|
+
/**
 * Search USASpending.gov for awards whose recipient matches a company name.
 *
 * Sends one POST to the spending_by_award endpoint (award type codes A-D,
 * from 2010-01-01 to today, top 25 by total obligation) with a 20-second
 * timeout.
 *
 * @param {string} companyName - Recipient search text.
 * @param {(message: string) => void} progress - Progress reporter.
 * @returns {Promise<{awards: number, total: number, contracts: object[]}>}
 *   `awards` is `page_metadata.total` from the response, `total` is the sum
 *   of obligations over the returned rows. On a non-OK status or any error
 *   the zero result `{ awards: 0, total: 0, contracts: [] }` is returned.
 */
async function queryUSASpending(companyName, progress) {
  progress("Search: Querying USASpending.gov...");
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), 2e4);
  try {
    const today = new Date().toISOString().split("T")[0];
    const body = JSON.stringify({
      filters: {
        recipient_search_text: [companyName],
        time_period: [
          { start_date: "2010-01-01", end_date: today }
        ],
        award_type_codes: ["A", "B", "C", "D"]
      },
      fields: [
        "Award ID",
        "Recipient Name",
        "Total Obligation",
        "Awarding Agency",
        "Start Date",
        "End Date",
        "Description"
      ],
      limit: 25,
      page: 1,
      sort: "Total Obligation",
      order: "desc"
    });
    const response = await fetch("https://api.usaspending.gov/api/v2/search/spending_by_award/", {
      method: "POST",
      signal: abort.signal,
      headers: { "Content-Type": "application/json", "User-Agent": USER_AGENT },
      body
    });
    if (!response.ok) {
      progress(`USASpending: HTTP ${response.status}`);
      return { awards: 0, total: 0, contracts: [] };
    }

    const data = await response.json();
    const awards = data.page_metadata?.total || 0;
    const rows = Array.isArray(data.results) ? data.results : [];
    // Missing fields become empty strings.
    const textField = (row, key) => String(row[key] ?? "");

    let total = 0;
    const contracts = [];
    for (const row of rows) {
      const parsedAmount = parseFloat(String(row["Total Obligation"] ?? "0"));
      // Unparsable amounts count as 0 both in the sum and in the record.
      const obligation = Number.isNaN(parsedAmount) ? 0 : parsedAmount;
      total += obligation;
      contracts.push({
        awardId: textField(row, "Award ID"),
        recipientName: textField(row, "Recipient Name"),
        totalObligation: obligation,
        awardingAgency: textField(row, "Awarding Agency"),
        startDate: textField(row, "Start Date"),
        endDate: textField(row, "End Date"),
        description: textField(row, "Description")
      });
    }
    progress(`USASpending: ${awards} awards, $${Math.round(total).toLocaleString()} total.`);
    return { awards, total, contracts };
  } catch (err) {
    progress("USASpending: query failed \u2014 " + (err instanceof Error ? err.message : String(err)));
    return { awards: 0, total: 0, contracts: [] };
  } finally {
    clearTimeout(timeoutHandle);
  }
}
|
|
729
|
+
/**
 * Guess social-profile URLs for a company by probing slug-based URLs.
 *
 * For every platform the slugs from `generateSlugs` are tried in order, with
 * a 700 ms pause before each probe; the first URL that `urlExists` accepts is
 * recorded and the remaining slugs for that platform are skipped.
 *
 * @param {string} companyName - Company name used to derive slugs.
 * @param {(message: string) => void} progress - Progress reporter.
 * @returns {Promise<Array<{platform: string, url: string, source: string}>>}
 *   At most one profile per platform.
 */
async function discoverSocialProfiles(companyName, progress) {
  const slugs = generateSlugs(companyName);
  const platforms = [
    { name: "LinkedIn", template: (s) => `https://www.linkedin.com/company/${s}` },
    { name: "Twitter/X", template: (s) => `https://x.com/${s}` },
    { name: "Facebook", template: (s) => `https://www.facebook.com/${s}` },
    { name: "Instagram", template: (s) => `https://www.instagram.com/${s}` },
    { name: "YouTube", template: (s) => `https://www.youtube.com/@${s}` },
    { name: "TikTok", template: (s) => `https://www.tiktok.com/@${s}` },
    { name: "GitHub", template: (s) => `https://github.com/${s}` }
  ];
  const profiles = [];
  const probedUrls = new Set();

  progress(
    `Search: Probing social profiles for ${slugs.length} slug(s) across ${platforms.length} platforms...`
  );

  for (const { name, template } of platforms) {
    for (const slug of slugs) {
      const url = template(slug);
      if (probedUrls.has(url)) {
        continue;
      }
      probedUrls.add(url);

      const alreadyFound = profiles.some((profile) => profile.platform === name);
      if (alreadyFound) {
        break;
      }

      // Pause before each probe.
      await sleep(700);
      const exists = await urlExists(url);
      if (!exists) {
        continue;
      }
      profiles.push({ platform: name, url, source: "HEAD probe" });
      progress(`Search: Found ${name} profile.`);
      break;
    }
  }

  progress(`Search: ${profiles.length} social profiles discovered via probing.`);
  return profiles;
}
|
|
767
|
+
/**
 * Scan crawled page text for links to known social / company-profile sites.
 *
 * All pages are joined into one string and matched against one URL pattern per
 * platform. A hit is kept once per `platform + lower-cased handle`; hits whose
 * handle is rejected by `isGenericHandle` are dropped. Patterns without a
 * capture group (Glassdoor) use the whole URL as the handle.
 *
 * @param {string[]} pageContents - Raw HTML / text of each crawled page.
 * @returns {Array<{platform: string, url: string, source: string}>}
 *   Profiles grouped by platform (pattern order), then by order of appearance.
 */
function extractSocialFromPages(pageContents) {
  const patternsByPlatform = new Map([
    ["LinkedIn", /https?:\/\/(?:www\.)?linkedin\.com\/company\/([a-zA-Z0-9_-]+)/g],
    ["Twitter/X", /https?:\/\/(?:www\.)?(?:twitter\.com|x\.com)\/([a-zA-Z0-9_]+)/g],
    ["Facebook", /https?:\/\/(?:www\.)?facebook\.com\/([a-zA-Z0-9._-]+)/g],
    ["Instagram", /https?:\/\/(?:www\.)?instagram\.com\/([a-zA-Z0-9._]+)/g],
    ["YouTube", /https?:\/\/(?:www\.)?youtube\.com\/(?:@|channel\/|c\/|user\/)([a-zA-Z0-9_-]+)/g],
    ["GitHub", /https?:\/\/(?:www\.)?github\.com\/([a-zA-Z0-9_-]+)/g],
    ["TikTok", /https?:\/\/(?:www\.)?tiktok\.com\/@([a-zA-Z0-9._]+)/g],
    ["Pinterest", /https?:\/\/(?:www\.)?pinterest\.com\/([a-zA-Z0-9_-]+)/g],
    ["Glassdoor", /https?:\/\/(?:www\.)?glassdoor\.com\/Overview\/[^"'\s]+/g],
    ["Crunchbase", /https?:\/\/(?:www\.)?crunchbase\.com\/organization\/([a-zA-Z0-9_-]+)/g]
  ]);

  const haystack = pageContents.join("\n");
  const claimed = new Set();
  const found = [];

  for (const [platform, pattern] of patternsByPlatform) {
    for (const hit of haystack.matchAll(pattern)) {
      const [fullUrl, captured] = hit;
      const handle = captured || fullUrl;
      const dedupeKey = platform + ":" + handle.toLowerCase();
      if (claimed.has(dedupeKey) || isGenericHandle(handle)) {
        continue;
      }
      claimed.add(dedupeKey);
      found.push({
        platform,
        url: fullUrl.startsWith("http") ? fullUrl : "https://" + fullUrl,
        source: "website crawl"
      });
    }
  }
  return found;
}
|
|
801
|
+
/**
 * Run the "public search" collectors for a company name.
 *
 * Three independent, individually guarded steps fill the result:
 *   1. `queryUSASpending` for federal award data;
 *   2. `extractSocialFromPages` over `options.pageContents`, when provided;
 *   3. `discoverSocialProfiles` URL probing, unless `options.probeSocial === false`.
 * A failure in any step is reported through `progress` and leaves the other
 * steps' data intact. Probed profiles only fill platforms not already found.
 *
 * @param {string} companyName
 * @param {{progress?: (msg: string) => void, pageContents?: string[], probeSocial?: boolean}} [options]
 * @returns {Promise<{companyName: string, usaSpendingAwards: number, usaSpendingTotal: number,
 *   usaSpendingContracts: object[], socialProfiles: object[]}>}
 */
async function collectSearch(companyName, options = {}) {
  const progress = options.progress || (() => {
  });
  const reason = (err) => (err instanceof Error ? err.message : String(err));
  const result = {
    companyName,
    usaSpendingAwards: 0,
    usaSpendingTotal: 0,
    usaSpendingContracts: [],
    socialProfiles: []
  };

  // Step 1: federal spending.
  try {
    const spending = await queryUSASpending(companyName, progress);
    result.usaSpendingAwards = spending.awards;
    result.usaSpendingTotal = spending.total;
    result.usaSpendingContracts = spending.contracts;
  } catch (err) {
    progress("USASpending error: " + reason(err));
  }

  // Step 2: links already present in the crawled pages.
  const pages = options.pageContents;
  if (pages && pages.length > 0) {
    try {
      const fromPages = extractSocialFromPages(pages);
      const knownKeys = new Set(result.socialProfiles.map((p) => p.platform + ":" + p.url));
      for (const profile of fromPages) {
        const profileKey = profile.platform + ":" + profile.url;
        if (knownKeys.has(profileKey)) {
          continue;
        }
        result.socialProfiles.push(profile);
        knownKeys.add(profileKey);
      }
    } catch (err) {
      progress("Social extraction error: " + reason(err));
    }
  }

  // Step 3: probing; only adds platforms the crawl did not already yield.
  if (options.probeSocial !== false) {
    try {
      const probed = await discoverSocialProfiles(companyName, progress);
      const coveredPlatforms = new Set(result.socialProfiles.map((p) => p.platform));
      for (const candidate of probed) {
        if (coveredPlatforms.has(candidate.platform)) {
          continue;
        }
        result.socialProfiles.push(candidate);
        coveredPlatforms.add(candidate.platform);
      }
    } catch (err) {
      progress("Social probe error: " + reason(err));
    }
  }

  progress(
    `Search complete: ${result.usaSpendingAwards} gov awards, ${result.socialProfiles.length} social profiles.`
  );
  return result;
}
|
|
853
|
+
|
|
854
|
+
// src/core.ts
|
|
855
|
+
// Section ids in their canonical order. Used to validate requested sections
// and as the default set when none are requested.
var SECTIONS = ["overview", "people", "hiring", "money", "locations", "tech", "news", "relationships", "risk"];

// Generator label and project homepage stamped into dossier metadata.
var GENERATOR = "company-dossier v0.1.0";
var HOMEPAGE = "https://companydossier.lol";

// Placeholder rendered wherever no public data was found for a subsection.
var GAP = "_Gap: no public data found \u2014 requires manual research._";
|
|
869
|
+
/**
 * Pick the best available display name for the company.
 *
 * Preference order:
 *   1. `website.schemaOrg.name`, when it is a non-blank string;
 *   2. the target itself, when it is not a domain (the caller passed a name);
 *   3. the first segment of the page `<title>` (at most 60 characters);
 *   4. the title-cased first label of the domain.
 *
 * The title is split on `|`, `·`, en/em dashes, and on hyphens or colons that
 * are used as separators (hyphen surrounded by whitespace, colon followed by
 * whitespace). A bare hyphen or colon inside a word is kept, so
 * "Coca-Cola | Home" yields "Coca-Cola" rather than "Coca".
 *
 * @param {string} target - Domain, URL or company name supplied by the caller.
 * @param {{schemaOrg?: unknown, title?: string}} [website] - Website crawl result, if any.
 * @returns {string} The derived company name.
 */
function deriveCompanyName(target, website) {
  const schemaName = website?.schemaOrg && typeof website.schemaOrg === "object" ? website.schemaOrg.name : void 0;
  if (typeof schemaName === "string" && schemaName.trim()) {
    return schemaName.trim();
  }
  if (!looksLikeDomain(target)) {
    return target.trim();
  }
  if (website?.title) {
    // Hyphens and colons only count as separators when set off by whitespace.
    const seg = website.title.split(/[|–—·]|\s-+\s|:+(?=\s)/)[0].trim();
    if (seg && seg.length <= 60) {
      return seg;
    }
  }
  const domain = toDomain(target);
  const root = domain.split(".")[0];
  return titleCase(root);
}
|
|
887
|
+
/**
 * Build the Markdown attribution footer appended below sourced content:
 * a blank line followed by a `> Source: <label>` block quote.
 *
 * @param {string} label - Description of where the data came from.
 * @returns {string}
 */
function sourceLine(label) {
  return ["", "", `> Source: ${label}`].join("\n");
}
|
|
892
|
+
/**
 * Render section 1 ("Overview & Identity") as Markdown.
 *
 * Values placed inside the summary table are sanitised first: `|` is escaped
 * and line breaks become spaces, because page titles and meta descriptions
 * routinely contain them and would otherwise split or truncate the table row.
 * The free-text "Description" section still shows the description verbatim.
 *
 * @param {{companyName: string, websiteUrl: string, domain: string}} meta
 * @param {{website?: object}} d - Collected data; only `d.website` is read.
 * @returns {string} Markdown document ending in a newline.
 */
function renderOverview(meta, d) {
  const w = d.website;
  const schema = w?.schemaOrg && typeof w.schemaOrg === "object" ? JSON.stringify(w.schemaOrg, null, 2) : null;
  const desc = w?.description || "";
  // Make a value safe to embed in a single Markdown table cell.
  const cell = (value) => String(value).replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|");
  const nameSource = schema ? "Schema.org / HTML" : w?.title ? "HTML <title>" : "derived from domain";
  const keywords = w?.keywords?.length ? w.keywords.join(", ") : "N/A";
  const descriptionBlock = desc ? desc + sourceLine(`${meta.websiteUrl} meta description`) : GAP;
  const schemaBlock = schema
    ? "```json\n" + schema + "\n```" + sourceLine(`${meta.websiteUrl} <script type="application/ld+json">`)
    : "_No schema.org JSON-LD found on the homepage._";
  return `# 1. Overview & Identity \u2014 ${meta.companyName}

| Field | Value | Source |
|-------|-------|--------|
| Company name | ${cell(meta.companyName)} | ${nameSource} |
| Website | ${cell(meta.websiteUrl || "N/A")} | input |
| Domain | ${cell(meta.domain)} | input |
| Page title | ${cell(w?.title || "N/A")} | HTML \`<title>\` |
| Description | ${cell(desc || "N/A")} | meta description |
| Pages crawled | ${w?.pageCount ?? 0} | website crawler |
| Keywords | ${cell(keywords)} | meta keywords |

## Description

${descriptionBlock}

## Schema.org (JSON-LD)

${schemaBlock}
`;
}
|
|
917
|
+
/**
 * Render section 2 ("People & Org Chart") as Markdown.
 *
 * Lists every e-mail address found by the crawl, then the subset whose local
 * part looks like a personal name (`first` or `first.last`). Well-known role
 * mailboxes such as `info@` or `sales@` also fit that shape, so they are
 * excluded from the "likely individual" list.
 *
 * @param {{companyName: string}} meta
 * @param {{website?: {allEmails?: string[]}}} d - Collected data; only `d.website` is read.
 * @returns {string} Markdown document ending in a newline.
 */
function renderPeople(meta, d) {
  const w = d.website;
  const emails = w?.allEmails || [];
  // Shared/role mailboxes (RFC 2142 names plus common marketing-site aliases).
  const roleMailboxes = new Set([
    "abuse", "accounts", "admin", "billing", "careers", "contact", "enquiries", "hello",
    "help", "hostmaster", "hr", "info", "inquiries", "jobs", "legal", "mail", "marketing",
    "media", "no-reply", "noreply", "office", "postmaster", "press", "privacy", "sales",
    "security", "service", "support", "team", "webmaster"
  ]);
  const personEmails = emails.filter((e) => {
    if (!/^[a-z]+([._-][a-z]+)?@/i.test(e)) {
      return false;
    }
    const localPart = e.slice(0, e.indexOf("@")).toLowerCase();
    return !roleMailboxes.has(localPart);
  });
  const contactList = emails.length ? emails.map((e) => `- \`${e}\``).join("\n") + sourceLine("website crawl") : GAP;
  const individualList = personEmails.length
    ? personEmails.map((e) => `- \`${e}\` (name-pattern email)`).join("\n") +
      sourceLine("inferred from email format \u2014 unverified")
    : "_No personal-pattern emails found._";
  return `# 2. People & Org Chart \u2014 ${meta.companyName}

> People data from public pages is limited. Named individuals, titles, and an
> org chart usually require LinkedIn, press, or filings \u2014 marked as gaps below.

## Contact emails (${emails.length})

${contactList}

## Likely individual contacts

${individualList}

## Leadership & org chart

${GAP}
`;
}
|
|
939
|
+
/**
 * Render section 3 ("Hiring Radar") as Markdown.
 *
 * Reports crawled pages whose URL or title mentions careers/jobs keywords and
 * up to 30 sitemap URLs matching a narrower keyword set. Open roles and
 * headcount are always emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{website?: {pages?: Array<{url: string, title: string}>, sitemapUrls?: string[]}}} d
 * @returns {string} Markdown document ending in a newline.
 */
function renderHiring(meta, d) {
  const site = d.website;
  const pageKeywords = /career|job|hiring|join|work-with-us|vacanc|recruit/i;
  const sitemapKeywords = /career|job|hiring|recruit/i;

  const careerPages = (site?.pages || []).filter((page) => pageKeywords.test(page.url + " " + page.title));
  const sitemapCareer = (site?.sitemapUrls || []).filter((url) => sitemapKeywords.test(url));

  let pagesBlock = "_No careers pages found during crawl._";
  if (careerPages.length) {
    const links = careerPages.map((page) => `- [${page.title || page.url}](${page.url})`);
    pagesBlock = links.join("\n") + sourceLine("website crawl");
  }

  let sitemapBlock = "_None in sitemap (or no sitemap available)._";
  if (sitemapCareer.length) {
    const bullets = sitemapCareer.slice(0, 30).map((url) => `- ${url}`);
    sitemapBlock = bullets.join("\n") + sourceLine("sitemap.xml");
  }

  return `# 3. Hiring Radar \u2014 ${meta.companyName}

> Signals of growth and open roles, derived from the company's own site.

## Careers / jobs pages found

${pagesBlock}

## Career URLs in sitemap

${sitemapBlock}

## Open roles & headcount trend

${GAP}
`;
}
|
|
962
|
+
/**
 * Render section 4 ("Money Trail") as Markdown.
 *
 * When USASpending contracts were collected, they are shown as a table with an
 * award-count / total-obligation summary; the total falls back to the sum of
 * the listed contracts when the collected total is falsy. Descriptions are cut
 * to 80 characters and have `|` replaced by `/` so they fit a table cell.
 * Funding, revenue and valuation are always emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{search?: object}} d - Collected data; only `d.search` is read.
 * @returns {string} Markdown document ending in a newline.
 */
function renderMoney(meta, d) {
  const search = d.search;
  const contracts = search?.usaSpendingContracts || [];
  const dollars = (amount) => Math.round(amount).toLocaleString();

  let federalBlock = "### Federal contracts\n\n_No USASpending.gov contract awards matched this name._";
  if (contracts.length) {
    const total = search?.usaSpendingTotal || contracts.reduce((sum, c) => sum + c.totalObligation, 0);
    const awardCount = search?.usaSpendingAwards || contracts.length;
    const rows = contracts
      .map((c) => {
        const summary = (c.description || "").slice(0, 80).replace(/\|/g, "/");
        return `| ${c.awardId} | ${c.awardingAgency} | $${dollars(c.totalObligation)} | ${c.startDate} \u2192 ${c.endDate} | ${summary} |`;
      })
      .join("\n");
    federalBlock = `### Federal contracts (USASpending.gov)

> ${awardCount} awards found; top ${contracts.length} shown. Total obligated: $${dollars(total)}

| Award ID | Agency | Amount | Period | Description |
|----------|--------|--------|--------|-------------|
${rows}${sourceLine("api.usaspending.gov")}`;
  }

  return `# 4. Money Trail \u2014 ${meta.companyName}

> Public financial signals. Revenue, funding, and valuation for private
> companies are rarely public and are marked as gaps.

${federalBlock}

## Funding, revenue & valuation

${GAP}
`;
}
|
|
990
|
+
/**
 * Render section 5 ("Locations") as Markdown.
 *
 * Shows the schema.org `address` value (pretty-printed JSON) when the crawl
 * found one, plus every phone number from the crawl. Offices and facilities
 * are always emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{website?: {allPhones?: string[], schemaOrg?: unknown}}} d
 * @returns {string} Markdown document ending in a newline.
 */
function renderLocations(meta, d) {
  const site = d.website;
  const phones = site?.allPhones || [];
  const schema = site?.schemaOrg;
  const structuredAddress = schema && typeof schema === "object" ? schema.address : void 0;

  const addressBlock = structuredAddress
    ? "```json\n" + JSON.stringify(structuredAddress, null, 2) + "\n```" + sourceLine("schema.org address")
    : "_No structured address found._";
  const phoneBlock = phones.length
    ? phones.map((number) => `- ${number}`).join("\n") + sourceLine("website crawl")
    : GAP;

  return `# 5. Locations \u2014 ${meta.companyName}

## Address (schema.org)

${addressBlock}

## Phone numbers (${phones.length})

${phoneBlock}

## Offices & facilities

${GAP}
`;
}
|
|
1016
|
+
/**
 * Render section 6 ("Tech Fingerprint") as Markdown.
 *
 * Combines two optional blocks: the web-technology fingerprint (`d.tech`) and
 * the e-mail/DNS infrastructure summary (`d.dns`). When neither was collected
 * the section contains only the gap placeholder.
 *
 * @param {{companyName: string}} meta
 * @param {{tech?: object, dns?: object}} d
 * @returns {string} Markdown document ending in a newline.
 */
function renderTech(meta, d) {
  const tech = d.tech;
  const dnsInfo = d.dns;
  const heading = `# 6. Tech Fingerprint \u2014 ${meta.companyName}`;
  if (!tech && !dnsInfo) {
    return `${heading}

${GAP}
`;
  }

  // Bullet list of `items`, or `emptyText` when there is nothing to list.
  const listOr = (items, toLine, emptyText) => (items.length ? items.map(toLine).join("\n") : emptyText);
  const asCode = (value) => `- \`${value}\``;
  const asPlain = (value) => `- ${value}`;

  let techBlock = "";
  if (tech) {
    techBlock = `## Web technology

| Component | Value | Confidence |
|-----------|-------|-----------|
| CMS | ${tech.cms} | ${tech.cms !== "Unknown" ? "high" : "low"} |
| Generator | ${tech.metaGenerator || "Not detected"} | ${tech.metaGenerator ? "high" : "low"} |
| CDN | ${tech.cdn || "Not detected"} | ${tech.cdn ? "high" : "low"} |
| Frameworks | ${tech.frameworks.length ? tech.frameworks.join(", ") : "Not detected"} | moderate |

### Analytics
${listOr(tech.analyticsIds, asCode, "_None detected._")}

### Tag managers
${listOr(tech.gtmIds, asCode, "_None detected._")}

### Advertising pixels
${listOr(tech.adPixels, asPlain, "_None detected._")}${sourceLine("HTML fingerprint of crawled pages")}`;
  }

  let dnsBlock = "";
  if (dnsInfo) {
    const mxRows = listOr(dnsInfo.mxRecords, (r) => `| ${r.priority} | ${r.exchange} |`, "| \u2014 | No MX records |");
    dnsBlock = `## Email & DNS infrastructure

**Email provider:** ${dnsInfo.emailProvider} (from MX records)

| Priority | MX Exchange |
|----------|-------------|
${mxRows}

**SPF:** \`${dnsInfo.spfRecord || "none"}\`
**DMARC:** \`${dnsInfo.dmarcRecord || "none"}\`

### Verification tokens
${listOr(dnsInfo.verificationTokens, asCode, "_None found._")}

### Subdomains (CNAME)
${listOr(dnsInfo.subdomains, asPlain, "_None detected._")}${sourceLine("DNS resolver (MX/TXT/CNAME)")}`;
  }

  return `${heading}

${techBlock}

${dnsBlock}
`;
}
|
|
1065
|
+
/**
 * Render section 7 ("News & Timeline") as Markdown.
 *
 * Built entirely from the Wayback Machine collector: capture range, totals,
 * captures per year (summed from the monthly timeline), archived PDFs and up
 * to 30 deleted pages. When the collector is missing or reported an error a
 * short "unavailable" notice is rendered instead. Press coverage is always
 * emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{wayback?: object}} d - Collected data; only `d.wayback` is read.
 * @returns {string} Markdown document ending in a newline.
 */
function renderNews(meta, d) {
  const archive = d.wayback;
  if (!archive || archive.error) {
    const detail = archive?.error ? `: ${archive.error}` : "";
    return `# 7. News & Timeline \u2014 ${meta.companyName}

_Wayback Machine data unavailable${detail}._

## Press & news

${GAP}
`;
  }

  // Capture timestamps start with the four-digit year.
  const firstYear = archive.firstCapture ? archive.firstCapture.slice(0, 4) : "";
  const lastYear = archive.lastCapture ? archive.lastCapture.slice(0, 4) : "";
  let yearsActive = 0;
  if (firstYear && lastYear) {
    yearsActive = parseInt(lastYear) - parseInt(firstYear);
  }

  // Fold the monthly timeline into per-year totals, keeping first-seen order.
  const perYear = archive.captureTimeline.reduce((totals, entry) => {
    const year = entry.month.slice(0, 4);
    return totals.set(year, (totals.get(year) || 0) + entry.count);
  }, /* @__PURE__ */ new Map());
  const yearTable = Array.from(perYear, ([year, count]) => `| ${year} | ${count} |`).join("\n");

  const growthBlock = archive.siteGrowthSummary ? `**Growth:** ${archive.siteGrowthSummary}
` : "";
  const yearBlock = yearTable ? `### Captures by year

| Year | Captures |
|------|----------|
${yearTable}
` : "";
  const pdfList = archive.pdfWaybackUrls.length
    ? archive.pdfWaybackUrls.map((url) => `- ${url}`).join("\n")
    : "_None found._";
  const deletedList = archive.deletedPages.length
    ? archive.deletedPages.slice(0, 30).map((url) => `- ${url}`).join("\n")
    : "_None detected._";

  return `# 7. News & Timeline \u2014 ${meta.companyName}

## Website history (Wayback Machine)

| Metric | Value |
|--------|-------|
| First capture | ${archive.firstCapture || "Unknown"} |
| Last capture | ${archive.lastCapture || "Unknown"} |
| Years archived | ${yearsActive > 0 ? yearsActive : "Unknown"} |
| Total captures | ${archive.totalCaptures} |
| Unique URLs | ${archive.uniqueUrls.length} |
| PDFs discovered | ${archive.pdfUrls.length} |
| Deleted pages | ${archive.deletedPages.length} |

${growthBlock}
${yearBlock}
### Notable archived PDFs
${pdfList}

### Deleted pages (were live, now gone)
${deletedList}${sourceLine("web.archive.org CDX API")}

## Press & news coverage

${GAP}
`;
}
|
|
1119
|
+
/**
 * Render section 8 ("Relationship Web") as Markdown.
 *
 * Merges social profiles from the search collector with social links found in
 * the website HTML, de-duplicated on `platform + ":" + url`. Website links are
 * labelled by their parsed hostname (exact domain or a subdomain of it), not
 * by substring, so e.g. `netflix.com` or `wix.com` links are not mistaken for
 * `x.com`. Links that match no known platform are labelled "Other".
 * Partners, suppliers and customers are always emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{search?: {socialProfiles?: object[]}, website?: {socialLinks?: string[]}}} d
 * @returns {string} Markdown document ending in a newline.
 */
function renderRelationships(meta, d) {
  const s = d.search;
  const w = d.website;
  const platformDomains = [
    ["LinkedIn", ["linkedin.com"]],
    ["Twitter/X", ["twitter.com", "x.com"]],
    ["Facebook", ["facebook.com"]],
    ["Instagram", ["instagram.com"]],
    ["YouTube", ["youtube.com"]],
    ["TikTok", ["tiktok.com"]],
    ["GitHub", ["github.com"]]
  ];
  // Lower-cased hostname of a link, tolerating scheme-less and protocol-relative forms.
  const hostOf = (link) => {
    const text = String(link).trim();
    let absolute = text;
    if (text.startsWith("//")) {
      absolute = `https:${text}`;
    } else if (!/^[a-z][a-z0-9+.-]*:\/\//i.test(text)) {
      absolute = `https://${text}`;
    }
    try {
      return new URL(absolute).hostname.toLowerCase();
    } catch {
      return "";
    }
  };
  const classify = (link) => {
    const host = hostOf(link);
    for (const [platform, domains] of platformDomains) {
      if (domains.some((domain) => host === domain || host.endsWith("." + domain))) {
        return platform;
      }
    }
    return "Other";
  };

  const merged = /* @__PURE__ */ new Map();
  for (const p of s?.socialProfiles || []) {
    merged.set(p.platform + ":" + p.url, p);
  }
  for (const link of w?.socialLinks || []) {
    const platform = classify(link);
    const key = platform + ":" + link;
    if (!merged.has(key)) {
      merged.set(key, { platform, url: link, source: "website HTML" });
    }
  }
  const all = [...merged.values()];
  const profileList = all.length
    ? all.map((p) => `- **${p.platform}**: [${p.url}](${p.url}) _(${p.source})_`).join("\n")
    : GAP;
  return `# 8. Relationship Web \u2014 ${meta.companyName}

## Social & external profiles (${all.length})

${profileList}

## Partners, suppliers & customers

${GAP}
`;
}
|
|
1151
|
+
/**
 * Render section 9 ("Risk Flags") as Markdown.
 *
 * Raises automated flags from already-collected data:
 *   - missing DMARC / SPF records (DNS collector);
 *   - pages that were archived but are now gone (Wayback collector);
 *   - a crawl error on the website;
 *   - a negative percentage (e.g. "-35%") in the Wayback growth summary.
 * The growth summary is optional; a missing or non-string summary simply
 * raises no flag instead of throwing. Legal, regulatory and financial risk is
 * always emitted as a gap.
 *
 * @param {{companyName: string}} meta
 * @param {{dns?: object, wayback?: object, website?: object}} d
 * @returns {string} Markdown document ending in a newline.
 */
function renderRisk(meta, d) {
  const flags = [];
  const dns2 = d.dns;
  const wb = d.wayback;
  const w = d.website;
  const waybackOk = !!wb && !wb.error;
  if (dns2 && !dns2.dmarcRecord) {
    flags.push("No DMARC record \u2014 domain is more vulnerable to email spoofing. _(Source: DNS)_");
  }
  if (dns2 && !dns2.spfRecord) {
    flags.push("No SPF record \u2014 sender policy not published. _(Source: DNS)_");
  }
  if (waybackOk && wb.deletedPages.length > 0) {
    flags.push(
      `${wb.deletedPages.length} pages were live and are now gone \u2014 possible discontinued products or repositioning. _(Source: Wayback)_`
    );
  }
  if (w?.error) {
    flags.push(`Homepage was unreachable during crawl: ${w.error} _(Source: crawler)_`);
  }
  // A negative percentage in the summary signals shrinking capture activity.
  if (waybackOk && typeof wb.siteGrowthSummary === "string" && /-\d+%/.test(wb.siteGrowthSummary)) {
    flags.push("Declining web-capture activity in the second half of history. _(Source: Wayback)_");
  }
  const flagList = flags.length
    ? flags.map((f) => `- ${f}`).join("\n")
    : "_No automated risk flags raised from collected data._";
  return `# 9. Risk Flags \u2014 ${meta.companyName}

> Automated, low-confidence signals from public technical data. Not legal,
> financial, or compliance advice. Verify before acting.

## Detected flags

${flagList}

## Legal, regulatory & financial risk

${GAP}
`;
}
|
|
1187
|
+
// Section id -> function producing that section's Markdown from (meta, data).
var RENDERERS = {
  overview: renderOverview,
  people: renderPeople,
  hiring: renderHiring,
  money: renderMoney,
  locations: renderLocations,
  tech: renderTech,
  news: renderNews,
  relationships: renderRelationships,
  risk: renderRisk
};

// Section id -> output file name; the numeric prefix keeps files in section order.
var SECTION_FILENAMES = {
  overview: "01_overview_identity.md",
  people: "02_people_org_chart.md",
  hiring: "03_hiring_radar.md",
  money: "04_money_trail.md",
  locations: "05_locations.md",
  tech: "06_tech_fingerprint.md",
  news: "07_news_timeline.md",
  relationships: "08_relationship_web.md",
  risk: "09_risk_flags.md"
};

// Section id -> human-readable title used in the README navigation table.
var SECTION_TITLES = {
  overview: "Overview & Identity",
  people: "People & Org Chart",
  hiring: "Hiring Radar",
  money: "Money Trail",
  locations: "Locations",
  tech: "Tech Fingerprint",
  news: "News & Timeline",
  relationships: "Relationship Web",
  risk: "Risk Flags"
};
|
|
1220
|
+
/**
 * Render the dossier's README: header, per-collector status table, and a
 * numbered navigation table linking each built section to its file.
 *
 * @param {object} meta - Dossier metadata (`companyName`, `generatedAt`,
 *   `homepage`, `target`, `domain`, `websiteUrl`, `sources`).
 * @param {string[]} sections - Section ids that were built, in output order.
 * @returns {string} Markdown document ending in a newline.
 */
function renderReadme(meta, sections) {
  const sourceRows = [];
  for (const src of meta.sources) {
    sourceRows.push(`| ${src.name} | ${src.status} | ${src.note || ""} |`);
  }
  const navRows = [];
  sections.forEach((id, index) => {
    navRows.push(`| ${index + 1} | [${SECTION_TITLES[id]}](${SECTION_FILENAMES[id]}) |`);
  });
  const website = meta.websiteUrl || "N/A";

  return `# ${meta.companyName} \u2014 Intelligence Dossier

> Compiled from PUBLIC data only by ${GENERATOR} on ${meta.generatedAt}.
> ${meta.homepage}

**Target:** ${meta.target} | **Domain:** ${meta.domain} | **Website:** ${website}

## Collection summary

| Source | Status | Notes |
|--------|--------|-------|
${sourceRows.join("\n")}

## Sections

| # | Section |
|---|---------|
${navRows.join("\n")}

---

Every derived claim is annotated with its source. Sections without public data
are clearly marked as gaps requiring manual research. This dossier uses no
private databases and no API keys are required.
`;
}
|
|
1249
|
+
/**
 * Collect public data for a target and assemble the dossier in memory.
 *
 * Flow:
 *   1. Resolve which sections to build (unknown ids are dropped) and which
 *      collectors those sections need. Domain-based collectors run only when
 *      the target looks like a domain.
 *   2. Crawl the website first, because the tech fingerprint, the company name
 *      and the search collector all depend on its output.
 *   3. Run DNS, Wayback and public-search collectors concurrently.
 *   4. Render README + one Markdown file per section + `dossier.json`.
 *
 * Each concurrent task returns its own summary row and the rows are appended
 * in a fixed order (DNS, Wayback, search) after all tasks finish, so the
 * README "Collection summary" does not depend on which request returns first.
 *
 * @param {string} target - Domain, URL or company name.
 * @param {{progress?: (msg: string) => void, sections?: string[], maxPages?: number,
 *   skipSocialProbe?: boolean}} [opts]
 * @returns {Promise<{meta: object, json: {meta: object, data: object},
 *   files: Array<{path: string, content: string}>}>}
 */
async function buildDossier(target, opts = {}) {
  const progress = opts.progress || (() => {
  });
  const requested = opts.sections && opts.sections.length ? opts.sections : [...SECTIONS];
  const sections = requested.filter((s) => SECTIONS.includes(s));
  const wantsAny = (...ids) => sections.some((s) => ids.includes(s));

  const isDomain = looksLikeDomain(target);
  const domain = isDomain ? toDomain(target) : "";
  const websiteUrl = domain ? `https://${domain}` : "";
  const needsWebsite = !!websiteUrl && wantsAny("overview", "people", "hiring", "locations", "tech", "relationships", "risk");
  const needsDns = !!domain && wantsAny("tech", "risk");
  const needsWayback = !!domain && wantsAny("news", "risk");
  const needsSearch = wantsAny("money", "relationships");

  const data = {};
  const sources = [];
  // Homepage HTML followed by each crawled page's text; shared by the tech
  // fingerprint and the search collector. Stays undefined without a crawl.
  let pageContents;

  if (needsWebsite) {
    progress("Collecting website...");
    data.website = await collectWebsite(websiteUrl, {
      maxPages: opts.maxPages ?? 25,
      progress
    });
    const site = data.website;
    sources.push({
      name: "Website crawl",
      status: site.error ? "failed" : "ok",
      note: site.error ? site.error : `${site.pageCount} pages, ${site.allEmails.length} emails`
    });
    if (site.rawHtml || site.pages.length) {
      pageContents = [site.rawHtml, ...site.pages.map((p) => p.textContent)];
      data.tech = extractTechStack(pageContents.join("\n"));
      sources.push({ name: "Tech fingerprint", status: "ok", note: data.tech.cms });
    }
  }

  // Only the website crawl feeds the name, so it is final at this point.
  const companyName = deriveCompanyName(target, data.website);

  const tasks = [];
  if (needsDns) {
    tasks.push(
      (async () => {
        progress("Collecting DNS...");
        data.dns = await collectDns(domain);
        return {
          name: "DNS recon",
          status: data.dns.error ? "failed" : "ok",
          note: data.dns.error || `${data.dns.mxRecords.length} MX, provider ${data.dns.emailProvider}`
        };
      })()
    );
  }
  if (needsWayback) {
    tasks.push(
      (async () => {
        progress("Collecting Wayback history...");
        data.wayback = await collectWayback(domain, progress);
        return {
          name: "Wayback Machine",
          status: data.wayback.error ? "failed" : "ok",
          note: data.wayback.error || `${data.wayback.totalCaptures} captures`
        };
      })()
    );
  }
  if (needsSearch) {
    tasks.push(
      (async () => {
        progress("Collecting public search...");
        data.search = await collectSearch(companyName, {
          progress,
          pageContents,
          probeSocial: !opts.skipSocialProbe
        });
        return {
          name: "Public search",
          status: data.search.error ? "failed" : "ok",
          note: `${data.search.usaSpendingContracts.length} contracts, ${data.search.socialProfiles.length} social profiles`
        };
      })()
    );
  }
  // Promise.all yields results in task order, independent of completion order.
  sources.push(...(await Promise.all(tasks)));

  const meta = {
    target,
    companyName,
    domain: domain || "(name only)",
    websiteUrl,
    generatedAt: todayISO(),
    generator: GENERATOR,
    homepage: HOMEPAGE,
    sectionsRequested: sections,
    sources
  };
  const files = [];
  files.push({ path: "README.md", content: renderReadme(meta, sections) });
  for (const id of sections) {
    files.push({ path: SECTION_FILENAMES[id], content: RENDERERS[id](meta, data) });
  }
  const json = {
    meta,
    data
  };
  files.push({ path: "dossier.json", content: JSON.stringify(json, null, 2) });
  progress("Dossier assembled.");
  return { meta, json, files };
}
|
|
1356
|
+
|
|
1357
|
+
// src/index.ts
|
|
1358
|
+
import * as path2 from "path";
|
|
1359
|
+
/**
 * Write every file of a built dossier under `<outDir>/<company name> DOSSIER`.
 *
 * The folder name is derived from crawled data, so it is sanitised before use:
 * path separators, the characters Windows forbids in file names
 * (`< > : " | ? *`) and ASCII control characters are each replaced with `_`.
 * This keeps the output inside a single directory level and lets the
 * directory be created on any platform.
 *
 * @param {{meta: {companyName: string}, files: Array<{path: string, content: string}>}} result
 * @param {string} outDir - Directory in which the dossier folder is created.
 * @returns {string} Path of the dossier folder.
 */
function writeDossier(result, outDir) {
  const folderName = `${result.meta.companyName} DOSSIER`.trim();
  const safeFolder = folderName.replace(/[<>:"\/\\|?*\u0000-\u001f]/g, "_");
  const target = path2.join(outDir, safeFolder);
  for (const file of result.files) {
    writeFileSafe(path2.join(target, file.path), file.content);
  }
  return target;
}
|
|
1368
|
+
|
|
1369
|
+
// src/cli.ts
|
|
1370
|
+
// CLI version: printed by -v/--version and embedded in the help banner below.
var VERSION = "0.1.0";
|
|
1371
|
+
// Usage text: printed to stdout for -h/--help and appended to the stderr
// message when the <company-or-domain> argument is missing.
var HELP = `company-dossier v${VERSION}
|
|
1372
|
+
Build a complete, sourced intelligence dossier on any company from public data.
|
|
1373
|
+
|
|
1374
|
+
USAGE
|
|
1375
|
+
company-dossier <company-or-domain> [options]
|
|
1376
|
+
|
|
1377
|
+
ARGUMENTS
|
|
1378
|
+
<company-or-domain> A domain (acme.com / https://acme.com) or a company name.
|
|
1379
|
+
Domains unlock the full collector set; a bare name still
|
|
1380
|
+
produces a dossier from public search.
|
|
1381
|
+
|
|
1382
|
+
OPTIONS
|
|
1383
|
+
--out <dir> Output directory (default: current directory).
|
|
1384
|
+
--json Print the dossier JSON to stdout instead of writing files.
|
|
1385
|
+
--sections <list> Comma-separated subset of sections to build.
|
|
1386
|
+
Available: ${SECTIONS.join(", ")}
|
|
1387
|
+
--max-pages <n> Max internal pages to crawl (default: 25).
|
|
1388
|
+
--no-social-probe Skip slow HEAD-probing of social platforms.
|
|
1389
|
+
--quiet Suppress progress output.
|
|
1390
|
+
-h, --help Show this help.
|
|
1391
|
+
-v, --version Show version.
|
|
1392
|
+
|
|
1393
|
+
ENVIRONMENT
|
|
1394
|
+
COMPANY_DOSSIER_ANTHROPIC_KEY Optional. Reserved for future AI enrichment.
|
|
1395
|
+
A useful dossier is produced WITHOUT any key.
|
|
1396
|
+
|
|
1397
|
+
EXAMPLES
|
|
1398
|
+
company-dossier acme.com
|
|
1399
|
+
company-dossier acme.com --out ./research --quiet
|
|
1400
|
+
company-dossier "Acme Corporation"
|
|
1401
|
+
company-dossier acme.com --json > acme.json
|
|
1402
|
+
company-dossier acme.com --sections overview,tech,risk
|
|
1403
|
+
|
|
1404
|
+
Public sources only \u2014 no private databases, no API keys required.
|
|
1405
|
+
Learn more: https://companydossier.lol
|
|
1406
|
+
`;
|
|
1407
|
+
/**
 * Parse CLI arguments (without the node/script prefix) into an options object.
 *
 * Boolean flags set their field; `--out`, `--max-pages` and `--sections`
 * consume the following argument. The first non-option argument becomes
 * `target`; further positionals are ignored. Problems are reported through
 * `out.error` (the last problem found wins) rather than by throwing:
 *   - unknown option;
 *   - unknown section id;
 *   - a value-taking option given without a value;
 *   - `--max-pages` that is not a positive integer.
 *
 * @param {string[]} argv - Typically `process.argv.slice(2)`.
 * @returns {{out: string, json: boolean, quiet: boolean, skipSocialProbe: boolean,
 *   help: boolean, version: boolean, target?: string, maxPages?: number,
 *   sections?: string[], error?: string}}
 */
function parseArgs(argv) {
  const out = {
    out: process.cwd(),
    json: false,
    quiet: false,
    skipSocialProbe: false,
    help: false,
    version: false
  };
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    switch (a) {
      case "-h":
      case "--help":
        out.help = true;
        break;
      case "-v":
      case "--version":
        out.version = true;
        break;
      case "--json":
        out.json = true;
        break;
      case "--quiet":
        out.quiet = true;
        break;
      case "--no-social-probe":
        out.skipSocialProbe = true;
        break;
      case "--out": {
        const dir = argv[++i];
        if (dir === void 0) {
          out.error = "Option --out requires a directory.";
        } else {
          out.out = dir;
        }
        break;
      }
      case "--max-pages": {
        const raw = argv[++i];
        // Accept plain decimal digits only, so "10abc" is rejected instead of read as 10.
        const n = /^\d+$/.test(raw ?? "") ? parseInt(raw, 10) : NaN;
        if (!isNaN(n) && n > 0) {
          out.maxPages = n;
        } else {
          const shown = raw === void 0 ? "(missing)" : JSON.stringify(raw);
          out.error = `Invalid value for --max-pages: ${shown}. Expected a positive integer.`;
        }
        break;
      }
      case "--sections": {
        const raw = argv[++i];
        const list = (raw ?? "").split(",").map((s) => s.trim().toLowerCase()).filter(Boolean);
        const valid = list.filter((s) => SECTIONS.includes(s));
        const invalid = list.filter((s) => !SECTIONS.includes(s));
        if (raw === void 0) {
          out.error = `Option --sections requires a comma-separated list. Valid: ${SECTIONS.join(", ")}`;
        } else if (invalid.length) {
          out.error = `Unknown section(s): ${invalid.join(", ")}. Valid: ${SECTIONS.join(", ")}`;
        }
        out.sections = valid;
        break;
      }
      default:
        if (a.startsWith("-")) {
          out.error = `Unknown option: ${a}`;
        } else if (!out.target) {
          out.target = a;
        }
    }
  }
  return out;
}
|
|
1466
|
+
/**
 * Run the CLI once and resolve with the process exit code.
 *
 * Exit codes produced here:
 *   0 - help/version printed, JSON emitted, or dossier written
 *   1 - buildDossier() threw
 *   2 - usage error (bad option or missing target)
 *
 * Progress and diagnostics go to stderr; stdout only carries the help text,
 * the version, the JSON payload, or the output folder path.
 *
 * @returns {Promise<number>} The exit code for the process.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));

  // Informational flags take precedence over any parse error.
  if (args.help) {
    process.stdout.write(HELP);
    return 0;
  }
  if (args.version) {
    process.stdout.write(VERSION + "\n");
    return 0;
  }

  // Usage problems.
  if (args.error) {
    process.stderr.write(`Error: ${args.error}\n\nRun "company-dossier --help".\n`);
    return 2;
  }
  if (!args.target) {
    process.stderr.write("Error: missing <company-or-domain>.\n\n" + HELP);
    return 2;
  }

  // With --quiet the progress callback is a no-op; otherwise it writes to stderr.
  const silent = () => {
  };
  const toStderr = (msg) => process.stderr.write(msg + "\n");
  const log = args.quiet ? silent : toStderr;

  const { out, json, sections, maxPages, skipSocialProbe } = args;
  const opts = {
    out,
    json,
    sections,
    maxPages,
    skipSocialProbe,
    progress: log,
    apiKeys: {
      anthropic: process.env.COMPANY_DOSSIER_ANTHROPIC_KEY
    }
  };

  let dossier;
  try {
    dossier = await buildDossier(args.target, opts);
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    process.stderr.write(`Fatal: ${reason}\n`);
    return 1;
  }

  // JSON mode prints the payload and writes no files.
  if (args.json) {
    const payload = JSON.stringify(dossier.json, null, 2);
    process.stdout.write(payload + "\n");
    return 0;
  }

  const folder = writeDossier(dossier, args.out);
  log(`\nDossier written to: ${folder}`);
  const fileList = dossier.files.map((file) => file.path).join(", ");
  log(`Files: ${fileList}`);

  // The bare folder path on stdout is also suppressed by --quiet.
  if (!args.quiet) {
    process.stdout.write(folder + "\n");
  }
  return 0;
}
|
|
1523
|
+
// Script entry: run main() and exit with the code it resolves to. A rejection
// that escapes main() is reported on stderr and mapped to exit code 1.
main().then((exitCode) => {
  process.exit(exitCode);
}).catch((failure) => {
  const reason = failure instanceof Error ? failure.message : String(failure);
  process.stderr.write(`Fatal: ${reason}\n`);
  process.exit(1);
});
|