salesprompter-cli 0.1.23 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -13
- package/dist/auth.js +2 -2
- package/dist/cli.js +1373 -27
- package/dist/hunter-emails.js +291 -0
- package/dist/linkedin-companies.js +550 -0
- package/dist/linkedin-products.js +68 -18
- package/dist/linkedin-session.js +13 -3
- package/dist/sales-navigator.js +15 -4
- package/package.json +8 -7
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import { BigQuery } from "@google-cloud/bigquery";
|
|
3
|
+
const DEFAULT_LINKEDIN_BASE_URL = "https://www.linkedin.com";
|
|
4
|
+
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36";
|
|
5
|
+
const DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_HOST = "web-scraping-api2.p.rapidapi.com";
|
|
6
|
+
const DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT = "https://web-scraping-api2.p.rapidapi.com/get-company-by-linkedinurl";
|
|
7
|
+
const DEFAULT_BIGQUERY_PROJECT_ID = process.env.BQ_PROJECT_ID ?? process.env.GOOGLE_CLOUD_PROJECT ?? process.env.GCLOUD_PROJECT ?? "icpidentifier";
|
|
8
|
+
const LINKEDIN_COMPANIES_TABLE = "`icpidentifier.SalesGPT.linkedin_companies_processed`";
|
|
9
|
+
const LEADPOOL_TABLE = "`icpidentifier.SalesGPT.leadPool_new`";
|
|
10
|
+
/**
 * Collapses every run of whitespace in `value` to a single space and trims both ends.
 *
 * @param {unknown} value - Text to normalize. `null`/`undefined` produce an empty string;
 *   any other non-string value is converted with `String()` before normalizing.
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeWhitespace(value) {
    if (value === null || value === undefined) {
        return "";
    }
    // Callers pass fields taken from parsed JSON payloads, which are not guaranteed to be
    // strings; coercing avoids a TypeError from calling `.replace` on a number or array.
    const text = typeof value === "string" ? value : String(value);
    return text.replace(/\s+/g, " ").trim();
}
|
|
13
|
+
/**
 * Resolves the LinkedIn base URL used when building company URLs.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read
 *   `SALESPROMPTER_LINKEDIN_BASE_URL` from.
 * @returns {string} The trimmed override with trailing slashes removed, or
 *   `DEFAULT_LINKEDIN_BASE_URL` when the override is unset or blank.
 */
function getLinkedInBaseUrl(env = process.env) {
    const configured = env.SALESPROMPTER_LINKEDIN_BASE_URL?.trim();
    return configured ? configured.replace(/\/+$/, "") : DEFAULT_LINKEDIN_BASE_URL;
}
|
|
20
|
+
/**
 * Builds the company page URL for a company id, without a trailing slash.
 *
 * @param {number | string} companyId - Identifier placed after `/company/`.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment forwarded to
 *   `getLinkedInBaseUrl`.
 * @returns {string} The company URL.
 */
function buildLinkedInCompanyUrl(companyId, env = process.env) {
    const base = `${getLinkedInBaseUrl(env)}/`;
    // The path starts with "/", so it is resolved against the origin of `base`.
    const companyUrl = new URL(`/company/${companyId}/`, base);
    return companyUrl.toString().replace(/\/+$/, "");
}
|
|
23
|
+
/**
 * Extracts the vanity handle that follows the `/company/` path segment of a URL.
 *
 * @param {string | undefined} url - Absolute company URL.
 * @returns {string | undefined} The lower-cased handle, or `undefined` when the URL is
 *   empty, cannot be parsed, has no `company` segment, or the segment after it is missing
 *   or purely numeric (a numeric id rather than a handle).
 */
function getCompanyHandleFromUrl(url) {
    if (!url) {
        return undefined;
    }
    let pathname;
    try {
        pathname = new URL(url).pathname;
    }
    catch {
        // Relative or malformed URLs have no usable handle; report "no handle" instead of
        // throwing, the same way extractDomainFromWebsite treats unparsable input.
        return undefined;
    }
    const segments = pathname.split("/").filter(Boolean);
    const companyIndex = segments.findIndex((segment) => segment.toLowerCase() === "company");
    if (companyIndex === -1) {
        return undefined;
    }
    const handle = normalizeWhitespace(segments[companyIndex + 1]);
    if (!handle || /^\d+$/.test(handle)) {
        return undefined;
    }
    return handle.toLowerCase();
}
|
|
39
|
+
/**
 * Derives a bare domain (lower-cased hostname without a leading `www.`) from a website value.
 *
 * @param {string | undefined} website - Website URL; a value without a `scheme://` prefix
 *   (for example `www.example.com/about`) is treated as an `https://` URL.
 * @returns {string | undefined} The domain, or `undefined` when the input is empty, cannot
 *   be parsed as a URL, or has no hostname.
 */
function extractDomainFromWebsite(website) {
    if (!website) {
        return undefined;
    }
    const raw = String(website).trim();
    // `new URL` rejects input without a scheme, so add one for scheme-less values.
    const candidate = /^[a-z][a-z\d+.-]*:\/\//i.test(raw) ? raw : `https://${raw}`;
    try {
        const hostname = new URL(candidate).hostname.replace(/^www\./i, "").toLowerCase();
        return hostname || undefined;
    }
    catch {
        return undefined;
    }
}
|
|
50
|
+
/**
 * Reads the RapidAPI company-lookup settings from the environment.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read from.
 * @returns {{ apiKey: string, host: string, endpoint: string } | null} The configuration, or
 *   `null` when `RAPIDAPI_KEY` is unset or blank. Host and endpoint fall back to the module
 *   defaults when their overrides are unset or blank.
 */
function getRapidApiLinkedInCompanyConfig(env = process.env) {
    const apiKey = env.RAPIDAPI_KEY?.trim();
    if (apiKey) {
        const hostOverride = env.RAPIDAPI_LINKEDIN_COMPANY_HOST?.trim();
        const endpointOverride = env.RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT?.trim();
        return {
            apiKey,
            host: hostOverride ? hostOverride : DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_HOST,
            endpoint: endpointOverride ? endpointOverride : DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT
        };
    }
    return null;
}
|
|
61
|
+
/**
 * Finds the first JSON-LD node typed `Organization` in the document's
 * `<script type="application/ld+json">` elements.
 *
 * @param {Function} $ - Loaded cheerio document (the value returned by `load(html)`).
 * @returns {object | null} The first matching node, or `null` when none is found. Script
 *   blocks that are empty or are not valid JSON are skipped.
 */
function parseJsonLdOrganization($) {
    const scripts = $('script[type="application/ld+json"]')
        .map((_, element) => $(element).text())
        .get();
    for (const rawText of scripts) {
        const text = rawText.trim();
        if (!text) {
            continue;
        }
        let parsed;
        try {
            parsed = JSON.parse(text);
        }
        catch {
            continue;
        }
        const topLevel = Array.isArray(parsed) ? parsed : [parsed];
        for (const item of topLevel) {
            if (!item || typeof item !== "object") {
                continue;
            }
            // JSON-LD may nest its nodes under "@graph"; inspect the container and its members.
            const nodes = Array.isArray(item["@graph"]) ? [item, ...item["@graph"]] : [item];
            for (const node of nodes) {
                if (!node || typeof node !== "object") {
                    continue;
                }
                // "@type" may be a single string or an array of type names.
                const declared = node["@type"];
                const types = Array.isArray(declared) ? declared : [declared];
                const isOrganization = types.some((type) => normalizeWhitespace(String(type ?? "")).toLowerCase() === "organization");
                if (isOrganization) {
                    return node;
                }
            }
        }
    }
    return null;
}
|
|
90
|
+
/**
 * Collects `<dt>`/`<dd>` pairs from the document into a lookup keyed by the lower-cased term.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {Map<string, string>} Whitespace-normalized term → description. Pairs where either
 *   side is empty are left out; a repeated term keeps the last description seen.
 */
function readDefinitionMap($) {
    const definitions = new Map();
    const collect = (_, termElement) => {
        const termNode = $(termElement);
        const key = normalizeWhitespace(termNode.text()).toLowerCase();
        // Only a `<dd>` that immediately follows the `<dt>` is read.
        const detail = normalizeWhitespace(termNode.next("dd").text());
        if (!key || !detail) {
            return;
        }
        definitions.set(key, detail);
    };
    $("dt").each(collect);
    return definitions;
}
|
|
101
|
+
/**
 * Parses an employee count from free text such as "1,234 employees".
 *
 * Only the first number in the text is used, so a range like "51-200" yields 51 rather
 * than the concatenation of all of its digits.
 *
 * @param {string | undefined} value - Text containing the count.
 * @returns {number | undefined} The parsed count, or `undefined` when the text has no digits.
 */
function parseEmployeeCount(value) {
    // First digit run, allowing "," "." and spaces as thousands separators inside it.
    const match = normalizeWhitespace(value).match(/\d[\d.,\s]*/);
    if (!match) {
        return undefined;
    }
    const parsed = Number(match[0].replace(/\D/g, ""));
    return Number.isFinite(parsed) ? parsed : undefined;
}
|
|
109
|
+
/**
 * Extracts the first four-digit year starting with 19 or 20 from free text.
 *
 * @param {string | undefined} value - Text that may contain a founding year.
 * @returns {number | undefined} The year, or `undefined` when no such year is present.
 */
function parseFoundedYear(value) {
    const [year] = normalizeWhitespace(value).match(/\b(19|20)\d{2}\b/) ?? [];
    return year === undefined ? undefined : Number(year);
}
|
|
116
|
+
/**
 * Parses the HTML of a LinkedIn company page into a flat company profile.
 *
 * Values are taken from the JSON-LD Organization node, the page's `<dt>`/`<dd>` pairs,
 * meta tags, and a few known CSS classes; every missing value is `undefined`.
 *
 * @param {string} html - Page markup.
 * @param {string} requestUrl - URL the page was requested from; used as the canonical URL
 *   when the page declares none, and as a fallback source for the numeric company id.
 * @returns {object} Profile with `id`, `handle`, `name`, `industry`, `companySize`,
 *   `headquarters`, `website`, `domain`, `employeesOnLinkedIn`, `description`, `tagline`,
 *   `specialties`, `founded`, `unavailable`, `error`, and an ISO `timestamp`.
 * @throws {Error} When neither the canonical URL nor `requestUrl` contains `/company/<digits>`.
 */
export function parseLinkedInCompanyPage(html, requestUrl) {
    const $ = load(html);
    const timestamp = new Date().toISOString();
    const attrText = (selector, attribute) => normalizeWhitespace($(selector).attr(attribute));
    const firstText = (selector) => normalizeWhitespace($(selector).first().text());
    const canonicalUrl = attrText('link[rel="canonical"]', "href") || requestUrl;

    // The page counts as unavailable when its title or body carries one of these phrases.
    const titleText = normalizeWhitespace($("title").text()).toLowerCase();
    const bodyText = normalizeWhitespace($("body").text()).toLowerCase();
    const unavailable = [
        titleText.includes("page not found"),
        titleText.includes("linkedin") && titleText.includes("unavailable"),
        bodyText.includes("page isn't available"),
        bodyText.includes("this page doesn't exist")
    ].some(Boolean);

    const jsonLd = parseJsonLdOrganization($);
    const definitions = readDefinitionMap($);
    const jsonLdText = (key) => normalizeWhitespace(String(jsonLd?.[key] ?? ""));
    const definition = (term) => normalizeWhitespace(definitions.get(term)) || undefined;

    // The id in the canonical URL wins over the id in the requested URL.
    const companyIdPattern = /\/company\/(\d+)(?:\/|$)/i;
    const requestIdMatch = requestUrl.match(companyIdPattern);
    const canonicalIdMatch = canonicalUrl.match(companyIdPattern);
    const resolvedId = Number(canonicalIdMatch?.[1] ?? requestIdMatch?.[1]);
    if (!Number.isFinite(resolvedId)) {
        throw new Error(`LinkedIn company URL is missing a numeric company id: ${requestUrl}`);
    }

    const name = jsonLdText("name") ||
        firstText("h1") ||
        attrText('meta[property="og:title"]', "content") ||
        undefined;
    const website = jsonLdText("url") || definition("website");
    const description = jsonLdText("description") ||
        attrText('meta[name="description"]', "content") ||
        undefined;
    const tagline = firstText(".top-card-layout__headline") ||
        firstText(".org-top-card-summary__tagline") ||
        undefined;

    return {
        id: resolvedId,
        handle: getCompanyHandleFromUrl(canonicalUrl),
        name,
        industry: definition("industry"),
        companySize: definition("company size"),
        headquarters: definition("headquarters"),
        website,
        domain: extractDomainFromWebsite(website),
        employeesOnLinkedIn: parseEmployeeCount(definitions.get("employees")),
        description,
        tagline,
        specialties: definition("specialties"),
        founded: parseFoundedYear(definitions.get("founded")),
        unavailable,
        error: unavailable ? "LinkedIn company page is unavailable" : undefined,
        timestamp
    };
}
|
|
172
|
+
/**
 * Maps a RapidAPI company response onto the company profile shape used by this module.
 *
 * @param {object} payload - Parsed response body; the profile is read from `payload.data`.
 * @param {object} candidate - Backfill candidate supplying fallbacks (`companyId`,
 *   `companyName`, `companyUrl`).
 * @returns {object} Profile with `unavailable: false` and an ISO `timestamp`.
 * @throws {Error} When `payload.data` is missing, or when neither `data.company_id` nor
 *   `candidate.companyId` converts to a finite number.
 */
function parseRapidApiCompanyProfile(payload, candidate) {
    const { data } = payload;
    if (!data) {
        throw new Error(`RapidAPI company response did not include data for ${candidate.companyUrl}`);
    }
    const id = Number(data.company_id ?? candidate.companyId);
    if (!Number.isFinite(id)) {
        throw new Error(`RapidAPI company response is missing a numeric company id for ${candidate.companyUrl}`);
    }
    const optionalText = (field) => normalizeWhitespace(field) || undefined;
    // Numeric fields are only kept when the API delivered an actual finite number.
    const optionalNumber = (field) => (typeof field === "number" && Number.isFinite(field) ? field : undefined);

    const website = optionalText(data.website);
    const linkedinUrl = normalizeWhitespace(data.linkedin_url) || candidate.companyUrl;
    const industries = [];
    if (Array.isArray(data.industries)) {
        for (const entry of data.industries) {
            const label = normalizeWhitespace(entry);
            if (label) {
                industries.push(label);
            }
        }
    }
    return {
        id,
        handle: getCompanyHandleFromUrl(linkedinUrl),
        name: normalizeWhitespace(data.company_name) || candidate.companyName,
        industry: industries[0],
        companySize: optionalText(data.employee_range),
        headquarters: optionalText(data.hq_full_address),
        website,
        domain: normalizeWhitespace(data.domain) || extractDomainFromWebsite(website),
        employeesOnLinkedIn: optionalNumber(data.employee_count),
        description: optionalText(data.description),
        tagline: optionalText(data.tagline),
        specialties: optionalText(data.specialties),
        founded: optionalNumber(data.year_founded),
        unavailable: false,
        timestamp: new Date().toISOString()
    };
}
|
|
202
|
+
/**
 * Converts a caller-supplied value into an integer literal that is safe to embed in SQL.
 *
 * @param {number | string} value - Integer, or a string made only of an optional sign and digits.
 * @param {string} label - Parameter name used in the error message.
 * @returns {string} Decimal representation of the integer.
 * @throws {TypeError} When the value is not a safe integer.
 */
function toSqlIntegerLiteral(value, label) {
    const numeric = typeof value === "string" && /^\s*-?\d+\s*$/.test(value) ? Number(value) : value;
    if (typeof numeric !== "number" || !Number.isSafeInteger(numeric)) {
        throw new TypeError(`${label} must be an integer, received ${String(value)}`);
    }
    return String(numeric);
}
/**
 * Builds the query that lists a client's companies still waiting to be crawled.
 *
 * The query groups the client's lead-pool rows by `companyId`, keeps those flagged
 * `company_toBeCrawled` or carrying one of three backlog `company_filter` labels, and drops
 * companies whose LinkedIn URL already appears as `query` in the processed-companies table.
 *
 * @param {number | string} clientId - Client whose lead pool is scanned; must be an integer.
 * @param {number | string} limit - Maximum number of rows; must be a non-negative integer.
 * @returns {string} BigQuery standard-SQL text.
 * @throws {TypeError} When `clientId` or `limit` is not an integer. Both are interpolated
 *   into the SQL text, so anything else is rejected instead of being embedded.
 * @throws {RangeError} When `limit` is negative.
 */
export function buildLinkedInCompanyBackfillSql(clientId, limit) {
    const clientIdLiteral = toSqlIntegerLiteral(clientId, "clientId");
    const limitLiteral = toSqlIntegerLiteral(limit, "limit");
    if (Number(limitLiteral) < 0) {
        throw new RangeError(`limit must not be negative, received ${limitLiteral}`);
    }
    return `WITH backlog AS (
SELECT
companyId,
ANY_VALUE(companyName) AS companyName,
ANY_VALUE(company_filter) AS companyFilter
FROM ${LEADPOOL_TABLE}
WHERE clientId = ${clientIdLiteral}
AND companyId IS NOT NULL
AND (
COALESCE(company_toBeCrawled, FALSE) = TRUE
OR LOWER(CAST(company_filter AS STRING)) IN (
'02: company unavailable',
'02: linkedin companies unavailable',
'03: company to be crawled'
)
)
GROUP BY companyId
)
SELECT
companyId,
companyName,
companyFilter
FROM backlog
WHERE NOT EXISTS (
SELECT 1
FROM ${LINKEDIN_COMPANIES_TABLE} processed
WHERE processed.query = CONCAT('https://www.linkedin.com/company/', CAST(backlog.companyId AS STRING))
)
ORDER BY companyId
LIMIT ${limitLiteral}`;
}
|
|
234
|
+
/**
 * Renders a value as a single-quoted SQL string literal, or `NULL` when it is absent.
 *
 * @param {unknown} value - Value to embed; `undefined` and `null` both render as `NULL`,
 *   anything else is converted with `String()`.
 * @returns {string} SQL literal text.
 */
function sqlString(value) {
    if (value === undefined || value === null) {
        return "NULL";
    }
    // Backslashes go first so the escapes added afterwards are not escaped again.
    // Line breaks are escaped because a single-quoted literal must stay on one line.
    const escaped = String(value)
        .replaceAll("\\", "\\\\")
        .replaceAll("'", "\\'")
        .replaceAll("\n", "\\n")
        .replaceAll("\r", "\\r");
    return `'${escaped}'`;
}
|
|
240
|
+
/**
 * Renders a number as an SQL integer literal, dropping any fractional part.
 *
 * @param {unknown} value - Candidate number.
 * @returns {string} The truncated number as text, or `NULL` when `value` is not a finite number.
 */
function sqlInteger(value) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
        return "NULL";
    }
    return String(Math.trunc(value));
}
|
|
243
|
+
/**
 * Renders a value as an SQL boolean literal based on its truthiness.
 *
 * @param {unknown} value - Any value.
 * @returns {"TRUE" | "FALSE"} `TRUE` for truthy input, `FALSE` otherwise.
 */
function sqlBoolean(value) {
    if (value) {
        return "TRUE";
    }
    return "FALSE";
}
|
|
246
|
+
/**
 * Builds a MERGE statement that upserts company profiles into the processed-companies table.
 *
 * Each profile becomes one `STRUCT(...)` in an `UNNEST([...])` source. Rows are matched on
 * `id`; a match updates every other column, a miss inserts the full row. Columns the profile
 * does not carry are written as the fixed `NULL` / `FALSE` literals listed below.
 *
 * @param {object[]} rows - Company profiles (at least one).
 * @returns {string} BigQuery standard-SQL text.
 * @throws {Error} When `rows` is empty.
 */
export function buildLinkedInCompaniesMergeSql(rows) {
    if (rows.length === 0) {
        throw new Error("At least one LinkedIn company row is required for merge SQL.");
    }
    const asNull = () => "NULL";
    const asFalse = () => "FALSE";
    // [column name, renderer for that column's source expression], in table column order.
    const columns = [
        ["id", (row) => sqlInteger(row.id)],
        ["handle", (row) => sqlString(row.handle)],
        ["name", (row) => sqlString(row.name)],
        ["industryId", asNull],
        ["industry", (row) => sqlString(row.industry)],
        ["industry_input", asNull],
        ["companySizeId", asNull],
        ["companySize", (row) => sqlString(row.companySize)],
        ["headquarters", (row) => sqlString(row.headquarters)],
        ["countryCode", asNull],
        ["website_linkedin", (row) => sqlString(row.website)],
        ["domain_linkedin", (row) => sqlString(row.domain)],
        ["hunter_emailCount", asNull],
        ["domainFinder_name", asNull],
        ["domain", asNull],
        ["emailDomainFinder_ts", asNull],
        ["company_missingHeadquarters", asNull],
        ["blacklisted_bySalesPrompter", asFalse],
        ["company_countryCodeToBeProcessed", asFalse],
        ["domainBlacklisted", asFalse],
        ["company_emailDomainFinder_toBeProcessed", asFalse],
        ["company_emailDomainNotFound", asFalse],
        ["employeesOnLinkedIn", (row) => sqlInteger(row.employeesOnLinkedIn)],
        ["growth6Mth", asNull],
        ["growth1Yr", asNull],
        ["growth2Yr", asNull],
        ["description", (row) => sqlString(row.description)],
        ["tagline", (row) => sqlString(row.tagline)],
        ["specialties", (row) => sqlString(row.specialties)],
        ["founded", (row) => sqlInteger(row.founded)],
        ["error", (row) => sqlString(row.error)],
        ["linkedin_companies_companyUnavailable", (row) => sqlBoolean(row.unavailable)],
        ["timestamp", (row) => `TIMESTAMP(${sqlString(row.timestamp)})`]
    ];
    const columnNames = columns.map(([column]) => column);
    const sourceRows = rows
        .map((row) => {
            const fields = columns.map(([column, render]) => `${render(row)} AS ${column}`);
            return `STRUCT(\n${fields.join(",\n")}\n)`;
        })
        .join(",\n");
    // `id` is the join key, so it is the one column left out of the UPDATE list.
    const assignments = columnNames
        .filter((column) => column !== "id")
        .map((column) => `${column} = source.${column}`)
        .join(",\n");
    const insertColumns = columnNames.join(",\n");
    const insertValues = columnNames.map((column) => `source.${column}`).join(",\n");
    return [
        `MERGE ${LINKEDIN_COMPANIES_TABLE} AS target`,
        "USING (",
        "SELECT *",
        "FROM UNNEST([",
        sourceRows,
        "])",
        ") AS source",
        "ON target.id = source.id",
        "WHEN MATCHED THEN UPDATE SET",
        assignments,
        "WHEN NOT MATCHED THEN INSERT (",
        insertColumns,
        ") VALUES (",
        insertValues,
        ")"
    ].join("\n");
}
|
|
398
|
+
/**
 * Creates the BigQuery client used for the LinkedIn company tables.
 *
 * When `GOOGLE_SERVICE_ACCOUNT_KEY` holds a service-account JSON document, it is parsed and
 * passed as explicit credentials; otherwise the client is created with only a project id.
 * The project id is always `DEFAULT_BIGQUERY_PROJECT_ID`.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read
 *   `GOOGLE_SERVICE_ACCOUNT_KEY` from.
 * @returns {BigQuery} The configured client.
 * @throws {SyntaxError} When the credentials value is not valid JSON after normalization.
 */
export function createLinkedInCompaniesBigQueryClient(env = process.env) {
    const credentialsJson = env.GOOGLE_SERVICE_ACCOUNT_KEY?.trim();
    if (!credentialsJson) {
        return new BigQuery({
            projectId: DEFAULT_BIGQUERY_PROJECT_ID
        });
    }
    // A key pasted into an environment variable can contain real line breaks inside the
    // private_key string, which JSON does not allow; turn them back into "\n" escapes.
    // Whitespace around the colon is tolerated so pretty-printed documents match too.
    const normalizedCredentialsJson = credentialsJson.replace(/"private_key"\s*:\s*"([\s\S]*?)"/, (_, privateKey) => `"private_key":"${privateKey.replace(/\r?\n/g, "\\n")}"`);
    return new BigQuery({
        projectId: DEFAULT_BIGQUERY_PROJECT_ID,
        credentials: JSON.parse(normalizedCredentialsJson)
    });
}
|
|
411
|
+
/**
 * Runs the backfill query and converts its rows into scrape candidates.
 *
 * @param {{ query: Function }} bigQuery - BigQuery client used to run the query.
 * @param {number} clientId - Client whose backlog is read.
 * @param {number} limit - Maximum number of candidates.
 * @returns {Promise<object[]>} Candidates with `companyId`, `companyName`, `companyFilter`
 *   (both `undefined` when empty), and the company page URL built from the id.
 * @throws {Error} When a returned row's `companyId` does not convert to a finite number.
 */
export async function fetchLinkedInCompanyBackfillCandidates(bigQuery, clientId, limit) {
    const sql = buildLinkedInCompanyBackfillSql(clientId, limit);
    const [rows] = await bigQuery.query({ query: sql, useLegacySql: false });
    const optionalText = (field) => normalizeWhitespace(String(field ?? "")) || undefined;
    const candidates = [];
    for (const row of rows) {
        const companyId = Number(row.companyId);
        if (!Number.isFinite(companyId)) {
            throw new Error(`BigQuery backlog row is missing a numeric companyId: ${JSON.stringify(row)}`);
        }
        candidates.push({
            companyId,
            companyName: optionalText(row.companyName),
            companyFilter: optionalText(row.companyFilter),
            companyUrl: buildLinkedInCompanyUrl(companyId)
        });
    }
    return candidates;
}
|
|
429
|
+
/**
 * Downloads a page, sending the module's default browser User-Agent.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<{ status: number, html: string }>} HTTP status and response body text.
 *   The body is returned for every status; the caller decides how to treat error statuses.
 */
async function fetchHtml(url) {
    const requestInit = {
        headers: {
            "User-Agent": DEFAULT_USER_AGENT
        }
    };
    const response = await fetch(url, requestInit);
    const html = await response.text();
    return { status: response.status, html };
}
|
|
440
|
+
/**
 * Retrieves the profile of one company.
 *
 * When a RapidAPI configuration is available, the RapidAPI endpoint is queried with the
 * candidate's company URL; otherwise the company page itself is downloaded and parsed.
 *
 * @param {object} candidate - Backfill candidate (`companyId`, `companyName`, `companyUrl`).
 * @returns {Promise<object>} The company profile. A 404 from RapidAPI, or a 404/410 from the
 *   company page, yields a minimal record with `unavailable: true` and an `error` text.
 * @throws {Error} When the endpoint or page answers with any other error status, when the
 *   RapidAPI body is not valid JSON, or when the response cannot be turned into a profile.
 */
export async function scrapeLinkedInCompany(candidate) {
    const rapidApiConfig = getRapidApiLinkedInCompanyConfig();
    if (rapidApiConfig) {
        const url = new URL(rapidApiConfig.endpoint);
        url.searchParams.set("linkedin_url", candidate.companyUrl);
        const response = await fetch(url, {
            headers: {
                "x-rapidapi-key": rapidApiConfig.apiKey,
                "x-rapidapi-host": rapidApiConfig.host,
                "User-Agent": DEFAULT_USER_AGENT
            }
        });
        const text = await response.text();
        // Check the status before parsing: error responses are not guaranteed to carry JSON,
        // and a parse failure must not hide the 404 / error-status handling below.
        if (response.status === 404) {
            return {
                id: candidate.companyId,
                name: candidate.companyName,
                unavailable: true,
                error: `RapidAPI company endpoint returned 404 for ${candidate.companyUrl}`,
                timestamp: new Date().toISOString()
            };
        }
        if (!response.ok) {
            throw new Error(`RapidAPI company endpoint returned ${response.status} for ${candidate.companyUrl}`);
        }
        let parsed;
        try {
            parsed = text ? JSON.parse(text) : {};
        }
        catch (error) {
            throw new Error(`RapidAPI company endpoint returned invalid JSON for ${candidate.companyUrl}`, { cause: error });
        }
        // A literal `null` body is treated like an empty object so the parser reports
        // the missing data itself.
        return parseRapidApiCompanyProfile(parsed ?? {}, candidate);
    }
    const response = await fetchHtml(candidate.companyUrl);
    if (response.status === 404 || response.status === 410) {
        return {
            id: candidate.companyId,
            name: candidate.companyName,
            unavailable: true,
            error: `LinkedIn company page returned ${response.status}`,
            timestamp: new Date().toISOString()
        };
    }
    if (response.status >= 400) {
        throw new Error(`LinkedIn company page returned ${response.status} for ${candidate.companyUrl}`);
    }
    const parsed = parseLinkedInCompanyPage(response.html, candidate.companyUrl);
    return {
        ...parsed,
        // The candidate's id is authoritative; it replaces whatever id the page parser resolved.
        id: candidate.companyId,
        name: parsed.name ?? candidate.companyName
    };
}
|
|
488
|
+
/**
 * Maps `values` through an async `mapper`, running at most `concurrency` calls at a time.
 *
 * @param {Array} values - Inputs; each one is passed to `mapper` as its only argument.
 * @param {number} concurrency - Maximum number of simultaneous calls. Values below 1 are
 *   raised to 1, fractions are rounded down, and a value that is not a number falls back to 1.
 * @param {Function} mapper - Async function applied to each input.
 * @returns {Promise<Array>} Results in the same order as `values`.
 *   Rejects with the first error thrown by `mapper`.
 */
async function mapWithConcurrency(values, concurrency, mapper) {
    const results = new Array(values.length);
    // A NaN worker count would start no workers at all and hand back an unfilled array,
    // so anything that is not a number is treated as "one at a time".
    const requested = Number(concurrency);
    const limit = Number.isNaN(requested) ? 1 : Math.floor(requested);
    const workerCount = Math.max(1, Math.min(limit, values.length || 1));
    let currentIndex = 0;
    async function worker() {
        while (currentIndex < values.length) {
            // Claim the index before awaiting so no two workers process the same item.
            const index = currentIndex;
            currentIndex += 1;
            results[index] = await mapper(values[index]);
        }
    }
    const workers = Array.from({ length: workerCount }, () => worker());
    await Promise.all(workers);
    return results;
}
|
|
502
|
+
/**
 * Scrapes a client's backlog of companies and merges the resulting profiles into BigQuery.
 *
 * @param {object} options
 * @param {number} options.clientId - Client whose backlog is processed.
 * @param {number} [options.limit] - Maximum number of candidates; 25 when null or undefined.
 * @param {number} [options.concurrency] - Simultaneous scrapes; 4 when null or undefined.
 * @param {boolean} [options.dryRun] - When truthy, nothing is written and the merge SQL is
 *   returned instead.
 * @returns {Promise<object>} `{ clientId, candidates, results }`. When it is a dry run, or no
 *   scrape succeeded, the object also has `mergeSql` (`undefined` if there were no profiles).
 *   A scrape that throws is reported in `results` with its error message and does not stop
 *   the others.
 */
export async function backfillLinkedInCompanies(options) {
    const limit = options.limit ?? 25;
    const concurrency = options.concurrency ?? 4;
    const bigQuery = createLinkedInCompaniesBigQueryClient();
    const candidates = await fetchLinkedInCompanyBackfillCandidates(bigQuery, options.clientId, limit);

    const attemptScrape = async (candidate) => {
        try {
            const profile = await scrapeLinkedInCompany(candidate);
            return { candidate, ok: true, profile };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : "LinkedIn company scrape failed";
            return { candidate, ok: false, error: message };
        }
    };
    const attempts = await mapWithConcurrency(candidates, concurrency, attemptScrape);

    const profiles = [];
    const results = [];
    for (const attempt of attempts) {
        const { candidate } = attempt;
        if (attempt.ok) {
            profiles.push(attempt.profile);
            results.push({
                companyId: candidate.companyId,
                companyUrl: candidate.companyUrl,
                companyName: attempt.profile.name,
                unavailable: attempt.profile.unavailable,
                error: attempt.profile.error
            });
        }
        else {
            results.push({
                companyId: candidate.companyId,
                companyUrl: candidate.companyUrl,
                companyName: candidate.companyName,
                unavailable: false,
                error: attempt.error
            });
        }
    }

    const hasProfiles = profiles.length > 0;
    if (!options.dryRun && hasProfiles) {
        await bigQuery.query({
            query: buildLinkedInCompaniesMergeSql(profiles),
            useLegacySql: false
        });
        return {
            clientId: options.clientId,
            candidates,
            results
        };
    }
    // Dry run or nothing to write: report the SQL that would have been executed, if any.
    return {
        clientId: options.clientId,
        candidates,
        results,
        mergeSql: hasProfiles ? buildLinkedInCompaniesMergeSql(profiles) : undefined
    };
}
|
|
@@ -547,10 +547,15 @@ async function findCategoryByCode(categoryCode, fetchHtml) {
|
|
|
547
547
|
}
|
|
548
548
|
});
|
|
549
549
|
for (const url of urls) {
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
550
|
+
try {
|
|
551
|
+
const html = await fetchHtml(url);
|
|
552
|
+
const parsed = parseLinkedInCategoryPage(html, url);
|
|
553
|
+
if (parsed.category.code === categoryCode) {
|
|
554
|
+
return parsed.category;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
catch {
|
|
558
|
+
continue;
|
|
554
559
|
}
|
|
555
560
|
}
|
|
556
561
|
throw new Error(`Could not resolve LinkedIn product category code ${categoryCode}.`);
|
|
@@ -606,9 +611,38 @@ export async function resolveLinkedInProductSource(input, fetchHtml) {
|
|
|
606
611
|
: resolvedInput.url;
|
|
607
612
|
const html = await fetchHtml(searchUrl);
|
|
608
613
|
const search = parseLinkedInProductSearchPage(html, searchUrl);
|
|
609
|
-
const
|
|
610
|
-
|
|
611
|
-
|
|
614
|
+
const rankedCandidates = (() => {
|
|
615
|
+
const best = pickBestSearchMatch(search.items, searchQuery);
|
|
616
|
+
const seen = new Set();
|
|
617
|
+
const ordered = [];
|
|
618
|
+
for (const candidate of [best, ...search.items]) {
|
|
619
|
+
if (!candidate || seen.has(candidate.productUrl)) {
|
|
620
|
+
continue;
|
|
621
|
+
}
|
|
622
|
+
seen.add(candidate.productUrl);
|
|
623
|
+
ordered.push(candidate);
|
|
624
|
+
}
|
|
625
|
+
return ordered;
|
|
626
|
+
})();
|
|
627
|
+
let product = null;
|
|
628
|
+
let matched = null;
|
|
629
|
+
let lastError = null;
|
|
630
|
+
for (const candidate of rankedCandidates) {
|
|
631
|
+
try {
|
|
632
|
+
const productHtml = await fetchHtml(candidate.productUrl);
|
|
633
|
+
product = parseLinkedInProductPage(productHtml, candidate.productUrl);
|
|
634
|
+
matched = candidate;
|
|
635
|
+
break;
|
|
636
|
+
}
|
|
637
|
+
catch (error) {
|
|
638
|
+
lastError = error instanceof Error ? error.message : String(error);
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
if (!product || !matched) {
|
|
642
|
+
throw new Error(lastError
|
|
643
|
+
? `Could not resolve a LinkedIn product page from search results. Last error: ${lastError}`
|
|
644
|
+
: "Could not resolve a LinkedIn product page from search results.");
|
|
645
|
+
}
|
|
612
646
|
return {
|
|
613
647
|
input,
|
|
614
648
|
kind: resolvedInput.kind,
|
|
@@ -670,14 +704,19 @@ export async function crawlLinkedInProductCategory(options) {
|
|
|
670
704
|
const itemsByUrl = new Map();
|
|
671
705
|
let totalPagesFetched = 0;
|
|
672
706
|
if (options.enrichDetails !== false && source.productUrl) {
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
707
|
+
try {
|
|
708
|
+
const sourceDetailHtml = await fetchHtml(source.productUrl);
|
|
709
|
+
const sourceDetail = parseLinkedInProductPage(sourceDetailHtml, source.productUrl);
|
|
710
|
+
itemsByUrl.set(sourceDetail.productUrl, toRecordFromProductPage(sourceDetail, {
|
|
711
|
+
rawPayload: {
|
|
712
|
+
source: "resolved-product-page",
|
|
713
|
+
resolvedFromInput: true
|
|
714
|
+
}
|
|
715
|
+
}));
|
|
716
|
+
}
|
|
717
|
+
catch {
|
|
718
|
+
// Keep crawling the category even if the resolved source product detail page is temporarily unavailable.
|
|
719
|
+
}
|
|
681
720
|
}
|
|
682
721
|
for (let pageNumber = 1; pageNumber <= maxPages; pageNumber += 1) {
|
|
683
722
|
const pageUrl = buildCategoryPageUrl(source.category.url, pageNumber);
|
|
@@ -722,9 +761,20 @@ export async function crawlLinkedInProductCategory(options) {
|
|
|
722
761
|
}
|
|
723
762
|
if (options.enrichDetails !== false && items.length > 0) {
|
|
724
763
|
const detailed = await mapWithConcurrency(items, options.detailConcurrency ?? 4, async (item) => {
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
764
|
+
try {
|
|
765
|
+
const html = await fetchHtml(item.productUrl);
|
|
766
|
+
const detail = parseLinkedInProductPage(html, item.productUrl);
|
|
767
|
+
return mergeDetailIntoRecord(item, detail);
|
|
768
|
+
}
|
|
769
|
+
catch (error) {
|
|
770
|
+
return {
|
|
771
|
+
...item,
|
|
772
|
+
rawPayload: {
|
|
773
|
+
...(item.rawPayload && typeof item.rawPayload === "object" ? item.rawPayload : {}),
|
|
774
|
+
detailFetchError: error instanceof Error ? error.message : String(error)
|
|
775
|
+
}
|
|
776
|
+
};
|
|
777
|
+
}
|
|
728
778
|
});
|
|
729
779
|
items = detailed;
|
|
730
780
|
}
|