salesprompter-cli 0.1.23 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,550 @@
1
+ import { load } from "cheerio";
2
+ import { BigQuery } from "@google-cloud/bigquery";
3
// Base URL used when SALESPROMPTER_LINKEDIN_BASE_URL is not set.
const DEFAULT_LINKEDIN_BASE_URL = "https://www.linkedin.com";
// Browser-like User-Agent header sent with outgoing HTTP requests.
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36";
// RapidAPI host and endpoint used unless overridden through the environment.
const DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_HOST = "web-scraping-api2.p.rapidapi.com";
const DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT = "https://web-scraping-api2.p.rapidapi.com/get-company-by-linkedinurl";
// First project id found in the environment, falling back to "icpidentifier".
const DEFAULT_BIGQUERY_PROJECT_ID = [
  process.env.BQ_PROJECT_ID,
  process.env.GOOGLE_CLOUD_PROJECT,
  process.env.GCLOUD_PROJECT,
].find((projectId) => projectId !== undefined && projectId !== null) ?? "icpidentifier";
// Fully-qualified (already back-quoted) BigQuery tables used by the SQL builders.
const LINKEDIN_COMPANIES_TABLE = "`icpidentifier.SalesGPT.linkedin_companies_processed`";
const LEADPOOL_TABLE = "`icpidentifier.SalesGPT.leadPool_new`";
10
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param {string | null | undefined} value - Text to clean; nullish is treated as "".
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeWhitespace(value) {
  const text = value ?? "";
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed.trim();
}
13
/**
 * Resolves the LinkedIn base URL, honouring the SALESPROMPTER_LINKEDIN_BASE_URL override.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read from.
 * @returns {string} The override without trailing slashes, or the default base URL.
 */
function getLinkedInBaseUrl(env = process.env) {
  const configured = env.SALESPROMPTER_LINKEDIN_BASE_URL?.trim();
  return configured ? configured.replace(/\/+$/, "") : DEFAULT_LINKEDIN_BASE_URL;
}
20
/**
 * Builds the company page URL for a LinkedIn company id.
 *
 * @param {number | string} companyId - Identifier placed after `/company/`.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment used to resolve the base URL.
 * @returns {string} The company URL without a trailing slash.
 */
function buildLinkedInCompanyUrl(companyId, env = process.env) {
  const base = `${getLinkedInBaseUrl(env)}/`;
  // The path starts with "/", so it replaces any path component of the base URL.
  const companyPage = new URL(`/company/${companyId}/`, base);
  return companyPage.toString().replace(/\/+$/, "");
}
23
/**
 * Extracts the vanity handle that follows the `/company/` segment of a URL.
 *
 * @param {string | undefined} url - Absolute URL of a company page.
 * @returns {string | undefined} The lower-cased handle, or undefined when the URL is
 *   missing, cannot be parsed, has no `/company/<handle>` segment, or the segment is a
 *   purely numeric id.
 */
function getCompanyHandleFromUrl(url) {
  if (!url) {
    return undefined;
  }
  let pathname;
  try {
    ({ pathname } = new URL(url));
  }
  catch {
    // Relative or malformed URLs carry no usable handle; the handle is optional,
    // so report "none" instead of aborting the caller (mirrors extractDomainFromWebsite).
    return undefined;
  }
  const segments = pathname.split("/").filter(Boolean);
  const companyIndex = segments.findIndex((segment) => segment.toLowerCase() === "company");
  if (companyIndex === -1) {
    return undefined;
  }
  const handle = normalizeWhitespace(segments[companyIndex + 1]);
  if (!handle || /^\d+$/.test(handle)) {
    return undefined;
  }
  return handle.toLowerCase();
}
39
/**
 * Derives a bare domain (lower-case, without a leading "www.") from a website value.
 *
 * @param {string | undefined} website - Website URL, with or without a scheme.
 * @returns {string | undefined} The domain, or undefined when none can be determined.
 */
function extractDomainFromWebsite(website) {
  if (!website) {
    return undefined;
  }
  const raw = String(website).trim();
  const hostnameOf = (candidate) => {
    try {
      return new URL(candidate).hostname;
    }
    catch {
      return "";
    }
  };
  let hostname = hostnameOf(raw);
  if (!hostname) {
    // Values such as "www.example.com" have no scheme and fail to parse on their own.
    // Retry as https, but only trust the guess when it looks like a real domain
    // (contains a dot) so placeholders like "n/a" are not reported as domains.
    const guessed = hostnameOf(`https://${raw}`);
    hostname = guessed.includes(".") ? guessed : "";
  }
  if (!hostname) {
    return undefined;
  }
  return hostname.replace(/^www\./i, "").toLowerCase();
}
50
/**
 * Reads the RapidAPI company-lookup settings from the environment.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read from.
 * @returns {{ apiKey: string, host: string, endpoint: string } | null} The settings, or
 *   null when RAPIDAPI_KEY is missing or blank.
 */
function getRapidApiLinkedInCompanyConfig(env = process.env) {
  const readSetting = (name) => env[name]?.trim();
  const apiKey = readSetting("RAPIDAPI_KEY");
  if (!apiKey) {
    return null;
  }
  const host = readSetting("RAPIDAPI_LINKEDIN_COMPANY_HOST") || DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_HOST;
  const endpoint = readSetting("RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT") || DEFAULT_RAPIDAPI_LINKEDIN_COMPANY_ENDPOINT;
  return { apiKey, host, endpoint };
}
61
/**
 * Finds the first JSON-LD item of type "Organization" embedded in the page.
 *
 * Each `<script type="application/ld+json">` element is parsed independently and
 * scripts that are empty or contain invalid JSON are skipped. Only top-level items
 * (the parsed value itself, or the entries of a top-level array) are inspected;
 * nested containers are not traversed.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded document.
 * @returns {object | null} The matching JSON-LD object, or null when there is none.
 */
function parseJsonLdOrganization($) {
  const scripts = $('script[type="application/ld+json"]')
    .map((_, element) => $(element).text())
    .get();
  // JSON-LD allows "@type" to be a single string or an array of type names.
  const isOrganization = (item) => {
    const declared = item["@type"];
    const types = Array.isArray(declared) ? declared : [declared ?? ""];
    return types.some((type) => normalizeWhitespace(String(type)).toLowerCase() === "organization");
  };
  for (const rawText of scripts) {
    const text = rawText.trim();
    if (!text) {
      continue;
    }
    try {
      const parsed = JSON.parse(text);
      const items = Array.isArray(parsed) ? parsed : [parsed];
      const match = items.find((item) => item && typeof item === "object" && isOrganization(item));
      if (match) {
        return match;
      }
    }
    catch {
      // Malformed JSON-LD in one script must not prevent reading the others.
      continue;
    }
  }
  return null;
}
90
/**
 * Collects `<dt>`/`<dd>` pairs from the document into a lookup table.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded document.
 * @returns {Map<string, string>} Lower-cased, whitespace-normalized term mapped to its
 *   whitespace-normalized description; pairs with an empty term or description are
 *   omitted and later duplicates replace earlier ones.
 */
function readDefinitionMap($) {
  const definitions = new Map();
  $("dt").each((_index, termElement) => {
    const termNode = $(termElement);
    const key = normalizeWhitespace(termNode.text()).toLowerCase();
    const text = normalizeWhitespace(termNode.next("dd").text());
    if (key !== "" && text !== "") {
      definitions.set(key, text);
    }
  });
  return definitions;
}
101
/**
 * Parses the first number found in an employee-count label.
 *
 * Thousands separators (",", "." or whitespace followed by exactly three digits) are
 * accepted inside the number. Only the first number is used, so a range such as
 * "51-200" yields 51 rather than the digits of both bounds fused together.
 *
 * @param {string | undefined} value - Label text such as "1,234 employees".
 * @returns {number | undefined} The parsed count, or undefined when no digits are present.
 */
function parseEmployeeCount(value) {
  const match = String(value ?? "").match(/\d{1,3}(?:[,.\s]\d{3})+(?!\d)|\d+/);
  if (!match) {
    return undefined;
  }
  const parsed = Number(match[0].replace(/\D/g, ""));
  return Number.isFinite(parsed) ? parsed : undefined;
}
109
/**
 * Extracts a four-digit year beginning with 19 or 20 from free text.
 *
 * @param {string | undefined} value - Text such as "Founded 1998".
 * @returns {number | undefined} The first matching year, or undefined when none is found.
 */
function parseFoundedYear(value) {
  const [year] = normalizeWhitespace(value).match(/\b(19|20)\d{2}\b/) ?? [];
  return year === undefined ? undefined : Number(year);
}
116
/**
 * Extracts a company profile from the HTML of a LinkedIn company page.
 *
 * Values are taken from the page's JSON-LD Organization item first, then from the
 * `<dt>`/`<dd>` definition list and a few known headline/meta selectors.
 *
 * @param {string} html - Raw page markup.
 * @param {string} requestUrl - URL the page was requested from; used when the page has
 *   no canonical link and as the fallback source of the numeric company id.
 * @returns {object} Profile with `id`, `handle`, `name`, `industry`, `companySize`,
 *   `headquarters`, `website`, `domain`, `employeesOnLinkedIn`, `description`,
 *   `tagline`, `specialties`, `founded`, `unavailable`, `error` and `timestamp`.
 * @throws {Error} When neither the canonical URL nor `requestUrl` contains a numeric
 *   `/company/<id>` segment.
 */
export function parseLinkedInCompanyPage(html, requestUrl) {
  const $ = load(html);
  const timestamp = new Date().toISOString();
  const canonicalUrl = normalizeWhitespace($('link[rel="canonical"]').attr("href")) || requestUrl;
  const pageTitle = normalizeWhitespace($("title").text()).toLowerCase();
  const pageBody = normalizeWhitespace($("body").text()).toLowerCase();
  const unavailable = pageTitle.includes("page not found") ||
    (pageTitle.includes("linkedin") && pageTitle.includes("unavailable")) ||
    pageBody.includes("page isn't available") ||
    pageBody.includes("this page doesn't exist");
  const jsonLd = parseJsonLdOrganization($);
  const definitions = readDefinitionMap($);
  const companyIdPattern = /\/company\/(\d+)(?:\/|$)/i;
  const requestIdMatch = requestUrl.match(companyIdPattern);
  const canonicalIdMatch = canonicalUrl.match(companyIdPattern);
  const resolvedId = Number(canonicalIdMatch?.[1] ?? requestIdMatch?.[1]);
  if (!Number.isFinite(resolvedId)) {
    throw new Error(`LinkedIn company URL is missing a numeric company id: ${requestUrl}`);
  }
  const name = normalizeWhitespace(String(jsonLd?.name ?? "")) ||
    normalizeWhitespace($("h1").first().text()) ||
    normalizeWhitespace($('meta[property="og:title"]').attr("content")) ||
    undefined;
  // The JSON-LD `url` may point back at the LinkedIn company page itself. Such a URL
  // is not the company's own website and would produce the domain "linkedin.com",
  // so it is skipped in favour of the definition list.
  const isLinkedInCompanyPageUrl = (candidate) => {
    try {
      const { hostname, pathname } = new URL(candidate);
      return /(^|\.)linkedin\.com$/i.test(hostname) && /^\/company(\/|$)/i.test(pathname);
    }
    catch {
      return false;
    }
  };
  const jsonLdUrl = normalizeWhitespace(String(jsonLd?.url ?? ""));
  const website = (isLinkedInCompanyPageUrl(jsonLdUrl) ? "" : jsonLdUrl) ||
    normalizeWhitespace(definitions.get("website")) ||
    undefined;
  const description = normalizeWhitespace(String(jsonLd?.description ?? "")) ||
    normalizeWhitespace($('meta[name="description"]').attr("content")) ||
    undefined;
  const tagline = normalizeWhitespace($(".top-card-layout__headline").first().text()) ||
    normalizeWhitespace($(".org-top-card-summary__tagline").first().text()) ||
    undefined;
  const industry = normalizeWhitespace(definitions.get("industry")) || undefined;
  const companySize = normalizeWhitespace(definitions.get("company size")) || undefined;
  const headquarters = normalizeWhitespace(definitions.get("headquarters")) || undefined;
  const specialties = normalizeWhitespace(definitions.get("specialties")) || undefined;
  const employeesOnLinkedIn = parseEmployeeCount(definitions.get("employees"));
  const founded = parseFoundedYear(definitions.get("founded"));
  return {
    id: resolvedId,
    handle: getCompanyHandleFromUrl(canonicalUrl),
    name,
    industry,
    companySize,
    headquarters,
    website,
    domain: extractDomainFromWebsite(website),
    employeesOnLinkedIn,
    description,
    tagline,
    specialties,
    founded,
    unavailable,
    error: unavailable ? "LinkedIn company page is unavailable" : undefined,
    timestamp
  };
}
172
/**
 * Converts a RapidAPI company response into the profile shape used by this module.
 *
 * @param {{ data?: object } | null | undefined} payload - Parsed JSON response body.
 * @param {{ companyId: number, companyName?: string, companyUrl: string }} candidate -
 *   Backlog entry the request was made for; supplies fallbacks for id, name and URL.
 * @returns {object} Profile with `unavailable: false` and a fresh ISO `timestamp`.
 * @throws {Error} When the payload has no `data`, or no numeric company id can be
 *   determined from the response or the candidate.
 */
function parseRapidApiCompanyProfile(payload, candidate) {
  const data = payload?.data;
  if (!data) {
    throw new Error(`RapidAPI company response did not include data for ${candidate.companyUrl}`);
  }
  const id = Number(data.company_id ?? candidate.companyId);
  if (!Number.isFinite(id)) {
    throw new Error(`RapidAPI company response is missing a numeric company id for ${candidate.companyUrl}`);
  }
  // The response is external input: tolerate list-valued or numeric fields instead of
  // crashing inside normalizeWhitespace, which only accepts strings.
  const toText = (value) => {
    if (Array.isArray(value)) {
      return value.map(toText).filter(Boolean).join(", ");
    }
    if (typeof value === "string") {
      return normalizeWhitespace(value);
    }
    if (typeof value === "number" && Number.isFinite(value)) {
      return String(value);
    }
    return "";
  };
  const toFiniteNumber = (value) => (typeof value === "number" && Number.isFinite(value) ? value : undefined);
  const website = toText(data.website) || undefined;
  const linkedinUrl = toText(data.linkedin_url) || candidate.companyUrl;
  const industries = Array.isArray(data.industries) ? data.industries.map(toText).filter(Boolean) : [];
  return {
    id,
    handle: getCompanyHandleFromUrl(linkedinUrl),
    name: toText(data.company_name) || candidate.companyName,
    industry: industries[0],
    companySize: toText(data.employee_range) || undefined,
    headquarters: toText(data.hq_full_address) || undefined,
    website,
    domain: toText(data.domain) || extractDomainFromWebsite(website),
    employeesOnLinkedIn: toFiniteNumber(data.employee_count),
    description: toText(data.description) || undefined,
    tagline: toText(data.tagline) || undefined,
    specialties: toText(data.specialties) || undefined,
    founded: toFiniteNumber(data.year_founded),
    unavailable: false,
    timestamp: new Date().toISOString()
  };
}
202
/**
 * Builds the query that lists companies of one client still waiting to be crawled.
 *
 * A company qualifies when it is flagged `company_toBeCrawled` or carries one of the
 * "unavailable"/"to be crawled" filter labels, and no processed row exists whose
 * `query` equals its `https://www.linkedin.com/company/<id>` URL.
 *
 * @param {number | string} clientId - Integer client id (digit-only strings are accepted).
 * @param {number | string} limit - Maximum number of rows; a non-negative integer.
 * @returns {string} Standard-SQL query text.
 * @throws {TypeError} When `clientId` or `limit` is not an integer; both values are
 *   embedded verbatim in the SQL, so anything else is rejected to prevent injection.
 * @throws {RangeError} When `limit` is negative.
 */
export function buildLinkedInCompanyBackfillSql(clientId, limit) {
  const toIntegerLiteral = (value, label) => {
    const numeric = typeof value === "string" && /^\s*-?\d+\s*$/.test(value) ? Number(value) : value;
    if (typeof numeric !== "number" || !Number.isSafeInteger(numeric)) {
      throw new TypeError(`${label} must be an integer.`);
    }
    return numeric;
  };
  const clientIdLiteral = toIntegerLiteral(clientId, "clientId");
  const limitLiteral = toIntegerLiteral(limit, "limit");
  if (limitLiteral < 0) {
    throw new RangeError("limit must not be negative.");
  }
  return `WITH backlog AS (
  SELECT
    companyId,
    ANY_VALUE(companyName) AS companyName,
    ANY_VALUE(company_filter) AS companyFilter
  FROM ${LEADPOOL_TABLE}
  WHERE clientId = ${clientIdLiteral}
    AND companyId IS NOT NULL
    AND (
      COALESCE(company_toBeCrawled, FALSE) = TRUE
      OR LOWER(CAST(company_filter AS STRING)) IN (
        '02: company unavailable',
        '02: linkedin companies unavailable',
        '03: company to be crawled'
      )
    )
  GROUP BY companyId
)
SELECT
  companyId,
  companyName,
  companyFilter
FROM backlog
WHERE NOT EXISTS (
  SELECT 1
  FROM ${LINKEDIN_COMPANIES_TABLE} processed
  WHERE processed.query = CONCAT('https://www.linkedin.com/company/', CAST(backlog.companyId AS STRING))
)
ORDER BY companyId
LIMIT ${limitLiteral}`;
}
234
/**
 * Renders a value as a single-quoted SQL string literal.
 *
 * @param {unknown} value - Value to embed; non-strings are converted with String().
 * @returns {string} "NULL" for undefined/null, otherwise the quoted literal with
 *   backslashes, single quotes, carriage returns and newlines escaped.
 */
function sqlString(value) {
  if (value === undefined || value === null) {
    return "NULL";
  }
  // Backslashes first, so the escapes added afterwards are not escaped again.
  // Line breaks are escaped because a single-quoted literal cannot span lines.
  const escaped = String(value)
    .replaceAll("\\", "\\\\")
    .replaceAll("'", "\\'")
    .replaceAll("\r", "\\r")
    .replaceAll("\n", "\\n");
  return `'${escaped}'`;
}
240
/**
 * Renders a number as an integer SQL literal, dropping any fractional part.
 *
 * @param {unknown} value - Candidate number.
 * @returns {string} The truncated integer as text, or "NULL" for anything that is not
 *   a finite number.
 */
function sqlInteger(value) {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return "NULL";
  }
  return String(Math.trunc(value));
}
243
/**
 * Renders a value as a SQL boolean literal based on its truthiness.
 *
 * @param {unknown} value - Value to test.
 * @returns {"TRUE" | "FALSE"} "TRUE" for truthy input, "FALSE" otherwise.
 */
function sqlBoolean(value) {
  if (value) {
    return "TRUE";
  }
  return "FALSE";
}
246
/**
 * Builds a MERGE statement that upserts scraped company profiles into the processed
 * companies table, matching on `id`.
 *
 * Rows sharing the same numeric id are collapsed to the last occurrence, because a
 * MERGE fails when several source rows match one target row.
 *
 * NOTE(review): on MATCHED every non-key column is overwritten, including the columns
 * this builder always sets to NULL/FALSE — confirm that resetting them is intended.
 *
 * @param {Array<object>} rows - Profiles as produced by the scrapers in this module.
 * @returns {string} Standard-SQL MERGE statement.
 * @throws {Error} When `rows` is empty.
 */
export function buildLinkedInCompaniesMergeSql(rows) {
  if (rows.length === 0) {
    throw new Error("At least one LinkedIn company row is required for merge SQL.");
  }
  const nullValue = () => "NULL";
  const falseValue = () => "FALSE";
  // Single source of truth for the column order and how each value is rendered; the
  // STRUCT, UPDATE SET, INSERT and VALUES lists below are all derived from it.
  const columns = [
    ["id", (row) => sqlInteger(row.id)],
    ["handle", (row) => sqlString(row.handle)],
    ["name", (row) => sqlString(row.name)],
    ["industryId", nullValue],
    ["industry", (row) => sqlString(row.industry)],
    ["industry_input", nullValue],
    ["companySizeId", nullValue],
    ["companySize", (row) => sqlString(row.companySize)],
    ["headquarters", (row) => sqlString(row.headquarters)],
    ["countryCode", nullValue],
    ["website_linkedin", (row) => sqlString(row.website)],
    ["domain_linkedin", (row) => sqlString(row.domain)],
    ["hunter_emailCount", nullValue],
    ["domainFinder_name", nullValue],
    ["domain", nullValue],
    ["emailDomainFinder_ts", nullValue],
    ["company_missingHeadquarters", nullValue],
    ["blacklisted_bySalesPrompter", falseValue],
    ["company_countryCodeToBeProcessed", falseValue],
    ["domainBlacklisted", falseValue],
    ["company_emailDomainFinder_toBeProcessed", falseValue],
    ["company_emailDomainNotFound", falseValue],
    ["employeesOnLinkedIn", (row) => sqlInteger(row.employeesOnLinkedIn)],
    ["growth6Mth", nullValue],
    ["growth1Yr", nullValue],
    ["growth2Yr", nullValue],
    ["description", (row) => sqlString(row.description)],
    ["tagline", (row) => sqlString(row.tagline)],
    ["specialties", (row) => sqlString(row.specialties)],
    ["founded", (row) => sqlInteger(row.founded)],
    ["error", (row) => sqlString(row.error)],
    ["linkedin_companies_companyUnavailable", (row) => sqlBoolean(row.unavailable)],
    ["timestamp", (row) => `TIMESTAMP(${sqlString(row.timestamp)})`],
  ];
  // Keep only the last row per numeric id; rows without a usable id get a unique key
  // so they are never merged with each other.
  const latestById = new Map();
  rows.forEach((row, index) => {
    const hasNumericId = typeof row.id === "number" && Number.isFinite(row.id);
    latestById.set(hasNumericId ? Math.trunc(row.id) : `row-${index}`, row);
  });
  const sourceRows = [...latestById.values()]
    .map((row) => {
      const fields = columns.map(([column, render]) => `  ${render(row)} AS ${column}`);
      return `STRUCT(\n${fields.join(",\n")}\n)`;
    })
    .join(",\n");
  const columnNames = columns.map(([column]) => column);
  const assignments = columnNames
    .filter((column) => column !== "id")
    .map((column) => `  ${column} = source.${column}`)
    .join(",\n");
  const insertColumns = columnNames.map((column) => `  ${column}`).join(",\n");
  const insertValues = columnNames.map((column) => `  source.${column}`).join(",\n");
  return `MERGE ${LINKEDIN_COMPANIES_TABLE} AS target
USING (
SELECT *
FROM UNNEST([
${sourceRows}
])
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET
${assignments}
WHEN NOT MATCHED THEN INSERT (
${insertColumns}
) VALUES (
${insertValues}
)`;
}
398
/**
 * Creates the BigQuery client used for the LinkedIn company tables.
 *
 * When GOOGLE_SERVICE_ACCOUNT_KEY is set it is parsed as service-account JSON and passed
 * as explicit credentials; otherwise the client is created with the project id only.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read from.
 * @returns {BigQuery} A client bound to the default project id.
 * @throws {Error} When GOOGLE_SERVICE_ACCOUNT_KEY is set but is not valid JSON.
 */
export function createLinkedInCompaniesBigQueryClient(env = process.env) {
  const credentialsJson = env.GOOGLE_SERVICE_ACCOUNT_KEY?.trim();
  if (!credentialsJson) {
    return new BigQuery({
      projectId: DEFAULT_BIGQUERY_PROJECT_ID
    });
  }
  // Keys pasted into environment variables often contain real line breaks inside
  // "private_key", which is invalid JSON. Re-escape them, allowing optional whitespace
  // around the colon (pretty-printed JSON) and Windows line endings.
  const normalizedCredentialsJson = credentialsJson.replace(/("private_key"\s*:\s*")([\s\S]*?)"/, (_, prefix, privateKey) => `${prefix}${privateKey.replace(/\r?\n/g, "\\n")}"`);
  let credentials;
  try {
    credentials = JSON.parse(normalizedCredentialsJson);
  }
  catch (error) {
    throw new Error("GOOGLE_SERVICE_ACCOUNT_KEY is not valid JSON.", { cause: error });
  }
  return new BigQuery({
    projectId: DEFAULT_BIGQUERY_PROJECT_ID,
    credentials
  });
}
411
/**
 * Loads the crawl backlog for a client and converts each row into a scrape candidate.
 *
 * @param {{ query: Function }} bigQuery - Client whose `query()` resolves to `[rows]`.
 * @param {number | string} clientId - Client whose backlog is requested.
 * @param {number | string} limit - Maximum number of candidates.
 * @returns {Promise<Array<{ companyId: number, companyName?: string, companyFilter?: string, companyUrl: string }>>}
 *   Candidates in the order returned by the query.
 * @throws {Error} When a row's `companyId` is not numeric.
 */
export async function fetchLinkedInCompanyBackfillCandidates(bigQuery, clientId, limit) {
  const backlogSql = buildLinkedInCompanyBackfillSql(clientId, limit);
  const [backlogRows] = await bigQuery.query({ query: backlogSql, useLegacySql: false });
  const toOptionalText = (raw) => normalizeWhitespace(String(raw ?? "")) || undefined;
  const toCandidate = (row) => {
    const companyId = Number(row.companyId);
    if (!Number.isFinite(companyId)) {
      throw new Error(`BigQuery backlog row is missing a numeric companyId: ${JSON.stringify(row)}`);
    }
    const companyName = toOptionalText(row.companyName);
    const companyFilter = toOptionalText(row.companyFilter);
    const companyUrl = buildLinkedInCompanyUrl(companyId);
    return { companyId, companyName, companyFilter, companyUrl };
  };
  return backlogRows.map((row) => toCandidate(row));
}
429
/**
 * Downloads a page with the module's default User-Agent header.
 *
 * @param {string | URL} url - Address to request.
 * @returns {Promise<{ status: number, html: string }>} The HTTP status and response
 *   body; non-2xx responses are returned, not thrown.
 */
async function fetchHtml(url) {
  const requestHeaders = { "User-Agent": DEFAULT_USER_AGENT };
  const reply = await fetch(url, { headers: requestHeaders });
  const { status } = reply;
  const html = await reply.text();
  return { status, html };
}
440
/**
 * Retrieves the profile of one backlog company.
 *
 * When a RapidAPI key is configured the company endpoint is queried; otherwise the
 * LinkedIn company page is fetched and parsed. A 404 (or 410 for the page) is reported
 * as an "unavailable" profile instead of an error.
 *
 * @param {{ companyId: number, companyName?: string, companyUrl: string }} candidate -
 *   Company to look up.
 * @returns {Promise<object>} The company profile, or a stub with `unavailable: true`.
 * @throws {Error} On any other failing HTTP status, or when a successful RapidAPI
 *   response is not valid JSON.
 */
export async function scrapeLinkedInCompany(candidate) {
  const unavailableProfile = (error) => ({
    id: candidate.companyId,
    name: candidate.companyName,
    unavailable: true,
    error,
    timestamp: new Date().toISOString()
  });
  const rapidApiConfig = getRapidApiLinkedInCompanyConfig();
  if (rapidApiConfig) {
    const url = new URL(rapidApiConfig.endpoint);
    url.searchParams.set("linkedin_url", candidate.companyUrl);
    const response = await fetch(url, {
      headers: {
        "x-rapidapi-key": rapidApiConfig.apiKey,
        "x-rapidapi-host": rapidApiConfig.host,
        "User-Agent": DEFAULT_USER_AGENT
      }
    });
    const text = await response.text();
    // Check the status before parsing: error responses are not guaranteed to carry
    // JSON, and a parse failure must not mask a 404 or the real HTTP status.
    if (response.status === 404) {
      return unavailableProfile(`RapidAPI company endpoint returned 404 for ${candidate.companyUrl}`);
    }
    if (!response.ok) {
      throw new Error(`RapidAPI company endpoint returned ${response.status} for ${candidate.companyUrl}`);
    }
    let parsed;
    try {
      parsed = text ? JSON.parse(text) : {};
    }
    catch (error) {
      throw new Error(`RapidAPI company endpoint returned a non-JSON body for ${candidate.companyUrl}`, { cause: error });
    }
    return parseRapidApiCompanyProfile(parsed, candidate);
  }
  const response = await fetchHtml(candidate.companyUrl);
  if (response.status === 404 || response.status === 410) {
    return unavailableProfile(`LinkedIn company page returned ${response.status}`);
  }
  if (response.status >= 400) {
    throw new Error(`LinkedIn company page returned ${response.status} for ${candidate.companyUrl}`);
  }
  const parsed = parseLinkedInCompanyPage(response.html, candidate.companyUrl);
  return {
    ...parsed,
    // The backlog id is authoritative so the merged row lines up with the candidate.
    id: candidate.companyId,
    name: parsed.name ?? candidate.companyName
  };
}
488
/**
 * Maps `values` through an async `mapper`, running at most `concurrency` calls at once.
 *
 * @param {Array} values - Items to process.
 * @param {number} concurrency - Maximum number of simultaneous mapper calls. Values
 *   that are not a number of at least 1 (including NaN) fall back to 1; fractions are
 *   rounded down.
 * @param {(value: any) => Promise<any>} mapper - Called once per item.
 * @returns {Promise<Array>} Results in the same order as `values`.
 */
async function mapWithConcurrency(values, concurrency, mapper) {
  const results = new Array(values.length);
  // A NaN limit would otherwise create zero workers and silently return an array of
  // holes, so an unusable limit degrades to sequential processing instead.
  const requested = Number(concurrency);
  const limit = Number.isNaN(requested) || requested < 1 ? 1 : Math.floor(requested);
  const workerCount = Math.min(limit, Math.max(values.length, 1));
  let currentIndex = 0;
  async function worker() {
    // Claiming an index is synchronous, so two workers never process the same item.
    while (currentIndex < values.length) {
      const index = currentIndex;
      currentIndex += 1;
      results[index] = await mapper(values[index]);
    }
  }
  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);
  return results;
}
502
/**
 * Scrapes the backlog companies of a client and merges the profiles into BigQuery.
 *
 * @param {{ clientId: number | string, limit?: number, concurrency?: number, dryRun?: boolean }} options -
 *   `limit` defaults to 25 and `concurrency` to 4. With `dryRun`, or when no profile
 *   could be scraped, nothing is written and the result carries a `mergeSql` property
 *   (the statement that would run, or undefined when there are no profiles).
 * @returns {Promise<object>} `{ clientId, candidates, results }`, plus `mergeSql` in the
 *   no-write case. Scrape failures are reported per company in `results`, not thrown.
 */
export async function backfillLinkedInCompanies(options) {
  const limit = options.limit ?? 25;
  const concurrency = options.concurrency ?? 4;
  const bigQuery = createLinkedInCompaniesBigQueryClient();
  const candidates = await fetchLinkedInCompanyBackfillCandidates(bigQuery, options.clientId, limit);
  // One failing company must not abort the batch, so every outcome is captured.
  const scrapeSafely = async (candidate) => {
    try {
      const profile = await scrapeLinkedInCompany(candidate);
      return { candidate, ok: true, profile };
    }
    catch (error) {
      const message = error instanceof Error ? error.message : "LinkedIn company scrape failed";
      return { candidate, ok: false, error: message };
    }
  };
  const attempts = await mapWithConcurrency(candidates, concurrency, scrapeSafely);
  const profiles = attempts.flatMap((attempt) => (attempt.ok ? [attempt.profile] : []));
  const results = attempts.map((attempt) => {
    const { candidate } = attempt;
    const outcome = attempt.ok
      ? { companyName: attempt.profile.name, unavailable: attempt.profile.unavailable, error: attempt.profile.error }
      : { companyName: candidate.companyName, unavailable: false, error: attempt.error };
    return {
      companyId: candidate.companyId,
      companyUrl: candidate.companyUrl,
      ...outcome
    };
  });
  const summary = {
    clientId: options.clientId,
    candidates,
    results
  };
  const hasProfiles = profiles.length > 0;
  if (options.dryRun || !hasProfiles) {
    return {
      ...summary,
      mergeSql: hasProfiles ? buildLinkedInCompaniesMergeSql(profiles) : undefined
    };
  }
  await bigQuery.query({
    query: buildLinkedInCompaniesMergeSql(profiles),
    useLegacySql: false
  });
  return summary;
}
@@ -547,10 +547,15 @@ async function findCategoryByCode(categoryCode, fetchHtml) {
547
547
  }
548
548
  });
549
549
  for (const url of urls) {
550
- const html = await fetchHtml(url);
551
- const parsed = parseLinkedInCategoryPage(html, url);
552
- if (parsed.category.code === categoryCode) {
553
- return parsed.category;
550
+ try {
551
+ const html = await fetchHtml(url);
552
+ const parsed = parseLinkedInCategoryPage(html, url);
553
+ if (parsed.category.code === categoryCode) {
554
+ return parsed.category;
555
+ }
556
+ }
557
+ catch {
558
+ continue;
554
559
  }
555
560
  }
556
561
  throw new Error(`Could not resolve LinkedIn product category code ${categoryCode}.`);
@@ -606,9 +611,38 @@ export async function resolveLinkedInProductSource(input, fetchHtml) {
606
611
  : resolvedInput.url;
607
612
  const html = await fetchHtml(searchUrl);
608
613
  const search = parseLinkedInProductSearchPage(html, searchUrl);
609
- const matched = pickBestSearchMatch(search.items, searchQuery);
610
- const productHtml = await fetchHtml(matched.productUrl);
611
- const product = parseLinkedInProductPage(productHtml, matched.productUrl);
614
+ const rankedCandidates = (() => {
615
+ const best = pickBestSearchMatch(search.items, searchQuery);
616
+ const seen = new Set();
617
+ const ordered = [];
618
+ for (const candidate of [best, ...search.items]) {
619
+ if (!candidate || seen.has(candidate.productUrl)) {
620
+ continue;
621
+ }
622
+ seen.add(candidate.productUrl);
623
+ ordered.push(candidate);
624
+ }
625
+ return ordered;
626
+ })();
627
+ let product = null;
628
+ let matched = null;
629
+ let lastError = null;
630
+ for (const candidate of rankedCandidates) {
631
+ try {
632
+ const productHtml = await fetchHtml(candidate.productUrl);
633
+ product = parseLinkedInProductPage(productHtml, candidate.productUrl);
634
+ matched = candidate;
635
+ break;
636
+ }
637
+ catch (error) {
638
+ lastError = error instanceof Error ? error.message : String(error);
639
+ }
640
+ }
641
+ if (!product || !matched) {
642
+ throw new Error(lastError
643
+ ? `Could not resolve a LinkedIn product page from search results. Last error: ${lastError}`
644
+ : "Could not resolve a LinkedIn product page from search results.");
645
+ }
612
646
  return {
613
647
  input,
614
648
  kind: resolvedInput.kind,
@@ -670,14 +704,19 @@ export async function crawlLinkedInProductCategory(options) {
670
704
  const itemsByUrl = new Map();
671
705
  let totalPagesFetched = 0;
672
706
  if (options.enrichDetails !== false && source.productUrl) {
673
- const sourceDetailHtml = await fetchHtml(source.productUrl);
674
- const sourceDetail = parseLinkedInProductPage(sourceDetailHtml, source.productUrl);
675
- itemsByUrl.set(sourceDetail.productUrl, toRecordFromProductPage(sourceDetail, {
676
- rawPayload: {
677
- source: "resolved-product-page",
678
- resolvedFromInput: true
679
- }
680
- }));
707
+ try {
708
+ const sourceDetailHtml = await fetchHtml(source.productUrl);
709
+ const sourceDetail = parseLinkedInProductPage(sourceDetailHtml, source.productUrl);
710
+ itemsByUrl.set(sourceDetail.productUrl, toRecordFromProductPage(sourceDetail, {
711
+ rawPayload: {
712
+ source: "resolved-product-page",
713
+ resolvedFromInput: true
714
+ }
715
+ }));
716
+ }
717
+ catch {
718
+ // Keep crawling the category even if the resolved source product detail page is temporarily unavailable.
719
+ }
681
720
  }
682
721
  for (let pageNumber = 1; pageNumber <= maxPages; pageNumber += 1) {
683
722
  const pageUrl = buildCategoryPageUrl(source.category.url, pageNumber);
@@ -722,9 +761,20 @@ export async function crawlLinkedInProductCategory(options) {
722
761
  }
723
762
  if (options.enrichDetails !== false && items.length > 0) {
724
763
  const detailed = await mapWithConcurrency(items, options.detailConcurrency ?? 4, async (item) => {
725
- const html = await fetchHtml(item.productUrl);
726
- const detail = parseLinkedInProductPage(html, item.productUrl);
727
- return mergeDetailIntoRecord(item, detail);
764
+ try {
765
+ const html = await fetchHtml(item.productUrl);
766
+ const detail = parseLinkedInProductPage(html, item.productUrl);
767
+ return mergeDetailIntoRecord(item, detail);
768
+ }
769
+ catch (error) {
770
+ return {
771
+ ...item,
772
+ rawPayload: {
773
+ ...(item.rawPayload && typeof item.rawPayload === "object" ? item.rawPayload : {}),
774
+ detailFetchError: error instanceof Error ? error.message : String(error)
775
+ }
776
+ };
777
+ }
728
778
  });
729
779
  items = detailed;
730
780
  }