salesprompter-cli 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/bigquery.js +9 -4
- package/dist/cli.js +1179 -305
- package/dist/direct-path.js +310 -0
- package/dist/linkedin-products.js +715 -0
- package/dist/sales-navigator.js +475 -0
- package/package.json +2 -1
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
/** Origin against which relative LinkedIn links are resolved. */
const LINKEDIN_BASE_URL = "https://www.linkedin.com";
/** Browser-like User-Agent header value used by the default HTML fetcher. */
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36";
|
|
4
|
+
/**
 * Collapses every run of whitespace into one space and trims both ends.
 *
 * @param {string | null | undefined} value - Text to clean; null/undefined count as "".
 * @returns {string} The cleaned text, possibly empty.
 */
function normalizeWhitespace(value) {
    const text = value ?? "";
    const collapsed = text.replace(/\s+/g, " ");
    return collapsed.trim();
}
|
|
7
|
+
/**
 * Reduces a domain-like string or http(s) URL to a bare lowercase host name.
 *
 * Strips surrounding whitespace, the http/https scheme, a leading "www.",
 * anything from the first path, query, or fragment delimiter onwards, and a
 * trailing ":port".
 *
 * @param {string} value - Raw user input such as "https://www.Example.com/pricing".
 * @returns {string} The host name (for example "example.com"), or "" when nothing is left.
 */
function normalizeDomainInput(value) {
    const withoutScheme = value
        .trim()
        .toLowerCase()
        .replace(/^https?:\/\//, "")
        .replace(/^www\./, "");
    // Cut at "/", "?" or "#" so "example.com?utm=1" does not leak its query
    // string into the domain.
    const host = withoutScheme.split(/[/?#]/)[0] ?? "";
    // A port number is not part of the domain name.
    return host.replace(/:\d+$/, "");
}
|
|
15
|
+
/**
 * Resolves a link to an absolute linkedin.com URL with the query string,
 * fragment, and trailing slashes removed.
 *
 * Relative links are resolved against LINKEDIN_BASE_URL.
 *
 * @param {string | null | undefined} value - Absolute or relative link text.
 * @returns {string | undefined} The cleaned URL, or undefined when the value is
 *   empty, cannot be parsed as a URL, or its host is not linkedin.com or one of
 *   its subdomains.
 */
function toAbsoluteLinkedInUrl(value) {
    const trimmed = normalizeWhitespace(value);
    if (trimmed.length === 0) {
        return undefined;
    }
    let url;
    try {
        // The URL constructor ignores the base when `trimmed` is already
        // absolute, so one call covers both absolute and relative links.
        url = new URL(trimmed, LINKEDIN_BASE_URL);
    } catch {
        // A malformed link is reported as "no URL" (like toAbsoluteUrl does)
        // instead of surfacing a TypeError to the caller.
        return undefined;
    }
    if (!/(^|\.)linkedin\.com$/i.test(url.hostname)) {
        return undefined;
    }
    url.search = "";
    url.hash = "";
    return url.toString().replace(/\/+$/, "");
}
|
|
30
|
+
/**
 * Resolves a link to an absolute URL, using LINKEDIN_BASE_URL as the base for
 * relative links. Unlike toAbsoluteLinkedInUrl, any host is accepted and the
 * query string and fragment are kept.
 *
 * @param {string | null | undefined} value - Absolute or relative link text.
 * @returns {string | undefined} The serialized URL, or undefined when the value
 *   is empty or cannot be parsed.
 */
function toAbsoluteUrl(value) {
    const candidate = normalizeWhitespace(value);
    if (candidate.length === 0) {
        return undefined;
    }
    let resolved;
    try {
        resolved = new URL(candidate, LINKEDIN_BASE_URL);
    } catch {
        return undefined;
    }
    return resolved.toString();
}
|
|
42
|
+
/**
 * Splits the path of an absolute URL into its non-empty segments.
 *
 * @param {string} url - Absolute URL; the URL constructor throws if it is not parseable.
 * @returns {string[]} Path segments in order, without empty entries.
 */
function getLinkedInPathSegments(url) {
    const { pathname } = new URL(url);
    return pathname.split("/").filter((part) => part !== "");
}
|
|
46
|
+
/**
 * Extracts the path segment that follows "company" in a LinkedIn URL.
 *
 * @param {string | undefined} url - Absolute LinkedIn URL; falsy values yield undefined.
 * @returns {string | undefined} The lowercased handle, or undefined when the URL
 *   has no "company" segment or nothing follows it.
 */
function getCompanyHandle(url) {
    if (!url) {
        return undefined;
    }
    const parts = getLinkedInPathSegments(url);
    const position = parts.findIndex((part) => part.toLowerCase() === "company");
    if (position < 0) {
        return undefined;
    }
    const rawHandle = normalizeWhitespace(parts[position + 1]);
    if (rawHandle.length === 0) {
        return undefined;
    }
    return rawHandle.toLowerCase();
}
|
|
58
|
+
/**
 * Extracts the product slug: the path segment that follows "products".
 *
 * @param {string} url - Absolute LinkedIn product URL.
 * @returns {string} The percent-decoded, lowercased slug.
 * @throws {Error} When the URL has no "products" segment or nothing follows it.
 */
function getProductSlug(url) {
    const segments = getLinkedInPathSegments(url);
    const productsIndex = segments.findIndex((segment) => segment.toLowerCase() === "products");
    const slug = productsIndex === -1 ? "" : normalizeWhitespace(segments[productsIndex + 1]);
    if (slug.length === 0) {
        throw new Error(`LinkedIn product URL is invalid: ${url}`);
    }
    try {
        return decodeURIComponent(slug).toLowerCase();
    } catch {
        // decodeURIComponent throws URIError on a stray "%" (which the URL parser
        // leaves in the path untouched). The raw segment is still a usable slug.
        return slug.toLowerCase();
    }
}
|
|
67
|
+
/**
 * Extracts the category slug: the path segment that follows "categories".
 *
 * @param {string} url - Absolute LinkedIn category URL.
 * @returns {string} The percent-decoded, lowercased slug.
 * @throws {Error} When the URL has no "categories" segment or nothing follows it.
 */
function getCategorySlug(url) {
    const segments = getLinkedInPathSegments(url);
    const categoriesIndex = segments.findIndex((segment) => segment.toLowerCase() === "categories");
    const slug = categoriesIndex === -1 ? "" : normalizeWhitespace(segments[categoriesIndex + 1]);
    if (slug.length === 0) {
        throw new Error(`LinkedIn category URL is invalid: ${url}`);
    }
    try {
        return decodeURIComponent(slug).toLowerCase();
    } catch {
        // decodeURIComponent throws URIError on a stray "%" (which the URL parser
        // leaves in the path untouched). The raw segment is still a usable slug.
        return slug.toLowerCase();
    }
}
|
|
76
|
+
/**
 * Parses the number out of a heading such as "1,234 results" or "1 result".
 *
 * Thousands separators ("," or ".") are dropped before the digits are converted.
 *
 * @param {string | null | undefined} value - Heading text.
 * @returns {number | undefined} The count, or undefined when no
 *   "<number> result(s)" phrase is present.
 */
function parseResultCount(value) {
    // "results?" also accepts the singular form used for a single hit.
    const match = normalizeWhitespace(value).match(/([\d,.]+)\s+results?\b/i);
    if (!match) {
        return undefined;
    }
    const digits = match[1].replace(/\D/g, "");
    if (digits.length === 0) {
        // The capture held only separators, e.g. ". results".
        return undefined;
    }
    const parsed = Number(digits);
    return Number.isFinite(parsed) ? parsed : undefined;
}
|
|
88
|
+
/**
 * Reads the category code from the "#filterValues" element, whose content is
 * parsed as JSON optionally wrapped in an HTML comment.
 *
 * The code is the last ":"-separated part of `currentCategoryUrn`, or of the
 * first entry of `productCategoryUrns` when the former is absent.
 *
 * @param {Function} $ - Selector function for the loaded document; called as
 *   `$("#filterValues").html()`.
 * @returns {string | undefined} The category code, or undefined when the
 *   element is missing, empty, not valid JSON, or carries no URN.
 */
function parseCategoryCodeFromHtml($) {
    const rawValue = $("#filterValues").html();
    if (!rawValue) {
        return undefined;
    }
    // Trim first: the comment markers are anchored to the string edges, so
    // surrounding whitespace would otherwise leave them in place and break
    // JSON.parse.
    const normalized = rawValue
        .trim()
        .replace(/^<!--/, "")
        .replace(/-->$/, "")
        .trim();
    if (normalized.length === 0) {
        return undefined;
    }
    try {
        const parsed = JSON.parse(normalized);
        const currentUrn = parsed.currentCategoryUrn ?? parsed.productCategoryUrns?.[0];
        const code = currentUrn?.split(":").at(-1)?.trim();
        return code && code.length > 0 ? code : undefined;
    } catch {
        // Invalid JSON or an unexpected payload shape: treat as "no code".
        return undefined;
    }
}
|
|
107
|
+
/**
 * Picks an image URL from an element, preferring the "data-delayed-url"
 * attribute over "src".
 *
 * @param {{ attr(name: string): (string | undefined) }} element - Element wrapper exposing `attr`.
 * @returns {string | undefined} The absolute image URL, or undefined when both
 *   attributes are empty or the chosen one cannot be parsed.
 */
function getImageUrl(element) {
    for (const attribute of ["data-delayed-url", "src"]) {
        const candidate = normalizeWhitespace(element.attr(attribute));
        if (candidate.length > 0) {
            // The first non-empty attribute wins even if it fails to resolve.
            return toAbsoluteUrl(candidate);
        }
    }
    return undefined;
}
|
|
115
|
+
/**
 * Builds a category record from its URL and display name.
 *
 * @param {string} url - Absolute category URL; its slug is derived via getCategorySlug.
 * @param {string} name - Display name of the category.
 * @param {{ code?: string, description?: string, totalResults?: number }} [options] -
 *   Optional extra fields copied onto the record.
 * @returns {{ name: string, slug: string, url: string, code: (string | undefined),
 *   description: (string | undefined), totalResults: (number | undefined) }}
 * @throws {Error} When getCategorySlug rejects the URL.
 */
function buildCategoryFromUrl(url, name, options) {
    const slug = getCategorySlug(url);
    const { code, description, totalResults } = options ?? {};
    return { name, slug, url, code, description, totalResults };
}
|
|
125
|
+
/**
 * Splits a "<category> by <vendor>" subtitle at its first " by ".
 *
 * @param {string | null | undefined} subtitle - Subtitle text.
 * @returns {{ categoryName?: string, vendorName?: string }} Both names, or an
 *   empty object when the subtitle does not match the pattern.
 */
function parseSubtitleForCategory(subtitle) {
    const parts = /^(.*?)\s+by\s+(.*)$/i.exec(normalizeWhitespace(subtitle));
    if (parts === null) {
        return {};
    }
    const [, categoryPart, vendorPart] = parts;
    return {
        categoryName: normalizeWhitespace(categoryPart),
        vendorName: normalizeWhitespace(vendorPart)
    };
}
|
|
136
|
+
/**
 * Normalizes whitespace in each value and returns the distinct non-empty
 * results in first-seen order.
 *
 * @param {Iterable<string | null | undefined>} values - Raw text values.
 * @returns {string[]} Distinct, whitespace-normalized, non-empty strings.
 */
function uniqueNonEmptyText(values) {
    // A Set iterates in insertion order, which gives first-seen ordering.
    const unique = new Set();
    for (const raw of values) {
        const text = normalizeWhitespace(raw);
        if (text.length > 0) {
            unique.add(text);
        }
    }
    return [...unique];
}
|
|
152
|
+
/**
 * Returns the longest of the given texts after whitespace normalization.
 *
 * @param {...(string | null | undefined)} values - Candidate texts.
 * @returns {string | undefined} The longest non-empty text (the earliest one on
 *   a tie), or undefined when every candidate is empty.
 */
function pickLongerText(...values) {
    let longest;
    for (const value of values) {
        const text = normalizeWhitespace(value);
        if (text.length === 0) {
            continue;
        }
        // Strict ">" keeps the earliest candidate when lengths are equal.
        if (longest === undefined || text.length > longest.length) {
            longest = text;
        }
    }
    return longest;
}
|
|
159
|
+
/**
 * Converts a parsed product page into a product record.
 *
 * @param {object} detail - Parsed product page; only the fields listed in the
 *   body are copied.
 * @param {{ pageNumber?: number, positionOnPage?: number, rawPayload?: object }} [options] -
 *   Optional placement and payload; the placement values default to 1 and the
 *   payload to `{ source: "product-page" }`.
 * @returns {object} The product record.
 */
function toRecordFromProductPage(detail, options) {
    const {
        productName,
        productSlug,
        productUrl,
        imageUrl,
        description,
        vendor,
        category,
        learnMoreUrl,
        intendedRoles,
        usedBy
    } = detail;
    const pageNumber = options?.pageNumber ?? 1;
    const positionOnPage = options?.positionOnPage ?? 1;
    const rawPayload = options?.rawPayload ?? { source: "product-page" };
    return {
        productName,
        productSlug,
        productUrl,
        imageUrl,
        description,
        vendor,
        category,
        pageNumber,
        positionOnPage,
        learnMoreUrl,
        intendedRoles,
        usedBy,
        rawPayload
    };
}
|
|
178
|
+
/**
 * Parses one product card from a category or search listing.
 *
 * @param {Function} $ - Selector function for the loaded document (not used by
 *   this function; kept for the existing call signature).
 * @param {object} card - Wrapped card element exposing `find`.
 * @param {number} pageNumber - Listing page the card was found on.
 * @param {number} positionOnPage - 1-based position of the card on that page.
 * @param {object} [fallbackCategory] - Category of the listing; when given it is
 *   used as the record's category.
 * @returns {object | null} The product record, or null when the card has no
 *   usable product link, name, slug, or category.
 */
function parseProductCard($, card, pageNumber, positionOnPage, fallbackCategory) {
    const productAnchor = card.find("h3 a, .product-serp-card__image-container").first();
    const productUrl = toAbsoluteLinkedInUrl(productAnchor.attr("href"));
    const productName = normalizeWhitespace(card.find("h3").text());
    if (!productUrl || productName.length === 0) {
        return null;
    }
    let productSlug;
    try {
        productSlug = getProductSlug(productUrl);
    } catch {
        // The link is not a product URL. Skip this card like any other
        // unparseable card rather than aborting the whole page.
        return null;
    }
    const subtitle = normalizeWhitespace(card.find("h4").text());
    const subtitleParts = parseSubtitleForCategory(subtitle);
    const vendorAnchor = card.find("h4 a").first();
    const vendorName = normalizeWhitespace(vendorAnchor.text()) || subtitleParts.vendorName;
    const vendorCompanyUrl = toAbsoluteLinkedInUrl(vendorAnchor.attr("href"));
    const vendorHandle = getCompanyHandle(vendorCompanyUrl);
    const categoryName = subtitleParts.categoryName ?? fallbackCategory?.name;
    if (!categoryName || categoryName.length === 0) {
        return null;
    }
    // Without a listing category, synthesize one from the subtitle; its URL is
    // unknown, so it stays empty.
    const category = fallbackCategory ?? {
        name: categoryName,
        slug: categoryName.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, ""),
        url: ""
    };
    return {
        productName,
        productSlug,
        productUrl,
        imageUrl: getImageUrl(card.find("img").first()),
        description: normalizeWhitespace(card.find(".product-serp-card__description").text()) || undefined,
        vendor: vendorName
            ? {
                name: vendorName,
                companyUrl: vendorCompanyUrl,
                handle: vendorHandle
            }
            : undefined,
        category,
        pageNumber,
        positionOnPage,
        intendedRoles: [],
        usedBy: [],
        rawPayload: {
            subtitle,
            source: "category-card"
        }
    };
}
|
|
224
|
+
/**
 * Parses a LinkedIn product category listing page.
 *
 * @param {string} html - Page markup.
 * @param {string} requestUrl - URL the page was fetched from; used when the page
 *   has no canonical link and to derive the page number.
 * @returns {{ category: object, items: object[] }} The category record and the
 *   product cards that could be parsed.
 * @throws {Error} When no canonical URL containing "/products/categories/" can
 *   be determined, or the category slug cannot be derived from it.
 */
export function parseLinkedInCategoryPage(html, requestUrl) {
    const $ = load(html);
    const canonicalHref = $('link[rel="canonical"]').attr("href");
    const canonicalUrl = toAbsoluteLinkedInUrl(canonicalHref ?? requestUrl);
    if (!(canonicalUrl && canonicalUrl.includes("/products/categories/"))) {
        throw new Error("LinkedIn category page did not expose a canonical category URL.");
    }
    // The hero title reads "Find top products in <name> category"; keep the name.
    const heroTitle = normalizeWhitespace($(".serp-hero__title").text());
    const categoryName = heroTitle
        .replace(/^Find top products in\s+/i, "")
        .replace(/\s+category$/i, "");
    const category = buildCategoryFromUrl(canonicalUrl, categoryName, {
        code: parseCategoryCodeFromHtml($),
        description: normalizeWhitespace($(".serp-hero__subtitle").text()) || undefined,
        totalResults: parseResultCount($(".serp-hero__results-title").text())
    });
    const pageNumber = getRequestedPageNumber(requestUrl);
    const items = [];
    $('[data-product-cards-list] > li').each((index, element) => {
        const record = parseProductCard($, $(element), pageNumber, index + 1, category);
        if (record) {
            items.push(record);
        }
    });
    return { category, items };
}
|
|
253
|
+
/**
 * Parses a LinkedIn product detail page.
 *
 * @param {string} html - Page markup.
 * @param {string} requestUrl - URL the page was fetched from; used when the page
 *   has no canonical link.
 * @returns {object} Product details: name, slug, URL, image, description,
 *   category, vendor, "learn more" link, intended roles, and customers.
 * @throws {Error} When the canonical URL is missing or lacks "/products/", when
 *   the category link is missing, or when a slug cannot be derived.
 */
export function parseLinkedInProductPage(html, requestUrl) {
    const $ = load(html);
    const canonicalHref = $('link[rel="canonical"]').attr("href");
    const canonicalUrl = toAbsoluteLinkedInUrl(canonicalHref ?? requestUrl);
    if (!(canonicalUrl && canonicalUrl.includes("/products/"))) {
        throw new Error("LinkedIn product page did not expose a canonical product URL.");
    }
    const productName = normalizeWhitespace($("h1.top-card-layout__title").first().text());

    // Category link.
    const categoryLink = $('[data-tracking-control-name="products_details_guest_product_category"]').first();
    const categoryUrl = toAbsoluteLinkedInUrl(categoryLink.attr("href"));
    const categoryName = normalizeWhitespace(categoryLink.text());
    if (!categoryUrl || categoryName.length === 0) {
        throw new Error("LinkedIn product page did not expose a product category.");
    }
    const category = buildCategoryFromUrl(categoryUrl, categoryName);

    // Vendor link.
    const vendorLink = $('[data-tracking-control-name="products_details_guest_organization_page"]').first();
    const vendorCompanyUrl = toAbsoluteLinkedInUrl(vendorLink.attr("href"));
    const vendorName = normalizeWhitespace(vendorLink.text());

    // Customers listed on the page; entries without a name are skipped.
    const usedBy = [];
    $(".customer__organization-name").each((_, element) => {
        const customerLink = $(element);
        const name = normalizeWhitespace(customerLink.text());
        if (name) {
            const container = customerLink.closest("li, .customer");
            usedBy.push({
                name,
                companyUrl: toAbsoluteLinkedInUrl(customerLink.attr("href")),
                logoUrl: getImageUrl(container.find("img").first())
            });
        }
    });

    // The meta description may start with "<product name> |"; drop that prefix.
    const metaDescription = normalizeWhitespace($('meta[name="description"]').attr("content"));
    const titlePrefix = `${productName} |`;
    let cleanedMetaDescription = metaDescription;
    if (metaDescription.startsWith(titlePrefix)) {
        cleanedMetaDescription = normalizeWhitespace(metaDescription.slice(titlePrefix.length));
    }
    const aboutParagraph = $(".about > p").not(".about__roles-title").first();
    const aboutDescription = normalizeWhitespace(aboutParagraph.text());
    const roleTexts = $(".about__roles-item")
        .map((_, element) => $(element).text())
        .get();
    const intendedRoles = uniqueNonEmptyText(roleTexts);

    const productSlug = getProductSlug(canonicalUrl);
    const imageUrl = getImageUrl($("img.top-card-layout__entity-image").first());
    const description = pickLongerText(aboutDescription, cleanedMetaDescription);
    let vendor;
    if (vendorName) {
        vendor = {
            name: vendorName,
            companyUrl: vendorCompanyUrl,
            handle: getCompanyHandle(vendorCompanyUrl)
        };
    }
    const learnMoreUrl = toAbsoluteUrl($(".top-card-layout__cta--secondary").attr("href"));
    return {
        productName,
        productSlug,
        productUrl: canonicalUrl,
        imageUrl,
        description,
        category,
        vendor,
        learnMoreUrl,
        intendedRoles,
        usedBy
    };
}
|
|
314
|
+
/**
 * Extracts the main product card from a LinkedIn company page.
 *
 * @param {string} html - Page markup.
 * @param {string} requestUrl - URL of the company page; returned (normalized
 *   when possible) as `companyUrl`.
 * @returns {{ productName: string, productUrl: string, companyUrl: string,
 *   category: object, description: (string | undefined) }}
 * @throws {Error} When the product link, category link, category name, or
 *   product name is missing, or the category slug cannot be derived.
 */
export function parseLinkedInCompanyPageForMainProduct(html, requestUrl) {
    const $ = load(html);
    const categoryLink = $('[data-tracking-control-name="organization_guest_main_product_card_category_link"]').first();
    const productLink = $('[data-tracking-control-name="organization_guest_main_product_card"]').first();
    const productUrl = toAbsoluteLinkedInUrl(productLink.attr("href"));
    const categoryUrl = toAbsoluteLinkedInUrl(categoryLink.attr("href"));
    const categoryName = normalizeWhitespace(categoryLink.text());
    // Title and description are read from the card that encloses the category link.
    const mainCard = categoryLink.closest(".base-main-card");
    const productName = normalizeWhitespace(mainCard.find(".base-main-card__title").first().text());
    const companyUrl = toAbsoluteLinkedInUrl(requestUrl) ?? requestUrl;
    const isComplete = Boolean(productUrl)
        && Boolean(categoryUrl)
        && categoryName.length > 0
        && productName.length > 0;
    if (!isComplete) {
        throw new Error("LinkedIn company page did not expose a main product card.");
    }
    const category = buildCategoryFromUrl(categoryUrl, categoryName);
    const description = normalizeWhitespace(mainCard.find(".base-main-card__description").text()) || undefined;
    return { productName, productUrl, companyUrl, category, description };
}
|
|
334
|
+
/**
 * Parses a LinkedIn product search results page.
 *
 * @param {string} html - Page markup.
 * @param {string} requestUrl - URL the page was fetched from; used to derive the
 *   page number.
 * @returns {{ query: (string | undefined), items: object[] }} The query text
 *   read from the hero title (undefined when empty) and the parsed cards.
 */
export function parseLinkedInProductSearchPage(html, requestUrl) {
    const $ = load(html);
    const heroTitle = normalizeWhitespace($(".serp-hero__title").text());
    // Drop the fixed lead-in and a double quote at either end of the title.
    const query = heroTitle
        .replace(/^Find top products in\s+/i, "")
        .replace(/^"|"$/g, "");
    const pageNumber = getRequestedPageNumber(requestUrl);
    const items = [];
    $('[data-product-cards-list] > li').each((index, element) => {
        const record = parseProductCard($, $(element), pageNumber, index + 1);
        if (record) {
            items.push(record);
        }
    });
    return { query: query || undefined, items };
}
|
|
346
|
+
/**
 * Builds a comparison key by lowercasing and removing every character that is
 * not an ASCII letter or digit.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The key, possibly empty.
 */
function normalizeQueryKey(value) {
    const lowered = value.toLowerCase();
    return lowered.replace(/[^a-z0-9]+/g, "");
}
|
|
349
|
+
/**
 * Picks the search result that best matches the query.
 *
 * Each item is scored by comparing the normalized query against its product
 * name, product slug, vendor name, and vendor handle. The highest score wins;
 * the earliest item wins a tie, including when every score is 0.
 *
 * @param {object[]} items - Parsed search results.
 * @param {string} query - Search text; when it normalizes to "" the first item
 *   is returned.
 * @returns {object} The best-matching item.
 * @throws {Error} When `items` is empty.
 */
function pickBestSearchMatch(items, query) {
    // Fail with one descriptive error whatever the query is. Returning undefined
    // here would crash the caller later with an unrelated TypeError.
    if (items.length === 0) {
        throw new Error("LinkedIn product search did not return any products.");
    }
    const queryKey = normalizeQueryKey(query);
    if (queryKey.length === 0) {
        return items[0];
    }
    const scoreItem = (item) => {
        const candidates = [
            normalizeQueryKey(item.productName),
            normalizeQueryKey(item.productSlug),
            normalizeQueryKey(item.vendor?.name ?? ""),
            normalizeQueryKey(item.vendor?.handle ?? "")
        ];
        let score = 0;
        for (const candidate of candidates) {
            if (candidate.length === 0) {
                continue;
            }
            if (candidate === queryKey) {
                score += 100;
            } else if (candidate.startsWith(queryKey)) {
                score += 40;
            } else if (candidate.includes(queryKey)) {
                score += 20;
            }
            // Independent bonus when the query extends the candidate. It also
            // applies to exact matches.
            if (queryKey.startsWith(candidate) && candidate.length > 1) {
                score += 15;
            }
        }
        return score;
    };
    let best = items[0];
    let bestScore = scoreItem(best);
    for (const item of items.slice(1)) {
        const score = scoreItem(item);
        // Strict ">" keeps the earliest item on ties.
        if (score > bestScore) {
            best = item;
            bestScore = score;
        }
    }
    return best;
}
|
|
386
|
+
/**
 * Classifies user input as a company domain or one of the supported LinkedIn
 * URL kinds.
 *
 * Input without an http(s) scheme, and URLs on hosts other than linkedin.com,
 * are treated as a company domain whose first label becomes the search query.
 * LinkedIn URLs are classified by path, checked in this order: private product
 * search, public product search, category, product, company.
 *
 * @param {string | null | undefined} input - Raw user input.
 * @returns {object} A descriptor with `kind` ("domain", "private-category-code",
 *   "search-query", "category-url", "product-url", or "company-url"), the
 *   whitespace-normalized `input`, and kind-specific fields (`query`, `url`,
 *   `categoryCode`).
 * @throws {Error} When the input is empty, cannot be understood, or is a
 *   LinkedIn URL of an unsupported kind.
 */
function classifyLinkedInInput(input) {
    const trimmed = normalizeWhitespace(input);
    if (trimmed.length === 0) {
        throw new Error("A company domain or LinkedIn URL is required.");
    }
    const asDomain = () => {
        const domain = normalizeDomainInput(trimmed);
        if (domain.length === 0) {
            throw new Error("Could not understand the input. Pass a domain or LinkedIn URL.");
        }
        return {
            kind: "domain",
            input: trimmed,
            query: domain.split(".")[0] ?? domain
        };
    };
    const asSearchQuery = (query) => ({
        kind: "search-query",
        input: trimmed,
        url: `${LINKEDIN_BASE_URL}/products/search/?q=${encodeURIComponent(query)}`,
        query
    });
    const asPage = (kind) => ({
        kind,
        input: trimmed,
        url: toAbsoluteLinkedInUrl(trimmed) ?? trimmed
    });
    if (!/^https?:\/\//i.test(trimmed)) {
        return asDomain();
    }
    let parsed;
    try {
        parsed = new URL(trimmed);
    } catch (error) {
        // e.g. "http://" alone: report it like any other unusable input instead
        // of leaking the URL constructor's TypeError.
        throw new Error("Could not understand the input. Pass a domain or LinkedIn URL.", { cause: error });
    }
    if (!/(^|\.)linkedin\.com$/.test(parsed.hostname.toLowerCase())) {
        return asDomain();
    }
    const { pathname, searchParams } = parsed;
    if (pathname.includes("/search/results/products")) {
        const privateCategory = parsePrivateCategoryCode(searchParams.get("productCategory"));
        if (privateCategory) {
            return {
                kind: "private-category-code",
                input: trimmed,
                categoryCode: privateCategory
            };
        }
        // Accept "keywords" as well as "q", matching the public search branch below.
        const query = normalizeWhitespace(searchParams.get("q") ?? searchParams.get("keywords"));
        if (query) {
            return asSearchQuery(query);
        }
        throw new Error("LinkedIn private product search URLs need a category code or query.");
    }
    if (pathname.includes("/products/search")) {
        return asSearchQuery(normalizeWhitespace(searchParams.get("q") ?? searchParams.get("keywords")));
    }
    // "/products/categories/" must be tested before the broader "/products/".
    if (pathname.includes("/products/categories/")) {
        return asPage("category-url");
    }
    if (pathname.includes("/products/")) {
        return asPage("product-url");
    }
    if (pathname.includes("/company/")) {
        return asPage("company-url");
    }
    throw new Error("Pass a company domain, LinkedIn company page, LinkedIn product page, or LinkedIn category URL.");
}
|
|
465
|
+
/**
 * Extracts a category code from the "productCategory" query parameter value.
 *
 * A JSON array yields its first entry (as normalized text). Any other value
 * yields the first run of three or more digits.
 *
 * @param {string | null | undefined} value - Raw parameter value.
 * @returns {string | undefined} The category code, or undefined when none is found.
 */
function parsePrivateCategoryCode(value) {
    const text = normalizeWhitespace(value);
    if (text.length === 0) {
        return undefined;
    }
    let decoded;
    try {
        decoded = JSON.parse(text);
    } catch {
        // Not JSON: handled by the digit search below.
        decoded = undefined;
    }
    if (Array.isArray(decoded)) {
        const first = normalizeWhitespace(String(decoded[0] ?? ""));
        return first || undefined;
    }
    return /(\d{3,})/.exec(text)?.[1];
}
|
|
483
|
+
/**
 * Reads the "page" query parameter from a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {number} The page number, or 1 when the URL cannot be parsed or the
 *   parameter is absent or not a positive integer.
 */
function getRequestedPageNumber(url) {
    let pageParam;
    try {
        pageParam = new URL(url).searchParams.get("page");
    } catch {
        return 1;
    }
    const page = Number(pageParam ?? "1");
    if (!Number.isInteger(page) || page <= 0) {
        return 1;
    }
    return page;
}
|
|
493
|
+
/**
 * Builds the URL of a given page of a category listing.
 *
 * @param {string} categoryUrl - Absolute category URL.
 * @param {number} pageNumber - Page to address; values of 1 or less remove the
 *   "page" parameter.
 * @returns {string} The serialized URL.
 */
function buildCategoryPageUrl(categoryUrl, pageNumber) {
    const target = new URL(categoryUrl);
    const isFirstPage = pageNumber <= 1;
    if (isFirstPage) {
        target.searchParams.delete("page");
        return target.toString();
    }
    target.searchParams.set("page", String(pageNumber));
    return target.toString();
}
|
|
503
|
+
/**
 * Creates a function that downloads a page as text using browser-like headers.
 *
 * @param {typeof fetch} [fetchImpl=fetch] - Fetch implementation to use.
 * @returns {(url: string) => Promise<string>} Fetcher that resolves with the
 *   response body and rejects with an Error when the response status is not OK.
 */
export function createLinkedInHtmlFetcher(fetchImpl = fetch) {
    const fetchHtml = async (url) => {
        const requestInit = {
            headers: {
                "User-Agent": DEFAULT_USER_AGENT,
                Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
            redirect: "follow"
        };
        const response = await fetchImpl(url, requestInit);
        if (response.ok) {
            return await response.text();
        }
        throw new Error(`LinkedIn request failed (${response.status}) for ${url}`);
    };
    return fetchHtml;
}
|
|
518
|
+
/**
 * Finds the category whose page exposes the given code.
 *
 * Downloads the category browse page, collects every distinct category link on
 * it, then fetches those pages one after another until a code matches.
 *
 * @param {string} categoryCode - Code to look for.
 * @param {(url: string) => Promise<string>} fetchHtml - Page downloader.
 * @returns {Promise<object>} The matching category record.
 * @throws {Error} When no listed category has that code. Errors from fetching
 *   or parsing a page are passed through.
 */
async function findCategoryByCode(categoryCode, fetchHtml) {
    const browseHtml = await fetchHtml(`${LINKEDIN_BASE_URL}/products/categories/browse`);
    const $ = load(browseHtml);
    const categoryUrls = new Set();
    $('a[href*="/products/categories/"]').each((_, anchor) => {
        const absolute = toAbsoluteLinkedInUrl($(anchor).attr("href"));
        if (absolute !== undefined && absolute.includes("/products/categories/")) {
            categoryUrls.add(absolute);
        }
    });
    for (const categoryUrl of categoryUrls) {
        const pageHtml = await fetchHtml(categoryUrl);
        const { category } = parseLinkedInCategoryPage(pageHtml, categoryUrl);
        if (category.code === categoryCode) {
            return category;
        }
    }
    throw new Error(`Could not resolve LinkedIn product category code ${categoryCode}.`);
}
|
|
537
|
+
/**
 * Resolves user input to a product category, plus the matched product when one
 * is involved.
 *
 * Category, product, and company URLs are fetched and parsed directly. A
 * private category code is looked up via the category browse page. Domains and
 * search queries run a product search, pick the best match, and read the
 * category from that product's page.
 *
 * @param {string} input - Raw user input (domain or LinkedIn URL).
 * @param {(url: string) => Promise<string>} fetchHtml - Page downloader.
 * @returns {Promise<object>} Descriptor with `input`, `kind`, `category`, and,
 *   depending on the kind, `productUrl`, `matchedProductName`, `companyUrl`,
 *   and `query`.
 */
export async function resolveLinkedInProductSource(input, fetchHtml) {
    const resolvedInput = classifyLinkedInInput(input);
    const { kind } = resolvedInput;
    switch (kind) {
        case "category-url": {
            const html = await fetchHtml(resolvedInput.url);
            const page = parseLinkedInCategoryPage(html, resolvedInput.url);
            return { input, kind, category: page.category };
        }
        case "product-url": {
            const html = await fetchHtml(resolvedInput.url);
            const product = parseLinkedInProductPage(html, resolvedInput.url);
            return {
                input,
                kind,
                productUrl: product.productUrl,
                matchedProductName: product.productName,
                category: product.category
            };
        }
        case "company-url": {
            const html = await fetchHtml(resolvedInput.url);
            const reference = parseLinkedInCompanyPageForMainProduct(html, resolvedInput.url);
            return {
                input,
                kind,
                companyUrl: reference.companyUrl,
                productUrl: reference.productUrl,
                matchedProductName: reference.productName,
                category: reference.category
            };
        }
        case "private-category-code": {
            const found = await findCategoryByCode(resolvedInput.categoryCode, fetchHtml);
            return {
                input,
                kind,
                category: { ...found, code: resolvedInput.categoryCode }
            };
        }
        default:
            break;
    }
    // Remaining kinds ("domain", "search-query") go through a product search.
    const searchQuery = resolvedInput.query ?? "";
    let searchUrl = resolvedInput.url;
    if (kind === "domain") {
        searchUrl = `${LINKEDIN_BASE_URL}/products/search/?q=${encodeURIComponent(searchQuery)}`;
    }
    const searchHtml = await fetchHtml(searchUrl);
    const search = parseLinkedInProductSearchPage(searchHtml, searchUrl);
    const matched = pickBestSearchMatch(search.items, searchQuery);
    const productHtml = await fetchHtml(matched.productUrl);
    const product = parseLinkedInProductPage(productHtml, matched.productUrl);
    return {
        input,
        kind,
        query: searchQuery,
        productUrl: product.productUrl,
        matchedProductName: product.productName,
        category: product.category
    };
}
|
|
600
|
+
/**
 * Merges product-page details into a listing record without mutating either.
 *
 * The record's own image, vendor, and category fields win when present. The
 * longer description wins (the record's on a tie). The detail page wins for the
 * "learn more" URL and for non-empty role and customer lists.
 *
 * @param {object} record - Record built from a listing card or product page.
 * @param {object} detail - Parsed product page for the same product.
 * @returns {object} A new record with `rawPayload.detailSource` set to "product-page".
 */
function mergeDetailIntoRecord(record, detail) {
    const detailDescriptionLength = detail.description?.length ?? 0;
    const keepRecordDescription = Boolean(record.description)
        && record.description.length >= detailDescriptionLength;
    const description = keepRecordDescription ? record.description : detail.description;
    const category = {
        ...record.category,
        code: record.category.code ?? detail.category.code,
        description: record.category.description ?? detail.category.description,
        totalResults: record.category.totalResults ?? detail.category.totalResults
    };
    const intendedRoles = detail.intendedRoles.length > 0 ? detail.intendedRoles : record.intendedRoles;
    const usedBy = detail.usedBy.length > 0 ? detail.usedBy : record.usedBy;
    const rawPayload = { ...record.rawPayload, detailSource: "product-page" };
    return {
        ...record,
        imageUrl: record.imageUrl ?? detail.imageUrl,
        description,
        vendor: record.vendor ?? detail.vendor,
        category,
        learnMoreUrl: detail.learnMoreUrl ?? record.learnMoreUrl,
        intendedRoles,
        usedBy,
        rawPayload
    };
}
|
|
624
|
+
/**
 * Maps an array through an async function with a bounded number of mapper calls
 * in flight, keeping results in input order.
 *
 * @param {Array} input - Items to process.
 * @param {number} concurrency - Maximum simultaneous mapper calls. It is clamped
 *   to the range 1..input.length; a value that is not a number counts as 1.
 * @param {(item: any, index: number) => Promise<any>} mapper - Async transform.
 * @returns {Promise<Array>} Results, with index i holding the mapped input[i].
 *   Rejects as soon as any mapper call rejects.
 */
async function mapWithConcurrency(input, concurrency, mapper) {
    if (input.length === 0) {
        return [];
    }
    // NaN (e.g. from undefined or an unparsable CLI flag) would propagate
    // through Math.min/Math.max and start zero workers, so fall back to 1.
    const numeric = Number(concurrency);
    const requested = Number.isNaN(numeric) ? 1 : Math.floor(numeric);
    const safeConcurrency = Math.max(1, Math.min(requested, input.length));
    const results = new Array(input.length);
    let nextIndex = 0;
    // Each worker claims the next unprocessed index until none remain. Claiming
    // is synchronous, so no index is taken twice.
    const worker = async () => {
        while (true) {
            const current = nextIndex;
            nextIndex += 1;
            if (current >= input.length) {
                return;
            }
            results[current] = await mapper(input[current], current);
        }
    };
    await Promise.all(Array.from({ length: safeConcurrency }, () => worker()));
    return results;
}
|
|
644
|
+
/**
 * Crawls the product category that the input resolves to and returns its
 * products.
 *
 * Steps: resolve the input to a category; when enrichment is on and the input
 * resolved to a product, seed the result with that product; walk the category
 * listing pages until the page budget or item limit is reached, a page is
 * empty, or a page adds nothing new; finally, when enrichment is on, fetch each
 * product page and merge its details into the record.
 *
 * @param {object} options - Crawl options.
 * @param {string} options.input - Company domain or LinkedIn URL.
 * @param {(url: string) => Promise<string>} [options.fetchHtml] - Page
 *   downloader; defaults to createLinkedInHtmlFetcher().
 * @param {number} [options.maxPages=25] - Maximum listing pages to fetch (at least 1).
 * @param {number} [options.limit] - Maximum number of products (at least 1).
 * @param {boolean} [options.enrichDetails] - Pass false to skip product-page fetches.
 * @param {number} [options.detailConcurrency=4] - Parallel product-page fetches.
 * @returns {Promise<{ source: object, items: object[], totalPagesFetched: number }>}
 */
export async function crawlLinkedInProductCategory(options) {
    const fetchHtml = options.fetchHtml ?? createLinkedInHtmlFetcher();
    const source = await resolveLinkedInProductSource(options.input, fetchHtml);
    const shouldEnrich = options.enrichDetails !== false;
    const maxPages = Math.max(1, options.maxPages ?? 25);
    const limit = options.limit === undefined ? undefined : Math.max(1, options.limit);
    // Keyed by product URL so a product seen on several pages is kept once.
    const itemsByUrl = new Map();
    const hasReachedLimit = () => limit !== undefined && itemsByUrl.size >= limit;

    if (shouldEnrich && source.productUrl) {
        const seedHtml = await fetchHtml(source.productUrl);
        const seedDetail = parseLinkedInProductPage(seedHtml, source.productUrl);
        const seedRecord = toRecordFromProductPage(seedDetail, {
            rawPayload: {
                source: "resolved-product-page",
                resolvedFromInput: true
            }
        });
        itemsByUrl.set(seedDetail.productUrl, seedRecord);
    }

    let totalPagesFetched = 0;
    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber += 1) {
        const pageUrl = buildCategoryPageUrl(source.category.url, pageNumber);
        const pageHtml = await fetchHtml(pageUrl);
        const page = parseLinkedInCategoryPage(pageHtml, pageUrl);
        totalPagesFetched = pageNumber;
        // Fill in category metadata that the resolved source is still missing.
        for (const field of ["code", "description", "totalResults"]) {
            if (!source.category[field] && page.category[field]) {
                source.category[field] = page.category[field];
            }
        }
        if (hasReachedLimit() || page.items.length === 0) {
            break;
        }
        let newItemsOnPage = 0;
        for (const item of page.items) {
            if (!itemsByUrl.has(item.productUrl)) {
                itemsByUrl.set(item.productUrl, item);
                newItemsOnPage += 1;
            }
            if (hasReachedLimit()) {
                break;
            }
        }
        // A page that added nothing new ends the crawl.
        if (hasReachedLimit() || newItemsOnPage === 0) {
            break;
        }
    }

    const collected = [...itemsByUrl.values()];
    const selected = limit === undefined ? collected : collected.slice(0, limit);
    if (!shouldEnrich || selected.length === 0) {
        return { source, items: selected, totalPagesFetched };
    }
    const enrichItem = async (item) => {
        const detailHtml = await fetchHtml(item.productUrl);
        const detail = parseLinkedInProductPage(detailHtml, item.productUrl);
        return mergeDetailIntoRecord(item, detail);
    };
    const items = await mapWithConcurrency(selected, options.detailConcurrency ?? 4, enrichItem);
    return { source, items, totalPagesFetched };
}
|