autokap 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/skill/SKILL.md +3 -0
- package/assets/skill/references/STANDARDS.md +236 -0
- package/dist/cli.js +54 -2
- package/dist/crm/email-fallback.d.ts +16 -0
- package/dist/crm/email-fallback.js +217 -0
- package/dist/crm/run-campaign.d.ts +28 -0
- package/dist/crm/run-campaign.js +405 -0
- package/dist/crm/scrape-betalist.d.ts +20 -0
- package/dist/crm/scrape-betalist.js +194 -0
- package/dist/crm/scrape-landing.d.ts +24 -0
- package/dist/crm/scrape-landing.js +240 -0
- package/dist/crm/storage-upload.d.ts +14 -0
- package/dist/crm/storage-upload.js +40 -0
- package/dist/mockup.d.ts +7 -0
- package/dist/mockup.js +52 -6
- package/dist/types.d.ts +1 -1
- package/package.json +3 -2
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import { chromium } from 'playwright';
|
|
3
|
+
import { findEmail } from './email-fallback.js';
|
|
4
|
+
import { scrapeBetaListLaunches } from './scrape-betalist.js';
|
|
5
|
+
import { scrapeLanding } from './scrape-landing.js';
|
|
6
|
+
import { LandingShotEndpointMissingError, uploadLandingShot } from './storage-upload.js';
|
|
7
|
+
/** User-Agent string handed to the launch scraper and the landing-page scraper. */
const CRAWLER_UA = 'AutoKap-Crawler/1.0 (+https://autokap.app/crawler)';
|
|
8
|
+
// TODO(parent): read CRM vision model from a config slot after Phase B settles.
|
|
9
|
+
/** Model identifier sent as `model` in the vision-analysis request. */
const DEFAULT_VISION_MODEL = 'openai/gpt-5.2-mini';
|
|
10
|
+
/**
 * Runs one CRM prospecting campaign: posts `run_start`, scrapes the launch
 * list, processes every launch (two at a time) and reports progress and the
 * final status back to the API.
 *
 * @param opts Run settings read here: `apiBaseUrl` (trailing slashes are
 *   stripped), `runId`, `runToken`, `lookbackDays` and `logger`.
 * @param deps Optional overrides: `fetch`, `scrapeLaunches`, `launchBrowser`,
 *   `scrapeLanding`, `findEmail`. Each falls back to the real implementation.
 * @returns Counters `{ scraped, inserted, disqualified, skipped }`.
 * @throws Re-throws any failure after posting an `error` checkpoint and a
 *   `failed` finish.
 */
export async function runCampaign(opts, deps = {}) {
    const { logger } = opts;
    const context = {
        apiBaseUrl: opts.apiBaseUrl.replace(/\/+$/, ''),
        runId: opts.runId,
        runToken: opts.runToken,
        logger,
        fetchImpl: deps.fetch ?? globalThis.fetch,
    };
    // Stays null until a browser is actually launched, so `finally` knows whether to close one.
    let browser = null;
    try {
        await postCheckpoint(context, { type: 'run_start', message: 'CLI booted, fetching launches' });
        const fetchLaunches = deps.scrapeLaunches ?? scrapeBetaListLaunches;
        const launches = await fetchLaunches({
            lookbackDays: opts.lookbackDays,
            userAgent: CRAWLER_UA,
            logger,
        });
        const total = launches.length;
        if (total === 0) {
            // Nothing to process: finish without ever starting a browser.
            await postCheckpoint(context, { type: 'page_fetched', scraped_count: 0 });
            await postFinish(context, 'completed');
            return { scraped: 0, inserted: 0, disqualified: 0, skipped: 0 };
        }
        await postCheckpoint(context, {
            type: 'page_fetched',
            scraped_count: total,
            message: `Found ${total} launches`,
        });
        if (deps.launchBrowser) {
            browser = await deps.launchBrowser();
        }
        else {
            const linuxFlags = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'];
            browser = await chromium.launch({
                args: process.platform === 'linux' ? linuxFlags : [],
                headless: true,
            });
        }
        const counters = { inserted: 0, disqualified: 0, skipped: 0 };
        let lastProgressAt = 0;
        // Progress checkpoints are throttled to one per 5 s unless forced.
        const sendProgress = async (force = false) => {
            const now = Date.now();
            const tooSoon = now - lastProgressAt < 5_000;
            if (tooSoon && !force)
                return;
            lastProgressAt = now;
            await postCheckpoint(context, {
                type: 'progress',
                scraped_count: total,
                inserted_count: counters.inserted,
                disqualified_count: counters.disqualified,
                skipped_count: counters.skipped,
            });
        };
        const handleLaunch = async (launch) => {
            await processLaunch({
                launch,
                browser,
                counters,
                context,
                scrapeLandingFn: deps.scrapeLanding ?? scrapeLanding,
                findEmailFn: deps.findEmail ?? findEmail,
            });
            await sendProgress();
        };
        await processWithConcurrency(launches, 2, handleLaunch);
        await sendProgress(true);
        await postFinish(context, 'completed');
        return { scraped: total, ...counters };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        await postCheckpoint(context, { type: 'error', errorMessage: message, message });
        await postFinish(context, 'failed', message);
        throw error;
    }
    finally {
        if (browser) {
            await browser.close().catch((closeError) => {
                logger.warn(`[crm-run] Browser close failed: ${closeError.message}`);
            });
        }
    }
}
|
|
96
|
+
/**
 * Handles a single launch: disqualifies it, records it as scrape-blocked, or
 * scrapes + analyzes it and posts the resulting prospect. Mutates `counters`
 * according to the prospect endpoint's HTTP status and posts a checkpoint.
 *
 * @param args `{ launch, browser, counters, context, scrapeLandingFn, findEmailFn }`.
 */
async function processLaunch(args) {
    const { launch, counters, context, scrapeLandingFn, findEmailFn } = args;
    const { logger } = context;
    const lookupContact = () => findEmailFn({
        betaListLaunchUrl: launch.sourceUrl,
        productUrl: launch.productUrl,
        logger,
    });

    // 1) Launches that are obviously not a fit are stored as closed and never scraped.
    const rejection = disqualifyReason(launch);
    if (rejection) {
        const reply = await postProspect(context, {
            ...baseProspect(launch),
            status: 'closed_no_fit',
            signals: { is_web_app: false },
            free_signals: [{ label: rejection, weight: 0 }],
        });
        switch (reply?.status) {
            case 201:
                counters.disqualified += 1;
                break;
            case 200:
                counters.skipped += 1;
                break;
            default:
                break;
        }
        await postCheckpoint(context, {
            type: 'launch_disqualified',
            disqualified_count: counters.disqualified,
            skipped_count: counters.skipped,
            message: `${launch.productName} disqualified: ${rejection}`,
        });
        return;
    }

    const scrape = await scrapeLandingFn({
        productUrl: launch.productUrl,
        userAgent: CRAWLER_UA,
        logger,
        browser: args.browser,
    });

    // 2) No screenshot (or robots.txt refusal): keep the prospect, flag the scrape as blocked.
    const blocked = scrape.error === 'robots_disallowed' || scrape.screenshotPath === null;
    if (blocked) {
        const contact = await lookupContact();
        const reply = await postProspect(context, {
            ...baseProspect(launch),
            creator_email: contact.email ?? launch.creatorEmail,
            creator_handle: contact.handle ?? launch.creatorHandle,
            creator_lang: contact.lang ?? launch.creatorLang,
            status: 'to_contact',
            signals: { is_web_app: scrape.isWebApp, scrape_blocked: true },
            free_signals: [{ label: 'scrape_blocked', weight: 0 }],
        });
        countInsertedOrSkipped(reply, counters);
        await postCheckpoint(context, {
            type: 'launch_inserted',
            inserted_count: counters.inserted,
            skipped_count: counters.skipped,
            message: `${launch.productName} inserted with scrape blocked`,
        });
        return;
    }

    // 3) Full path: vision analysis first, then the contact lookup, then the prospect POST.
    const vision = await analyzeWithOptionalVision(context, launch, scrape);
    const contact = await lookupContact();
    const email = contact.email ?? launch.creatorEmail;
    const reply = await postProspect(context, {
        ...baseProspect(launch),
        creator_email: email,
        creator_handle: contact.handle ?? launch.creatorHandle,
        creator_lang: contact.lang ?? vision?.creator_lang_guess ?? launch.creatorLang,
        status: email ? 'to_contact' : 'email_missing',
        signals: mergeSignals(scrape, vision),
        free_signals: sanitizeFreeSignals(vision?.free_signals ?? []),
    });
    countInsertedOrSkipped(reply, counters);
    await postCheckpoint(context, {
        type: 'launch_inserted',
        inserted_count: counters.inserted,
        skipped_count: counters.skipped,
        message: `${launch.productName} inserted`,
    });
}
|
|
175
|
+
/**
 * Decides whether a launch should be rejected before any scraping happens.
 *
 * @param launch Launch with `productUrl`, `productName` and optional `tagline`.
 * @returns `'no_product_url'` when there is no product URL, `'not_web_app'`
 *   when name/tagline contain a mobile-only or hardware marker, else `null`.
 */
function disqualifyReason(launch) {
    if (!launch.productUrl) {
        return 'no_product_url';
    }
    const nonWebMarkers = /\b(iOS only|App Store only|iPhone only|iPad only|Android only|Google Play only|hardware|physical product)\b/i;
    const description = `${launch.productName} ${launch.tagline ?? ''}`;
    return nonWebMarkers.test(description) ? 'not_web_app' : null;
}
|
|
184
|
+
/**
 * Maps a scraped launch (camelCase) to the snake_case prospect fields shared
 * by every prospect POST.
 *
 * @param launch Scraped launch record.
 * @returns Object with `source_url`, `product_name`, `product_url`,
 *   `creator_name`, `creator_email`, `creator_lang`, `creator_handle`.
 */
function baseProspect(launch) {
    const { sourceUrl, productName, productUrl, creatorName, creatorEmail, creatorLang, creatorHandle } = launch;
    return {
        source_url: sourceUrl,
        product_name: productName,
        product_url: productUrl,
        creator_name: creatorName,
        creator_email: creatorEmail,
        creator_lang: creatorLang,
        creator_handle: creatorHandle,
    };
}
|
|
195
|
+
/**
 * Updates the run counters from a prospect POST result: HTTP 201 counts as
 * inserted, HTTP 200 as skipped; anything else (including `null`) is ignored.
 *
 * @param response Result of `postProspect` (`{ status, json }` or `null`).
 * @param counters Mutable counters object with `inserted` and `skipped`.
 */
function countInsertedOrSkipped(response, counters) {
    switch (response?.status) {
        case 201:
            counters.inserted += 1;
            break;
        case 200:
            counters.skipped += 1;
            break;
        default:
            break;
    }
}
|
|
201
|
+
/**
 * Best-effort vision analysis of a landing-page screenshot: reads the PNG,
 * uploads it, then asks the vision model to analyze the signed URL. Any
 * failure is logged as a warning and yields `null` instead of throwing.
 *
 * @param context Run context (`runId`, `apiBaseUrl`, `runToken`, `fetchImpl`, `logger`).
 * @param launch Launch being analyzed (`sourceUrl` is used for upload and logs).
 * @param scrape Landing scrape result (`screenshotPath`, `rawText`).
 * @returns The validated vision analysis, or `null` when skipped or failed.
 */
async function analyzeWithOptionalVision(context, launch, scrape) {
    const { screenshotPath } = scrape;
    if (!screenshotPath) {
        return null;
    }
    try {
        const pngBuffer = await fs.readFile(screenshotPath);
        const upload = await uploadLandingShot({
            runId: context.runId,
            sourceUrl: launch.sourceUrl,
            pngBuffer,
            apiBaseUrl: context.apiBaseUrl,
            runToken: context.runToken,
        });
        const analysis = await analyzeLandingWithVision({
            signedUrl: upload.signedUrl,
            rawText: scrape.rawText,
            model: DEFAULT_VISION_MODEL,
            fetchImpl: context.fetchImpl,
        });
        return analysis;
    }
    catch (error) {
        // A missing upload endpoint gets its own message; either way vision is simply skipped.
        const endpointMissing = error instanceof LandingShotEndpointMissingError || error.message === 'LANDING_SHOT_ENDPOINT_MISSING';
        const note = endpointMissing
            ? '[crm-run] Landing-shot endpoint missing; vision skipped'
            : `[crm-run] Vision analysis skipped for ${launch.sourceUrl}: ${error.message}`;
        context.logger.warn(note);
        return null;
    }
}
|
|
229
|
+
function mergeSignals(scrape, vision) {
|
|
230
|
+
return {
|
|
231
|
+
n_screenshots: clampInt(vision?.n_screenshots ?? scrape.htmlSignals.n_screenshots_html ?? 0, 0, 50),
|
|
232
|
+
has_video: vision?.has_video ?? scrape.htmlSignals.has_video_html ?? false,
|
|
233
|
+
has_docs: vision?.has_docs ?? scrape.htmlSignals.has_docs_html ?? false,
|
|
234
|
+
has_dashboard: vision?.has_dashboard ?? scrape.htmlSignals.has_dashboard_html ?? false,
|
|
235
|
+
beta_list_quality: vision?.beta_list_quality_inferred,
|
|
236
|
+
is_web_app: vision?.is_web_app ?? scrape.isWebApp,
|
|
237
|
+
};
|
|
238
|
+
}
|
|
239
|
+
function sanitizeFreeSignals(value) {
|
|
240
|
+
return value.slice(0, 8).map((signal) => ({
|
|
241
|
+
label: signal.label.slice(0, 120),
|
|
242
|
+
weight: clampInt(signal.weight, -5, 5),
|
|
243
|
+
})).filter((signal) => signal.label.trim().length > 0);
|
|
244
|
+
}
|
|
245
|
+
/**
 * Truncates `value` toward zero and clamps it into `[min, max]`.
 *
 * Non-numbers and NaN fall back to `min`. Infinite values are clamped like
 * any other out-of-range number, so `+Infinity` yields `max` (it previously
 * collapsed to `min`).
 *
 * @param value Candidate number (may be any type).
 * @param min Lower bound, also the fallback for invalid input.
 * @param max Upper bound.
 * @returns An integer within `[min, max]`.
 */
function clampInt(value, min, max) {
    if (typeof value !== 'number' || Number.isNaN(value)) {
        return min;
    }
    return Math.max(min, Math.min(max, Math.trunc(value)));
}
|
|
249
|
+
async function postCheckpoint(context, body) {
|
|
250
|
+
try {
|
|
251
|
+
const response = await context.fetchImpl(`${context.apiBaseUrl}/api/cli/crm/runs/${context.runId}/checkpoint`, {
|
|
252
|
+
method: 'POST',
|
|
253
|
+
headers: jsonAuthHeaders(context.runToken),
|
|
254
|
+
body: JSON.stringify(body),
|
|
255
|
+
signal: AbortSignal.timeout(15_000),
|
|
256
|
+
});
|
|
257
|
+
if (!response.ok) {
|
|
258
|
+
const responseBody = await response.text().catch(() => response.statusText);
|
|
259
|
+
context.logger.warn(`[crm-run] checkpoint POST failed: status=${response.status} body=${responseBody.slice(0, 300)}`);
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
catch (error) {
|
|
263
|
+
context.logger.warn(`[crm-run] checkpoint POST errored: ${error.message}`);
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
async function postProspect(context, body) {
|
|
267
|
+
try {
|
|
268
|
+
const response = await context.fetchImpl(`${context.apiBaseUrl}/api/cli/crm/runs/${context.runId}/prospect`, {
|
|
269
|
+
method: 'POST',
|
|
270
|
+
headers: jsonAuthHeaders(context.runToken),
|
|
271
|
+
body: JSON.stringify(body),
|
|
272
|
+
signal: AbortSignal.timeout(15_000),
|
|
273
|
+
});
|
|
274
|
+
const text = await response.text().catch(() => '');
|
|
275
|
+
const json = text ? JSON.parse(text) : null;
|
|
276
|
+
if (!response.ok) {
|
|
277
|
+
context.logger.warn(`[crm-run] prospect POST failed: status=${response.status} body=${text.slice(0, 500)}`);
|
|
278
|
+
return null;
|
|
279
|
+
}
|
|
280
|
+
return { status: response.status, json };
|
|
281
|
+
}
|
|
282
|
+
catch (error) {
|
|
283
|
+
context.logger.warn(`[crm-run] prospect POST errored: ${error.message}`);
|
|
284
|
+
return null;
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
async function postFinish(context, status, errorMessage) {
|
|
288
|
+
try {
|
|
289
|
+
const response = await context.fetchImpl(`${context.apiBaseUrl}/api/cli/crm/runs/${context.runId}/finish`, {
|
|
290
|
+
method: 'POST',
|
|
291
|
+
headers: jsonAuthHeaders(context.runToken),
|
|
292
|
+
body: JSON.stringify(errorMessage ? { status, errorMessage } : { status }),
|
|
293
|
+
signal: AbortSignal.timeout(15_000),
|
|
294
|
+
});
|
|
295
|
+
if (!response.ok) {
|
|
296
|
+
const body = await response.text().catch(() => response.statusText);
|
|
297
|
+
context.logger.warn(`[crm-run] finish POST failed: status=${response.status} body=${body.slice(0, 300)}`);
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
catch (error) {
|
|
301
|
+
context.logger.warn(`[crm-run] finish POST errored: ${error.message}`);
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
/**
 * Builds the headers used by every run-scoped API call: bearer auth plus a
 * JSON content type.
 *
 * @param runToken Token placed after `Bearer ` in the Authorization header.
 * @returns Header object with `Authorization` and `Content-Type`.
 */
function jsonAuthHeaders(runToken) {
    const bearer = `Bearer ${runToken}`;
    return { Authorization: bearer, 'Content-Type': 'application/json' };
}
|
|
310
|
+
async function processWithConcurrency(items, concurrency, worker) {
|
|
311
|
+
let nextIndex = 0;
|
|
312
|
+
const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
313
|
+
while (nextIndex < items.length) {
|
|
314
|
+
const item = items[nextIndex];
|
|
315
|
+
nextIndex += 1;
|
|
316
|
+
if (item !== undefined)
|
|
317
|
+
await worker(item);
|
|
318
|
+
}
|
|
319
|
+
});
|
|
320
|
+
await Promise.all(workers);
|
|
321
|
+
}
|
|
322
|
+
async function analyzeLandingWithVision(args) {
|
|
323
|
+
const apiKey = process.env.OPENROUTER_API_KEY;
|
|
324
|
+
if (!apiKey)
|
|
325
|
+
throw new Error('OPENROUTER_API_KEY missing');
|
|
326
|
+
const fetchImpl = args.fetchImpl ?? globalThis.fetch;
|
|
327
|
+
const response = await fetchImpl('https://openrouter.ai/api/v1/chat/completions', {
|
|
328
|
+
method: 'POST',
|
|
329
|
+
headers: {
|
|
330
|
+
Authorization: `Bearer ${apiKey}`,
|
|
331
|
+
'Content-Type': 'application/json',
|
|
332
|
+
'HTTP-Referer': 'https://autokap.app',
|
|
333
|
+
'X-Title': 'AutoKap CRM Scraper',
|
|
334
|
+
},
|
|
335
|
+
body: JSON.stringify({
|
|
336
|
+
model: args.model,
|
|
337
|
+
messages: [
|
|
338
|
+
{
|
|
339
|
+
role: 'system',
|
|
340
|
+
content: 'Analyze a SaaS product landing page screenshot and text. Return one JSON object with EXACT keys: n_screenshots (int 0..50, count product-screenshot-like images visible), has_video (bool, demo or product video), has_docs (bool, link to docs/api docs visible), has_dashboard (bool, mentions or shows a dashboard/app interface), is_web_app (bool, false if iOS/Android/hardware-only), beta_list_quality_inferred ("clean" | "sloppy", based on visual polish), free_signals (array of { label: string<=120, weight: int -5..5 } for notable strengths or weaknesses, max 6), creator_lang_guess ("fr" | "en" | null). Be terse and respond ONLY with JSON.',
|
|
341
|
+
},
|
|
342
|
+
{
|
|
343
|
+
role: 'user',
|
|
344
|
+
content: [
|
|
345
|
+
{ type: 'text', text: args.rawText.slice(0, 4000) },
|
|
346
|
+
{ type: 'image_url', image_url: { url: args.signedUrl } },
|
|
347
|
+
],
|
|
348
|
+
},
|
|
349
|
+
],
|
|
350
|
+
response_format: { type: 'json_object' },
|
|
351
|
+
temperature: 0,
|
|
352
|
+
}),
|
|
353
|
+
signal: AbortSignal.timeout(60_000),
|
|
354
|
+
});
|
|
355
|
+
if (!response.ok) {
|
|
356
|
+
const body = await response.text().catch(() => response.statusText);
|
|
357
|
+
throw new Error(`OpenRouter HTTP ${response.status}: ${body.slice(0, 300)}`);
|
|
358
|
+
}
|
|
359
|
+
const json = await response.json().catch(() => null);
|
|
360
|
+
const content = json?.choices?.[0]?.message?.content;
|
|
361
|
+
if (typeof content !== 'string')
|
|
362
|
+
return null;
|
|
363
|
+
let parsed;
|
|
364
|
+
try {
|
|
365
|
+
parsed = JSON.parse(content);
|
|
366
|
+
}
|
|
367
|
+
catch {
|
|
368
|
+
return null;
|
|
369
|
+
}
|
|
370
|
+
return validateVisionAnalysis(parsed);
|
|
371
|
+
}
|
|
372
|
+
function validateVisionAnalysis(value) {
|
|
373
|
+
if (!value || typeof value !== 'object')
|
|
374
|
+
return null;
|
|
375
|
+
const input = value;
|
|
376
|
+
const output = {};
|
|
377
|
+
if (typeof input.n_screenshots === 'number')
|
|
378
|
+
output.n_screenshots = clampInt(input.n_screenshots, 0, 50);
|
|
379
|
+
if (typeof input.has_video === 'boolean')
|
|
380
|
+
output.has_video = input.has_video;
|
|
381
|
+
if (typeof input.has_docs === 'boolean')
|
|
382
|
+
output.has_docs = input.has_docs;
|
|
383
|
+
if (typeof input.has_dashboard === 'boolean')
|
|
384
|
+
output.has_dashboard = input.has_dashboard;
|
|
385
|
+
if (typeof input.is_web_app === 'boolean')
|
|
386
|
+
output.is_web_app = input.is_web_app;
|
|
387
|
+
if (input.beta_list_quality_inferred === 'clean' || input.beta_list_quality_inferred === 'sloppy') {
|
|
388
|
+
output.beta_list_quality_inferred = input.beta_list_quality_inferred;
|
|
389
|
+
}
|
|
390
|
+
if (input.creator_lang_guess === 'fr' || input.creator_lang_guess === 'en' || input.creator_lang_guess === null) {
|
|
391
|
+
output.creator_lang_guess = input.creator_lang_guess;
|
|
392
|
+
}
|
|
393
|
+
if (Array.isArray(input.free_signals)) {
|
|
394
|
+
output.free_signals = sanitizeFreeSignals(input.free_signals.flatMap((item) => {
|
|
395
|
+
if (!item || typeof item !== 'object')
|
|
396
|
+
return [];
|
|
397
|
+
const signal = item;
|
|
398
|
+
if (typeof signal.label !== 'string' || typeof signal.weight !== 'number')
|
|
399
|
+
return [];
|
|
400
|
+
return [{ label: signal.label, weight: signal.weight }];
|
|
401
|
+
}).slice(0, 6));
|
|
402
|
+
}
|
|
403
|
+
return output;
|
|
404
|
+
}
|
|
405
|
+
//# sourceMappingURL=run-campaign.js.map
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/** One launch entry as returned by `scrapeBetaListLaunches`. */
export type BetaListLaunch = {
|
|
2
|
+
/** URL of the launch's BetaList startup page. */
sourceUrl: string;
|
|
3
|
+
/** Display name of the product. */
productName: string;
|
|
4
|
+
/** URL of the product's own site, or null when none is known. */
productUrl: string | null;
|
|
5
|
+
/** Name of the creator/maker, or null when unknown. */
creatorName: string | null;
|
|
6
|
+
/** Contact email, or null when unknown. */
creatorEmail: string | null;
|
|
7
|
+
/** Social handle of the creator, or null when unknown. */
creatorHandle: string | null;
|
|
8
|
+
/** Creator language code, or null when unknown. */
creatorLang: string | null;
|
|
9
|
+
/** Optional short product description. */
tagline?: string | null;
|
|
10
|
+
};
|
|
11
|
+
/** Options accepted by `scrapeBetaListLaunches`. */
export interface ScrapeBetaListOptions {
|
|
12
|
+
/** How many days back a launch may be dated and still be included. */
lookbackDays: number;
|
|
13
|
+
/** User-Agent string to send with the request. */
userAgent: string;
|
|
14
|
+
/** Logger used for diagnostics. */
logger: {
|
|
15
|
+
info(msg: string): void;
|
|
16
|
+
warn(msg: string): void;
|
|
17
|
+
error(msg: string): void;
|
|
18
|
+
};
|
|
19
|
+
}
|
|
20
|
+
/**
 * Scrapes BetaList for recent launches.
 *
 * @param opts Lookback window, User-Agent and logger.
 * @returns The launches found (possibly an empty array).
 */
export declare function scrapeBetaListLaunches(opts: ScrapeBetaListOptions): Promise<BetaListLaunch[]>;
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/** BetaList homepage: the page that is fetched, and the base for resolving relative hrefs. */
const BETALIST_ORIGIN = 'https://betalist.com/';
|
|
2
|
+
/**
 * Fetches the BetaList homepage and extracts one launch record per unique
 * startup link. Fetch failures and non-2xx responses are logged and yield an
 * empty list. Rows whose parsed date is older than the lookback window are
 * dropped; rows without a parsable date are kept.
 *
 * @param opts `{ lookbackDays, userAgent, logger }`.
 * @returns Array of launch records (possibly empty).
 */
export async function scrapeBetaListLaunches(opts) {
    let html;
    try {
        const response = await fetch(BETALIST_ORIGIN, {
            headers: { Accept: 'text/html', 'User-Agent': opts.userAgent },
            signal: AbortSignal.timeout(15_000),
        });
        if (!response.ok) {
            opts.logger.warn(`[crm-betalist] BetaList fetch returned HTTP ${response.status}`);
            return [];
        }
        html = await response.text();
    }
    catch (error) {
        opts.logger.warn(`[crm-betalist] BetaList fetch failed: ${error.message}`);
        return [];
    }
    const cheerio = await loadCheerio();
    const $ = cheerio.load(html);
    const launches = [];
    const seen = new Set();
    const startupLinks = $('a[href^="/startups/"], a[href*="betalist.com/startups/"]').toArray();
    for (const anchor of startupLinks) {
        const href = $(anchor).attr('href');
        if (!href) {
            continue;
        }
        const sourceUrl = canonicalBetaListUrl(href);
        // Several anchors usually point at the same startup; only the first usable one counts.
        if (!sourceUrl || seen.has(sourceUrl)) {
            continue;
        }
        const container = closestLaunchContainer($, anchor);
        const productName = extractProductName($, anchor, container);
        if (!productName) {
            opts.logger.warn(`[crm-betalist] Missing product name for ${sourceUrl}`);
            continue;
        }
        const launchDate = parseLaunchDateForRow($, container);
        if (launchDate && !isWithinLookback(launchDate, opts.lookbackDays)) {
            continue;
        }
        const productUrl = extractProductUrl($, container, sourceUrl);
        const rowText = container.text().replace(/\s+/g, ' ').trim();
        const creatorHandle = extractHandleFromHtml(container.html() ?? '');
        launches.push({
            sourceUrl,
            productName,
            productUrl,
            creatorName: extractCreatorName($, container),
            creatorEmail: extractEmail(rowText),
            creatorHandle,
            creatorLang: null,
            tagline: extractTagline($, container, productName),
        });
        seen.add(sourceUrl);
    }
    if (launches.length === 0) {
        opts.logger.warn('[crm-betalist] No BetaList launch links found on homepage');
    }
    return launches;
}
|
|
62
|
+
function canonicalBetaListUrl(href) {
|
|
63
|
+
try {
|
|
64
|
+
const url = new URL(href, BETALIST_ORIGIN);
|
|
65
|
+
if (url.hostname !== 'betalist.com' && url.hostname !== 'www.betalist.com')
|
|
66
|
+
return null;
|
|
67
|
+
if (!url.pathname.startsWith('/startups/'))
|
|
68
|
+
return null;
|
|
69
|
+
url.hash = '';
|
|
70
|
+
url.search = '';
|
|
71
|
+
return url.toString();
|
|
72
|
+
}
|
|
73
|
+
catch {
|
|
74
|
+
return null;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
/**
 * Loads the `cheerio` module on demand through a dynamic `import()` created
 * with the `Function` constructor.
 * NOTE(review): presumably this keeps the import from being rewritten by the
 * compiler/bundler; confirm against the build setup.
 *
 * @returns Promise resolving to the cheerio module namespace.
 */
async function loadCheerio() {
    const dynamicImport = new Function('specifier', 'return import(specifier)');
    return dynamicImport('cheerio');
}
|
|
81
|
+
/**
 * Finds the nearest block-level ancestor of a startup link that contains a
 * startup link and has non-empty text; falls back to the anchor's parent.
 *
 * @param $ Loaded cheerio root function.
 * @param anchor The startup link element.
 * @returns Cheerio selection for the launch row.
 */
function closestLaunchContainer($, anchor) {
    const startupLinkSelector = 'a[href^="/startups/"], a[href*="betalist.com/startups/"]';
    const ancestors = $(anchor)
        .parents('article, li, tr, div, section')
        .toArray()
        .map((node) => $(node));
    const row = ancestors.find((candidate) => {
        const hasText = candidate.text().replace(/\s+/g, ' ').trim().length > 0;
        return candidate.find(startupLinkSelector).length >= 1 && hasText;
    });
    return row ?? $(anchor).parent();
}
|
|
92
|
+
function extractProductName($, anchor, container) {
|
|
93
|
+
const direct = $(anchor).text().replace(/\s+/g, ' ').trim();
|
|
94
|
+
if (direct && direct.length <= 200)
|
|
95
|
+
return direct;
|
|
96
|
+
const heading = container.find('h1, h2, h3, h4, [class*="title"], [class*="name"]').first().text().replace(/\s+/g, ' ').trim();
|
|
97
|
+
if (heading && heading.length <= 200)
|
|
98
|
+
return heading;
|
|
99
|
+
return null;
|
|
100
|
+
}
|
|
101
|
+
/**
 * Finds the product's own URL inside a launch row. Links whose text mentions
 * "visit" or "website" are preferred; otherwise the first link that resolves
 * to a non-BetaList URL is used.
 *
 * @param $ Loaded cheerio root function.
 * @param container Cheerio selection for the launch row.
 * @param sourceUrl Canonical BetaList URL of the launch (never returned).
 * @returns Absolute external URL, or `null` when none is found.
 */
function extractProductUrl($, container, sourceUrl) {
    const links = container.find('a[href]').toArray();
    // Pass 1: explicit "visit"/"website" call-to-action links.
    for (const link of links) {
        const node = $(link);
        const href = node.attr('href');
        if (!href) {
            continue;
        }
        const label = node.text().replace(/\s+/g, ' ').trim().toLowerCase();
        const looksLikeCta = label.includes('visit') || label.includes('website');
        if (!looksLikeCta) {
            continue;
        }
        const target = resolveExternalUrl(href);
        if (target && target !== sourceUrl) {
            return target;
        }
    }
    // Pass 2: any external link at all.
    for (const link of links) {
        const href = $(link).attr('href');
        if (!href) {
            continue;
        }
        const target = resolveExternalUrl(href);
        if (target && !target.includes('betalist.com/startups/')) {
            return target;
        }
    }
    return null;
}
|
|
122
|
+
function resolveExternalUrl(href) {
|
|
123
|
+
try {
|
|
124
|
+
const url = new URL(href, BETALIST_ORIGIN);
|
|
125
|
+
if (url.hostname === 'betalist.com' || url.hostname === 'www.betalist.com')
|
|
126
|
+
return null;
|
|
127
|
+
return url.toString();
|
|
128
|
+
}
|
|
129
|
+
catch {
|
|
130
|
+
return null;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
/**
 * Tries to determine a launch row's date. Lookup order: a `<time datetime>`
 * inside the row, the text of the first `<time>` inside the row, then up to
 * five preceding heading/time siblings of the row, then of the row's parent.
 *
 * @param $ Loaded cheerio root function.
 * @param container Cheerio selection for the launch row.
 * @returns Parsed `Date`, or `null` when nothing parses.
 */
function parseLaunchDateForRow($, container) {
    const fromAttribute = parseDate(container.find('time[datetime]').first().attr('datetime'));
    if (fromAttribute) {
        return fromAttribute;
    }
    const fromTimeText = parseDate(container.find('time').first().text());
    if (fromTimeText) {
        return fromTimeText;
    }
    // Date headers may sit before the row itself or before the row's parent.
    const scopes = [() => container, () => container.parent()];
    for (const scope of scopes) {
        const nearby = scope().prevAll('h1, h2, h3, h4, time, [datetime]').slice(0, 5).toArray();
        for (const element of nearby) {
            const node = $(element);
            const parsed = parseDate(node.attr('datetime') ?? node.text());
            if (parsed) {
                return parsed;
            }
        }
    }
    return null;
}
|
|
157
|
+
/**
 * Parses a loosely formatted date string. Whitespace is normalized, the
 * literal words "today"/"yesterday" (case-insensitive) map to now / 24 hours
 * ago, and everything else is handed to the `Date` constructor.
 *
 * @param value Date text or attribute value (may be undefined).
 * @returns A valid `Date`, or `null` for empty or unparsable input.
 */
function parseDate(value) {
    const compact = value?.replace(/\s+/g, ' ').trim();
    if (!compact) {
        return null;
    }
    let candidate = compact;
    if (/^today$/i.test(compact)) {
        candidate = new Date().toISOString();
    }
    else if (/^yesterday$/i.test(compact)) {
        candidate = new Date(Date.now() - 86_400_000).toISOString();
    }
    const parsed = new Date(candidate);
    return Number.isNaN(parsed.getTime()) ? null : parsed;
}
|
|
167
|
+
/**
 * Checks whether `date` falls on or after local midnight `lookbackDays` days
 * ago. Lookback values below 1 are treated as 1.
 *
 * @param date Launch date.
 * @param lookbackDays Size of the window in days.
 * @returns `true` when the date is inside the window.
 */
function isWithinLookback(date, lookbackDays) {
    const days = Math.max(1, lookbackDays);
    const now = new Date();
    // Local midnight of today, moved back by `days`.
    const cutoff = new Date(now.getFullYear(), now.getMonth(), now.getDate(), 0, 0, 0, 0);
    cutoff.setDate(cutoff.getDate() - days);
    return date >= cutoff;
}
|
|
173
|
+
function extractCreatorName($, container) {
|
|
174
|
+
const relAuthor = container.find('[rel="author"], [class*="creator"], [class*="maker"], a[href^="/makers/"]').first().text();
|
|
175
|
+
const normalized = relAuthor.replace(/\s+/g, ' ').trim();
|
|
176
|
+
return normalized || null;
|
|
177
|
+
}
|
|
178
|
+
function extractTagline($, container, productName) {
|
|
179
|
+
const selectors = ['[class*="tagline"]', '[class*="description"]', 'p'];
|
|
180
|
+
for (const selector of selectors) {
|
|
181
|
+
const text = container.find(selector).first().text().replace(/\s+/g, ' ').trim();
|
|
182
|
+
if (text && text !== productName && text.length <= 500)
|
|
183
|
+
return text;
|
|
184
|
+
}
|
|
185
|
+
return null;
|
|
186
|
+
}
|
|
187
|
+
/**
 * Returns the first email-looking token found in the text.
 *
 * @param text Plain text to search.
 * @returns The matched address, or `null` when there is none.
 */
function extractEmail(text) {
    const hit = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
    if (hit === null) {
        return null;
    }
    return hit[0] ?? null;
}
|
|
190
|
+
/** First path segments on twitter.com / x.com that are site features, not user handles. */
const NON_HANDLE_TWITTER_PATHS = new Set(['share', 'intent', 'home', 'search', 'hashtag', 'i', 'login', 'signup', 'explore']);
/**
 * Extracts the first Twitter/X handle linked in an HTML fragment.
 *
 * The host must start at a boundary (optionally prefixed by `www.` or
 * `mobile.`), so domains that merely end in "x.com" (netflix.com,
 * dropbox.com, ...) no longer match. Links to share/intent and similar
 * feature paths are skipped and the search continues with later links.
 *
 * @param html Raw HTML of the launch row.
 * @returns The handle prefixed with `@`, or `null` when none is found.
 */
function extractHandleFromHtml(html) {
    const profileLink = /(?:^|[^A-Za-z0-9.-])(?:(?:www|mobile)\.)?(?:twitter|x)\.com\/([A-Za-z0-9_]{1,20})/gi;
    for (const match of html.matchAll(profileLink)) {
        const handle = match[1];
        if (!NON_HANDLE_TWITTER_PATHS.has(handle.toLowerCase())) {
            return `@${handle}`;
        }
    }
    return null;
}
|
|
194
|
+
//# sourceMappingURL=scrape-betalist.js.map
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { Browser } from 'playwright';
|
|
2
|
+
/** Result of scraping one product landing page, as returned by `scrapeLanding`. */
export type LandingScrape = {
|
|
3
|
+
/** Signals derived from the page's HTML; every field is optional. */
htmlSignals: {
|
|
4
|
+
n_screenshots_html?: number;
|
|
5
|
+
has_video_html?: boolean;
|
|
6
|
+
has_docs_html?: boolean;
|
|
7
|
+
has_dashboard_html?: boolean;
|
|
8
|
+
};
|
|
9
|
+
/** Filesystem path of the captured screenshot, or null when none was taken. */
screenshotPath: string | null;
|
|
10
|
+
/** Whether the page appears to be a web app. */
isWebApp: boolean;
|
|
11
|
+
/** Text content extracted from the page. */
rawText: string;
|
|
12
|
+
/** Optional error code describing why the scrape was incomplete. */
error?: string;
|
|
13
|
+
};
|
|
14
|
+
/** Options accepted by `scrapeLanding`. */
export interface ScrapeLandingOptions {
|
|
15
|
+
/** URL of the landing page to scrape. */
productUrl: string;
|
|
16
|
+
/** User-Agent string to use for the scrape. */
userAgent: string;
|
|
17
|
+
/** Logger used for diagnostics. */
logger: {
|
|
18
|
+
info(msg: string): void;
|
|
19
|
+
warn(msg: string): void;
|
|
20
|
+
error(msg: string): void;
|
|
21
|
+
};
|
|
22
|
+
/** Playwright browser instance supplied by the caller. */
browser: Browser;
|
|
23
|
+
}
|
|
24
|
+
/**
 * Scrapes a product landing page using the supplied browser.
 *
 * @param opts Target URL, User-Agent, logger and browser.
 * @returns The scrape result for the page.
 */
export declare function scrapeLanding(opts: ScrapeLandingOptions): Promise<LandingScrape>;
|