autokap 1.4.2 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-contract.d.ts +73 -0
- package/dist/cli-contract.js +1 -0
- package/dist/cli-doctor.d.ts +4 -0
- package/dist/cli-doctor.js +302 -0
- package/dist/cli-runner.js +315 -2
- package/dist/cli.js +122 -47
- package/dist/types.d.ts +1 -1
- package/dist/version-check.d.ts +4 -0
- package/dist/version-check.js +102 -0
- package/package.json +2 -3
- package/dist/crm/email-fallback.d.ts +0 -16
- package/dist/crm/email-fallback.js +0 -217
- package/dist/crm/run-campaign.d.ts +0 -28
- package/dist/crm/run-campaign.js +0 -405
- package/dist/crm/scrape-betalist.d.ts +0 -20
- package/dist/crm/scrape-betalist.js +0 -194
- package/dist/crm/scrape-landing.d.ts +0 -24
- package/dist/crm/scrape-landing.js +0 -240
- package/dist/crm/storage-upload.d.ts +0 -14
- package/dist/crm/storage-upload.js +0 -40
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
/**
 * One launch entry scraped from the BetaList homepage.
 */
export type BetaListLaunch = {
    /** Canonical BetaList listing URL (query string and fragment removed). */
    sourceUrl: string;
    /** Product name taken from the listing link text or the row's first heading. */
    productName: string;
    /** First external (non-BetaList) link found in the row, or `null` when none exists. */
    productUrl: string | null;
    /** Text of the first author/creator/maker element in the row, or `null`. */
    creatorName: string | null;
    /** First e-mail-looking string in the row's text, or `null`. */
    creatorEmail: string | null;
    /** `@`-prefixed handle derived from a twitter.com / x.com link in the row's HTML, or `null`. */
    creatorHandle: string | null;
    /** Creator language; the BetaList scraper always reports `null`. */
    creatorLang: string | null;
    /** Short description found in the row, when one could be extracted. */
    tagline?: string | null;
};
|
|
11
|
-
/**
 * Inputs for {@link scrapeBetaListLaunches}.
 */
export interface ScrapeBetaListOptions {
    /**
     * How many days back a dated launch may be and still be returned.
     * Values below 1 are treated as 1; rows without a detectable date are kept.
     */
    lookbackDays: number;
    /** Value sent as the `User-Agent` request header. */
    userAgent: string;
    /** Sink for diagnostics; the scraper reports fetch and parsing problems via `warn`. */
    logger: {
        info(msg: string): void;
        warn(msg: string): void;
        error(msg: string): void;
    };
}
|
|
20
|
-
/**
 * Fetches the BetaList homepage and extracts the launches listed on it.
 *
 * @param opts - Lookback window, User-Agent and logger to use.
 * @returns The extracted launches; an empty array when the homepage cannot be
 *   fetched or answers with a non-OK status.
 */
export declare function scrapeBetaListLaunches(opts: ScrapeBetaListOptions): Promise<BetaListLaunch[]>;
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
/** BetaList homepage URL: the page that is fetched and the base used to resolve relative links. */
const BETALIST_ORIGIN = 'https://betalist.com/';
|
|
2
|
-
/**
 * Fetches the BetaList homepage and turns every startup link on it into a launch record.
 *
 * Rows are de-duplicated by canonical listing URL. A row is skipped when no
 * product name can be found, or when it carries a date that falls outside the
 * lookback window; rows without any detectable date are kept.
 *
 * @param opts Options with `lookbackDays`, `userAgent` and `logger`.
 * @returns Array of launch records; empty when the homepage fetch fails or is not OK.
 */
export async function scrapeBetaListLaunches(opts) {
    let homepageHtml;
    try {
        const res = await fetch(BETALIST_ORIGIN, {
            headers: {
                Accept: 'text/html',
                'User-Agent': opts.userAgent,
            },
            signal: AbortSignal.timeout(15_000),
        });
        if (!res.ok) {
            opts.logger.warn(`[crm-betalist] BetaList fetch returned HTTP ${res.status}`);
            return [];
        }
        homepageHtml = await res.text();
    }
    catch (fetchError) {
        // Network failures and timeouts are non-fatal: report and return nothing.
        opts.logger.warn(`[crm-betalist] BetaList fetch failed: ${fetchError.message}`);
        return [];
    }
    const cheerioModule = await loadCheerio();
    const $ = cheerioModule.load(homepageHtml);
    const results = [];
    const visitedUrls = new Set();
    const launchAnchors = $('a[href^="/startups/"], a[href*="betalist.com/startups/"]').toArray();
    for (const anchor of launchAnchors) {
        const rawHref = $(anchor).attr('href');
        const sourceUrl = rawHref ? canonicalBetaListUrl(rawHref) : null;
        if (!sourceUrl || visitedUrls.has(sourceUrl)) {
            continue;
        }
        const row = closestLaunchContainer($, anchor);
        const productName = extractProductName($, anchor, row);
        if (!productName) {
            opts.logger.warn(`[crm-betalist] Missing product name for ${sourceUrl}`);
            continue;
        }
        // Only rows with a known date can be rejected as too old.
        const launchedAt = parseLaunchDateForRow($, row);
        const tooOld = launchedAt ? !isWithinLookback(launchedAt, opts.lookbackDays) : false;
        if (tooOld) {
            continue;
        }
        const productUrl = extractProductUrl($, row, sourceUrl);
        const rowText = row.text().replace(/\s+/g, ' ').trim();
        const creatorHandle = extractHandleFromHtml(row.html() ?? '');
        results.push({
            sourceUrl,
            productName,
            productUrl,
            creatorName: extractCreatorName($, row),
            creatorEmail: extractEmail(rowText),
            creatorHandle,
            creatorLang: null,
            tagline: extractTagline($, row, productName),
        });
        visitedUrls.add(sourceUrl);
    }
    if (results.length === 0) {
        opts.logger.warn('[crm-betalist] No BetaList launch links found on homepage');
    }
    return results;
}
|
|
62
|
-
/**
 * Normalises a link to a BetaList startup page.
 *
 * @param href Absolute or homepage-relative link.
 * @returns The absolute URL without query string or fragment, or `null` when the
 *   link is unparsable, not on betalist.com / www.betalist.com, or not under `/startups/`.
 */
function canonicalBetaListUrl(href) {
    let parsed;
    try {
        parsed = new URL(href, BETALIST_ORIGIN);
    }
    catch {
        return null;
    }
    const onBetaList = ['betalist.com', 'www.betalist.com'].includes(parsed.hostname);
    if (!onBetaList || !parsed.pathname.startsWith('/startups/')) {
        return null;
    }
    parsed.hash = '';
    parsed.search = '';
    return parsed.toString();
}
|
|
77
|
-
/**
 * Loads the `cheerio` module at call time.
 *
 * The `import()` expression is built through the `Function` constructor rather
 * than written inline, so it only exists as a string until this function runs.
 *
 * @returns Promise for the cheerio module namespace.
 */
async function loadCheerio() {
    const dynamicImport = new Function('specifier', 'return import(specifier)');
    const cheerioModule = dynamicImport('cheerio');
    return cheerioModule;
}
|
|
81
|
-
/**
 * Finds the element that wraps a launch row.
 *
 * Walks the anchor's `article, li, tr, div, section` ancestors in the order
 * `parents()` yields them and picks the first one that has non-blank text and
 * contains at least one startup link.
 *
 * @param $ Cheerio root function.
 * @param anchor The startup link element.
 * @returns Wrapped container element; the anchor's direct parent when no ancestor qualifies.
 */
function closestLaunchContainer($, anchor) {
    const ancestors = $(anchor).parents('article, li, tr, div, section').toArray();
    const match = ancestors.find((node) => {
        const wrapped = $(node);
        const hasText = wrapped.text().replace(/\s+/g, ' ').trim().length > 0;
        const linkCount = wrapped.find('a[href^="/startups/"], a[href*="betalist.com/startups/"]').length;
        return linkCount >= 1 && hasText;
    });
    return match ? $(match) : $(anchor).parent();
}
|
|
92
|
-
/**
 * Determines the product name for a launch row.
 *
 * Prefers the link's own text; falls back to the first heading-like element
 * (`h1`–`h4`, or a class containing "title"/"name") inside the container.
 * Candidates that are blank or longer than 200 characters are rejected.
 *
 * @param $ Cheerio root function.
 * @param anchor The startup link element.
 * @param container Wrapped row element.
 * @returns Whitespace-collapsed name, or `null`.
 */
function extractProductName($, anchor, container) {
    const collapse = (value) => value.replace(/\s+/g, ' ').trim();
    const isUsable = (name) => Boolean(name) && name.length <= 200;
    const anchorText = collapse($(anchor).text());
    if (isUsable(anchorText)) {
        return anchorText;
    }
    const headingText = collapse(container.find('h1, h2, h3, h4, [class*="title"], [class*="name"]').first().text());
    return isUsable(headingText) ? headingText : null;
}
|
|
101
|
-
/**
 * Picks the product's own website link out of a launch row.
 *
 * First pass: a link whose text mentions "visit" or "website" and resolves to an
 * external URL different from `sourceUrl`. Second pass: the first link of any
 * kind that resolves to an external URL.
 *
 * @param $ Cheerio root function.
 * @param container Wrapped row element.
 * @param sourceUrl Canonical BetaList listing URL of the row.
 * @returns Absolute external URL, or `null` when the row has none.
 */
function extractProductUrl($, container, sourceUrl) {
    const anchors = container.find('a[href]').toArray();
    for (const anchor of anchors) {
        const node = $(anchor);
        const href = node.attr('href');
        if (!href) {
            continue;
        }
        const label = node.text().replace(/\s+/g, ' ').trim().toLowerCase();
        const looksLikeVisitLink = label.includes('visit') || label.includes('website');
        if (!looksLikeVisitLink) {
            continue;
        }
        const target = resolveExternalUrl(href);
        if (target && target !== sourceUrl) {
            return target;
        }
    }
    const fallback = anchors
        .map((anchor) => {
            const href = $(anchor).attr('href');
            return href ? resolveExternalUrl(href) : null;
        })
        .find((target) => target && !target.includes('betalist.com/startups/'));
    return fallback ?? null;
}
|
|
122
|
-
/**
 * Resolves a link against the BetaList homepage and keeps it only if it leaves BetaList.
 *
 * @param href Absolute or relative link.
 * @returns Absolute URL string, or `null` when the link is unparsable or its host
 *   is betalist.com / www.betalist.com.
 */
function resolveExternalUrl(href) {
    let target;
    try {
        target = new URL(href, BETALIST_ORIGIN);
    }
    catch {
        return null;
    }
    const isBetaListHost = target.hostname === 'betalist.com' || target.hostname === 'www.betalist.com';
    return isBetaListHost ? null : target.toString();
}
|
|
133
|
-
/**
 * Tries to find the launch date that belongs to a row.
 *
 * Sources are tried in order:
 *  1. the `datetime` attribute of the first `<time datetime>` inside the row,
 *  2. the text of the first `<time>` inside the row,
 *  3. up to five preceding heading/time siblings of the row,
 *  4. up to five preceding heading/time siblings of the row's parent.
 *
 * @param $ Cheerio root function.
 * @param container Wrapped row element.
 * @returns The first value that parses as a date, or `null`.
 */
function parseLaunchDateForRow($, container) {
    const DATE_BEARING = 'h1, h2, h3, h4, time, [datetime]';
    // Returns the first parsable date among the given elements (attribute first, then text).
    const firstDateAmong = (elements) => {
        for (const el of elements) {
            const candidate = parseDate($(el).attr('datetime') ?? $(el).text());
            if (candidate) {
                return candidate;
            }
        }
        return null;
    };
    const fromAttribute = parseDate(container.find('time[datetime]').first().attr('datetime'));
    if (fromAttribute) {
        return fromAttribute;
    }
    const fromTimeText = parseDate(container.find('time').first().text());
    if (fromTimeText) {
        return fromTimeText;
    }
    return (firstDateAmong(container.prevAll(DATE_BEARING).slice(0, 5).toArray())
        ?? firstDateAmong(container.parent().prevAll(DATE_BEARING).slice(0, 5).toArray()));
}
|
|
157
|
-
/**
 * Parses a date string found on the page.
 *
 * Whitespace is collapsed first. The words "today" and "yesterday"
 * (case-insensitive, whole value) map to the current instant and to 24 hours
 * earlier; everything else is handed to the `Date` constructor.
 *
 * @param value Raw attribute or text value; may be `null`/`undefined`.
 * @returns A valid `Date`, or `null` when the value is missing, blank or unparsable.
 */
function parseDate(value) {
    if (value === null || value === undefined) {
        return null;
    }
    const text = value.replace(/\s+/g, ' ').trim();
    if (!text) {
        return null;
    }
    let dateInput = text;
    if (/^today$/i.test(text)) {
        dateInput = new Date().toISOString();
    }
    else if (/^yesterday$/i.test(text)) {
        dateInput = new Date(Date.now() - 86_400_000).toISOString();
    }
    const result = new Date(dateInput);
    return Number.isNaN(result.getTime()) ? null : result;
}
|
|
167
|
-
/**
 * Checks whether a date is recent enough to be kept.
 *
 * The cutoff is local midnight today, moved back by `lookbackDays` days
 * (at least one day).
 *
 * @param date Date to test.
 * @param lookbackDays Size of the window in days; values below 1 count as 1.
 * @returns `true` when `date` is at or after the cutoff.
 */
function isWithinLookback(date, lookbackDays) {
    const windowDays = Math.max(1, lookbackDays);
    const earliestAllowed = new Date();
    earliestAllowed.setHours(0, 0, 0, 0);
    earliestAllowed.setDate(earliestAllowed.getDate() - windowDays);
    return date >= earliestAllowed;
}
|
|
173
|
-
/**
 * Reads the creator's display name from a launch row.
 *
 * @param $ Cheerio root function (unused; kept for a uniform extractor signature).
 * @param container Wrapped row element.
 * @returns Whitespace-collapsed text of the first author/creator/maker element, or `null` when blank.
 */
function extractCreatorName($, container) {
    const creatorNode = container
        .find('[rel="author"], [class*="creator"], [class*="maker"], a[href^="/makers/"]')
        .first();
    const name = creatorNode.text().replace(/\s+/g, ' ').trim();
    return name === '' ? null : name;
}
|
|
178
|
-
/**
 * Extracts a short description for a launch row.
 *
 * Tries, in order, the first element whose class contains "tagline", then
 * "description", then the first `<p>`. A candidate is accepted when it is
 * non-blank, differs from the product name and is at most 500 characters.
 *
 * @param $ Cheerio root function (unused; kept for a uniform extractor signature).
 * @param container Wrapped row element.
 * @param productName Name already extracted for the row.
 * @returns The accepted text, or `null`.
 */
function extractTagline($, container, productName) {
    let tagline = null;
    ['[class*="tagline"]', '[class*="description"]', 'p'].some((selector) => {
        const candidate = container.find(selector).first().text().replace(/\s+/g, ' ').trim();
        const acceptable = candidate !== '' && candidate !== productName && candidate.length <= 500;
        if (acceptable) {
            tagline = candidate;
        }
        return acceptable;
    });
    return tagline;
}
|
|
187
|
-
/**
 * Finds the first e-mail-looking substring in a piece of text.
 *
 * @param text Plain text to search.
 * @returns The matched address, or `null` when none is present.
 */
function extractEmail(text) {
    const found = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
    if (!found) {
        return null;
    }
    return found[0];
}
|
|
190
|
-
/**
 * Derives a Twitter/X handle from the first profile link found in an HTML fragment.
 *
 * The host must be exactly `twitter.com` or `x.com` (optionally prefixed with
 * `www.` or `mobile.`), so unrelated domains that merely end in "x.com"
 * (netflix.com, box.com, ...) are not mistaken for profile links. Well-known
 * non-profile paths (`intent`, `share`, `home`, `search`, `hashtag`, `i`) are
 * skipped, and a path segment longer than 20 handle characters is rejected
 * instead of being truncated into a different handle.
 *
 * @param html HTML source of the launch row.
 * @returns The handle prefixed with `@`, or `null` when no profile link is found.
 */
function extractHandleFromHtml(html) {
    const profileLink = /(?<![A-Za-z0-9.-])(?:(?:www|mobile)\.)?(?:twitter|x)\.com\/(?!(?:intent|share|home|search|hashtag|i)(?![A-Za-z0-9_]))([A-Za-z0-9_]{1,20})(?![A-Za-z0-9_])/i;
    const match = html.match(profileLink);
    return match ? `@${match[1]}` : null;
}
|
|
194
|
-
//# sourceMappingURL=scrape-betalist.js.map
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import type { Browser } from 'playwright';
|
|
2
|
-
/**
 * Result of inspecting a product's landing page.
 */
export type LandingScrape = {
    /** Signals derived from the static HTML; left empty when the page was not fetched. */
    htmlSignals: {
        /** Number of `<img>` elements that do not look like logos, icons or tiny images (counted up to 50). */
        n_screenshots_html?: number;
        /** Whether the page has a `<video>` element or an iframe from a known video host. */
        has_video_html?: boolean;
        /** Whether the page links to docs/documentation. */
        has_docs_html?: boolean;
        /** Whether the page text mentions a dashboard, or an app together with a sign-in prompt. */
        has_dashboard_html?: boolean;
    };
    /** Path of the full-page PNG screenshot on local disk, or `null` when none was taken. */
    screenshotPath: string | null;
    /** `false` when the page looks like a mobile-only, hardware or extension-only product. */
    isWebApp: boolean;
    /** Whitespace-collapsed body text, truncated to 8000 characters. */
    rawText: string;
    /** Failure code: `invalid_product_url`, `robots_disallowed` or `screenshot_failed: <message>`. */
    error?: string;
};
|
|
14
|
-
/**
 * Inputs for {@link scrapeLanding}.
 */
export interface ScrapeLandingOptions {
    /** Landing page to inspect; must be an absolute `http:` or `https:` URL. */
    productUrl: string;
    /** User-Agent used for the robots.txt and HTML requests and for the browser context. */
    userAgent: string;
    /** Sink for diagnostics; failures are reported via `warn`. */
    logger: {
        info(msg: string): void;
        warn(msg: string): void;
        error(msg: string): void;
    };
    /** Already-launched Playwright browser; a fresh context is opened and closed per call. */
    browser: Browser;
}
|
|
24
|
-
/**
 * Inspects a product landing page: checks robots.txt, derives HTML signals and
 * takes a full-page screenshot.
 *
 * @param opts - Target URL, User-Agent, logger and Playwright browser.
 * @returns The scrape result; problems are reported through its `error` field.
 */
export declare function scrapeLanding(opts: ScrapeLandingOptions): Promise<LandingScrape>;
|
|
@@ -1,240 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import os from 'node:os';
|
|
3
|
-
import path from 'node:path';
|
|
4
|
-
const VIDEO_HOST_RE = /(?:youtube\.com|youtu\.be|vimeo\.com|loom\.com|wistia)/i;
|
|
5
|
-
/** Phrases in the page text that mark a product as not being a web app (mobile-only, hardware, extension-only). */
const NOT_WEB_APP_RE = /\b(iOS only|App Store only|iPhone only|iPad only|Android only|Google Play only|hardware|physical product|chrome extension only|firefox addon only)\b/i;
|
|
6
|
-
/**
 * Inspects a product landing page.
 *
 * Steps: validate the URL, honour a site-wide robots.txt disallow, derive
 * signals from the static HTML, then take a full-page screenshot. A failed
 * screenshot keeps the HTML signals and records the failure in `error`.
 *
 * @param opts Options with `productUrl`, `userAgent`, `logger` and `browser`.
 * @returns Scrape result; for an invalid or robots-blocked URL an empty result
 *   (with `isWebApp: true`) carrying the matching `error` code.
 */
export async function scrapeLanding(opts) {
    // Result used when the page is never fetched.
    const skipped = (error) => ({
        htmlSignals: {},
        screenshotPath: null,
        isWebApp: true,
        rawText: '',
        error,
    });
    const url = normalizeHttpUrl(opts.productUrl);
    if (!url) {
        return skipped('invalid_product_url');
    }
    const robotsAllowed = await isAllowedByRobots(url, opts.userAgent, opts.logger);
    if (!robotsAllowed) {
        return skipped('robots_disallowed');
    }
    const htmlResult = await scrapeHtmlSignals(url, opts.userAgent, opts.logger);
    // The orchestrator launches Chromium and applies Linux-only container flags.
    let screenshotPath;
    try {
        screenshotPath = await captureLandingScreenshot(url, opts);
    }
    catch (error) {
        opts.logger.warn(`[crm-landing] Screenshot failed for ${url}: ${error.message}`);
        return {
            ...htmlResult,
            screenshotPath: null,
            error: `screenshot_failed: ${error.message}`,
        };
    }
    return { ...htmlResult, screenshotPath };
}
|
|
42
|
-
/**
 * Checks the site's robots.txt for a blanket disallow.
 *
 * The check fails open: an unbuildable robots URL, a non-OK response, or a
 * network error all count as "allowed".
 *
 * @param url Page URL whose origin's `/robots.txt` is consulted.
 * @param userAgent Value sent as the `User-Agent` header.
 * @param logger Logger used to report fetch failures.
 * @returns `false` only when robots.txt was fetched and disallows everything.
 */
async function isAllowedByRobots(url, userAgent, logger) {
    let robotsUrl;
    try {
        robotsUrl = new URL('/robots.txt', url).toString();
    }
    catch {
        return true;
    }
    try {
        const res = await fetch(robotsUrl, {
            headers: {
                Accept: 'text/plain,text/*,*/*',
                'User-Agent': userAgent,
            },
            signal: AbortSignal.timeout(15_000),
        });
        const blocked = res.ok ? robotsDisallowsAll(await res.text()) : false;
        return !blocked;
    }
    catch (error) {
        logger.warn(`[crm-landing] robots.txt fetch failed open for ${robotsUrl}: ${error.message}`);
        return true;
    }
}
|
|
68
|
-
/**
 * Decides whether a robots.txt body forbids crawling the whole site.
 *
 * Parsing follows the group model of the Robots Exclusion Protocol:
 *  - consecutive `User-agent` lines open one group; the first `Allow` or
 *    `Disallow` line ends the agent list, so the next `User-agent` line starts
 *    a new group (previously only `Disallow` ended it, which merged an
 *    "Allow: /" group for one agent into the following group);
 *  - all groups naming the same agent are combined;
 *  - the `autokap-crawler` groups take precedence over the `*` groups.
 *
 * The site counts as fully disallowed when the selected rules contain
 * `Disallow: /` and no `Allow: /` (on an equal-length match, allow wins).
 *
 * @param text Raw robots.txt content.
 * @returns `true` when everything is disallowed for this crawler, otherwise `false`.
 */
function robotsDisallowsAll(text) {
    const groups = [];
    let current = null;
    // True while we are still reading the User-agent lines that open a group.
    let collectingAgents = false;
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.replace(/#.*/, '').trim();
        if (!line)
            continue;
        const colon = line.indexOf(':');
        if (colon === -1)
            continue;
        const key = line.slice(0, colon).trim().toLowerCase();
        const value = line.slice(colon + 1).trim();
        if (key === 'user-agent') {
            if (!current || !collectingAgents) {
                current = { agents: [], allows: [], disallows: [] };
                groups.push(current);
                collectingAgents = true;
            }
            current.agents.push(value.toLowerCase());
        }
        else if ((key === 'allow' || key === 'disallow') && current) {
            // Any rule line closes the agent list of the current group.
            collectingAgents = false;
            const bucket = key === 'allow' ? current.allows : current.disallows;
            bucket.push(value);
        }
    }
    // Combines the rules of every group that names the given agent.
    const rulesFor = (agent) => {
        const matching = groups.filter((group) => group.agents.includes(agent));
        if (matching.length === 0)
            return null;
        return {
            allows: matching.flatMap((group) => group.allows),
            disallows: matching.flatMap((group) => group.disallows),
        };
    };
    const rules = rulesFor('autokap-crawler') ?? rulesFor('*');
    if (!rules)
        return false;
    const blocksRoot = rules.disallows.some((rule) => rule === '/');
    const allowsRoot = rules.allows.some((rule) => rule === '/');
    return blocksRoot && !allowsRoot;
}
|
|
96
|
-
/**
 * Downloads the landing page and derives signals from its static HTML.
 *
 * `script`, `style`, `noscript` and `svg` elements are removed before the body
 * text is read; the text is whitespace-collapsed and capped at 8000 characters.
 *
 * @param url Landing page URL.
 * @param userAgent Value sent as the `User-Agent` header.
 * @param logger Logger used to report fetch problems.
 * @returns `{ htmlSignals, isWebApp, rawText }`; an empty result with
 *   `isWebApp: true` when the page cannot be fetched or is not OK.
 */
async function scrapeHtmlSignals(url, userAgent, logger) {
    const emptyResult = () => ({ htmlSignals: {}, isWebApp: true, rawText: '' });
    try {
        const res = await fetch(url, {
            headers: {
                Accept: 'text/html',
                'User-Agent': userAgent,
            },
            signal: AbortSignal.timeout(15_000),
        });
        if (!res.ok) {
            logger.warn(`[crm-landing] HTML fetch returned HTTP ${res.status} for ${url}`);
            return emptyResult();
        }
        const markup = await res.text();
        const cheerioModule = await loadCheerio();
        const $ = cheerioModule.load(markup);
        $('script, style, noscript, svg').remove();
        const rawText = $('body').text().replace(/\s+/g, ' ').trim().slice(0, 8000);
        return {
            htmlSignals: {
                n_screenshots_html: countContentImages($),
                has_video_html: hasVideo($),
                has_docs_html: hasDocs($),
                has_dashboard_html: hasDashboard(rawText),
            },
            isWebApp: detectWebApp($, rawText),
            rawText,
        };
    }
    catch (error) {
        logger.warn(`[crm-landing] HTML fetch failed for ${url}: ${error.message}`);
        return emptyResult();
    }
}
|
|
131
|
-
/**
 * Loads the `cheerio` module at call time.
 *
 * The `import()` expression is built through the `Function` constructor rather
 * than written inline, so it only exists as a string until this function runs.
 *
 * @returns Promise for the cheerio module namespace.
 */
async function loadCheerio() {
    const dynamicImport = new Function('specifier', 'return import(specifier)');
    const cheerioModule = dynamicImport('cheerio');
    return cheerioModule;
}
|
|
135
|
-
/**
 * Counts images that look like real content (e.g. product screenshots).
 *
 * An `<img src>` is ignored when its class mentions "logo"/"icon", its path
 * mentions "logo"/"favicon", or its declared width or height is 64 or less.
 * Counting stops at 50.
 *
 * @param $ Cheerio root function for the page.
 * @returns Number of content images, at most 50.
 */
function countContentImages($) {
    const LIMIT = 50;
    let total = 0;
    for (const img of $('img[src]').toArray()) {
        if (total >= LIMIT) {
            break;
        }
        const node = $(img);
        const src = node.attr('src') ?? '';
        const cssClass = node.attr('class') ?? '';
        const declaredWidth = parseDimension(node.attr('width'));
        const declaredHeight = parseDimension(node.attr('height'));
        let pathname;
        try {
            // The base only exists so that relative sources can be parsed.
            pathname = new URL(src, 'https://example.test/').pathname;
        }
        catch {
            pathname = src;
        }
        const looksLikeBranding = /logo|icon/i.test(cssClass) || /logo|favicon/i.test(pathname);
        const isTiny = (declaredWidth !== null && declaredWidth <= 64)
            || (declaredHeight !== null && declaredHeight <= 64);
        if (looksLikeBranding || isTiny) {
            continue;
        }
        total += 1;
    }
    return total;
}
|
|
162
|
-
/**
 * Parses an HTML `width`/`height` attribute value.
 *
 * @param value Attribute value; may be missing or empty.
 * @returns The leading base-10 integer, or `null` when absent or not numeric.
 */
function parseDimension(value) {
    const parsed = value ? Number.parseInt(value, 10) : Number.NaN;
    return Number.isFinite(parsed) ? parsed : null;
}
|
|
168
|
-
/**
 * Detects embedded video on the page.
 *
 * @param $ Cheerio root function for the page.
 * @returns `true` when a `<video>` element exists or an iframe's `src` matches a known video host.
 */
function hasVideo($) {
    if ($('video').length > 0) {
        return true;
    }
    for (const frame of $('iframe[src]').toArray()) {
        const src = $(frame).attr('src') ?? '';
        if (VIDEO_HOST_RE.test(src)) {
            return true;
        }
    }
    return false;
}
|
|
173
|
-
/**
 * Detects a link to documentation.
 *
 * @param $ Cheerio root function for the page.
 * @returns `true` when some link's href contains `/docs` or `/documentation`
 *   (case-insensitive) or its text is exactly "docs" / "documentation".
 */
function hasDocs($) {
    for (const anchor of $('a[href]').toArray()) {
        const link = $(anchor);
        const target = (link.attr('href') ?? '').toLowerCase();
        if (target.includes('/docs') || target.includes('/documentation')) {
            return true;
        }
        const label = link.text().replace(/\s+/g, ' ').trim().toLowerCase();
        if (label === 'docs' || label === 'documentation') {
            return true;
        }
    }
    return false;
}
|
|
180
|
-
/**
 * Guesses from the page text whether the product has a logged-in area.
 *
 * @param text Visible page text.
 * @returns `true` when the text mentions "dashboard", or mentions "app"
 *   together with a sign-in phrase (sign in, log in, login, signup).
 */
function hasDashboard(text) {
    const mentionsDashboard = /\bdashboard\b/i.test(text);
    const mentionsApp = /\bapp\b/i.test(text);
    const mentionsAuth = /\b(sign in|log in|login|signup)\b/i.test(text);
    return mentionsDashboard || (mentionsApp && mentionsAuth);
}
|
|
187
|
-
/**
 * Guesses whether the product is usable as a web app.
 *
 * Returns `false` when the text contains a "not a web app" phrase, or when the
 * page has download-style links (text mentioning download/get/install/app
 * store/google play) and their count equals the number of links that mention
 * an app store in their text or href.
 *
 * @param $ Cheerio root function for the page.
 * @param rawText Visible page text.
 * @returns `true` unless one of the conditions above holds.
 */
function detectWebApp($, rawText) {
    if (NOT_WEB_APP_RE.test(rawText)) {
        return false;
    }
    let storeLinkCount = 0;
    let downloadLinkCount = 0;
    for (const anchor of $('a[href]').toArray()) {
        const label = $(anchor).text().replace(/\s+/g, ' ').trim();
        const href = $(anchor).attr('href') ?? '';
        if (/app store|google play/i.test(`${label} ${href}`)) {
            storeLinkCount += 1;
        }
        if (label && /download|get|install|app store|google play/i.test(label)) {
            downloadLinkCount += 1;
        }
    }
    const onlyStoreCtas = downloadLinkCount > 0 && downloadLinkCount === storeLinkCount;
    return !onlyStoreCtas;
}
|
|
203
|
-
/**
 * Opens the landing page in a fresh browser context and saves a full-page PNG.
 *
 * Navigation first waits for network idle (30 s); if that fails it is retried
 * once waiting only for the `load` event. The file is written to
 * `<tmpdir>/autokap-crm/landing-<host>-<timestamp>.png`. The context is closed
 * whether or not the capture succeeds.
 *
 * @param url Landing page URL.
 * @param opts Options providing `browser`, `userAgent` and `logger`.
 * @returns Path of the written screenshot.
 */
async function captureLandingScreenshot(url, opts) {
    const context = await opts.browser.newContext({
        userAgent: opts.userAgent,
        viewport: { width: 1280, height: 800 },
    });
    try {
        const page = await context.newPage();
        try {
            await page.goto(url, { waitUntil: 'networkidle', timeout: 30_000 });
        }
        catch (navigationError) {
            opts.logger.warn(`[crm-landing] networkidle goto failed for ${url}, retrying load: ${navigationError.message}`);
            await page.goto(url, { waitUntil: 'load', timeout: 30_000 });
        }
        const outputDir = path.join(os.tmpdir(), 'autokap-crm');
        await fs.mkdir(outputDir, { recursive: true });
        const host = new URL(url).hostname;
        // Keep only filename-safe characters; fall back to a fixed slug if nothing is left.
        const slug = host.replace(/[^a-z0-9.-]+/gi, '-').replace(/^-+|-+$/g, '') || 'landing';
        const screenshotPath = path.join(outputDir, `landing-${slug}-${Date.now()}.png`);
        await page.screenshot({ path: screenshotPath, fullPage: true, type: 'png' });
        return screenshotPath;
    }
    finally {
        await context.close();
    }
}
|
|
229
|
-
/**
 * Validates and normalises an absolute web URL.
 *
 * @param value Candidate URL string.
 * @returns The URL as serialised by the `URL` class, or `null` when it cannot be
 *   parsed or its scheme is neither `http:` nor `https:`.
 */
function normalizeHttpUrl(value) {
    let parsed;
    try {
        parsed = new URL(value);
    }
    catch {
        return null;
    }
    const isWebScheme = ['http:', 'https:'].includes(parsed.protocol);
    return isWebScheme ? parsed.toString() : null;
}
|
|
240
|
-
//# sourceMappingURL=scrape-landing.js.map
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
 * Inputs for {@link uploadLandingShot}.
 */
export interface UploadLandingShotOptions {
    /** Identifier of the CRM run; becomes part of the upload endpoint path. */
    runId: string;
    /** URL the screenshot was taken from; sent as the `source_url` form field. */
    sourceUrl: string;
    /** PNG image bytes; sent as the `file` form field. */
    pngBuffer: Buffer;
    /** Base URL of the backend API; trailing slashes are ignored. */
    apiBaseUrl: string;
    /** Token sent as `Authorization: Bearer <runToken>`. */
    runToken: string;
}
|
|
8
|
-
/**
 * Thrown by {@link uploadLandingShot} when the backend answers the upload with
 * HTTP 404, i.e. the landing-shot endpoint does not exist.
 */
export declare class LandingShotEndpointMissingError extends Error {
    /** Takes no arguments; the message is fixed. */
    constructor();
}
|
|
11
|
-
/**
 * Uploads a landing-page screenshot for a CRM run.
 *
 * @param opts - Run id, source URL, PNG bytes, API base URL and run token.
 * @returns The signed URL of the stored image and its lifetime in seconds.
 * @throws LandingShotEndpointMissingError when the endpoint answers 404.
 * @throws Error for any other non-OK status or an invalid JSON response.
 */
export declare function uploadLandingShot(opts: UploadLandingShotOptions): Promise<{
    signedUrl: string;
    expiresInSec: number;
}>;
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
// TODO(parent): backend endpoint POST /api/cli/crm/runs/[runId]/landing-shot is referenced here but may not exist yet — see comment.
|
|
2
|
-
import { logger } from '../logger.js';
|
|
3
|
-
/**
 * Signals that the backend has no landing-shot upload endpoint (HTTP 404).
 * The message is always `LANDING_SHOT_ENDPOINT_MISSING`.
 */
export class LandingShotEndpointMissingError extends Error {
    constructor() {
        const code = 'LANDING_SHOT_ENDPOINT_MISSING';
        super(code);
        // Set explicitly so logs and `name` checks show the specific error type.
        this.name = 'LandingShotEndpointMissingError';
    }
}
|
|
9
|
-
/**
 * Uploads a landing-page screenshot for a CRM run.
 *
 * Sends a multipart POST to `<apiBaseUrl>/api/cli/crm/runs/<runId>/landing-shot`
 * with the fields `source_url` and `file`, authenticated with the run token.
 * The request is aborted after 30 seconds.
 *
 * @param opts Options with `runId`, `sourceUrl`, `pngBuffer`, `apiBaseUrl` and `runToken`.
 * @returns `{ signedUrl, expiresInSec }` as reported by the backend.
 * @throws {LandingShotEndpointMissingError} When the endpoint answers 404.
 * @throws {Error} For any other non-OK status, or when the response is not the expected JSON.
 */
export async function uploadLandingShot(opts) {
    const form = new FormData();
    form.set('source_url', opts.sourceUrl);
    // Copy into a fresh, exactly-sized array so that `bytes.buffer` holds only the PNG bytes.
    const bytes = new Uint8Array(opts.pngBuffer.byteLength);
    bytes.set(opts.pngBuffer);
    form.set('file', new Blob([bytes.buffer], { type: 'image/png' }), 'landing.png');
    const baseUrl = opts.apiBaseUrl.replace(/\/+$/, '');
    // Encode the id so characters such as "/", "?" or "#" cannot change the request path.
    const runSegment = encodeURIComponent(opts.runId);
    const response = await fetch(`${baseUrl}/api/cli/crm/runs/${runSegment}/landing-shot`, {
        method: 'POST',
        headers: {
            Authorization: `Bearer ${opts.runToken}`,
        },
        body: form,
        signal: AbortSignal.timeout(30_000),
    });
    if (response.status === 404) {
        logger.warn('[crm-upload] Landing-shot endpoint missing; continuing without vision');
        throw new LandingShotEndpointMissingError();
    }
    if (!response.ok) {
        // Include a bounded excerpt of the body to make the failure diagnosable.
        const body = await response.text().catch(() => response.statusText);
        throw new Error(`landing-shot upload failed: HTTP ${response.status} ${body.slice(0, 300)}`);
    }
    const json = await response.json().catch(() => null);
    if (!json || typeof json.signedUrl !== 'string' || typeof json.expiresInSec !== 'number') {
        throw new Error('landing-shot upload returned invalid JSON');
    }
    return {
        signedUrl: json.signedUrl,
        expiresInSec: json.expiresInSec,
    };
}
|
|
40
|
-
//# sourceMappingURL=storage-upload.js.map
|