autokap 1.4.3 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-contract.d.ts +73 -0
- package/dist/cli-contract.js +1 -0
- package/dist/cli-doctor.d.ts +4 -0
- package/dist/cli-doctor.js +302 -0
- package/dist/cli-runner.js +315 -2
- package/dist/cli.js +122 -66
- package/dist/types.d.ts +1 -1
- package/dist/version-check.d.ts +4 -0
- package/dist/version-check.js +102 -0
- package/package.json +2 -3
- package/dist/crm/email-fallback.d.ts +0 -16
- package/dist/crm/email-fallback.js +0 -217
- package/dist/crm/run-campaign.d.ts +0 -28
- package/dist/crm/run-campaign.js +0 -405
- package/dist/crm/scrape-betalist.d.ts +0 -20
- package/dist/crm/scrape-betalist.js +0 -194
- package/dist/crm/scrape-landing.d.ts +0 -24
- package/dist/crm/scrape-landing.js +0 -240
- package/dist/crm/storage-upload.d.ts +0 -14
- package/dist/crm/storage-upload.js +0 -40
|
@@ -1,240 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import os from 'node:os';
|
|
3
|
-
import path from 'node:path';
|
|
4
|
-
// Case-insensitive, unanchored pattern for video hosting names (YouTube, youtu.be,
// Vimeo, Loom, Wistia). hasVideo() tests it against iframe `src` attribute values.
const VIDEO_HOST_RE = /(?:youtube\.com|youtu\.be|vimeo\.com|loom\.com|wistia)/i;
|
|
5
|
-
// Case-insensitive, word-bounded phrases (mobile-only, hardware, extension-only, ...).
// detectWebApp() returns false as soon as the page's visible text matches any of them.
const NOT_WEB_APP_RE = /\b(iOS only|App Store only|iPhone only|iPad only|Android only|Google Play only|hardware|physical product|chrome extension only|firefox addon only)\b/i;
|
|
6
|
-
/**
 * Gathers landing-page signals for a product URL: HTML-derived signals plus a
 * full-page screenshot.
 *
 * Reads `opts.productUrl`, `opts.userAgent`, `opts.logger` and (through
 * captureLandingScreenshot) `opts.browser`.
 *
 * @param {object} opts - Scrape options.
 * @returns {Promise<object>} Object with `htmlSignals`, `screenshotPath`,
 *   `isWebApp`, `rawText` and, on failure paths, an `error` string
 *   (`invalid_product_url`, `robots_disallowed` or `screenshot_failed: <message>`).
 */
export async function scrapeLanding(opts) {
    // Both early exits return the same empty shape; only the error code differs.
    const emptyResult = (error) => ({
        htmlSignals: {},
        screenshotPath: null,
        isWebApp: true,
        rawText: '',
        error,
    });

    const url = normalizeHttpUrl(opts.productUrl);
    if (!url) {
        return emptyResult('invalid_product_url');
    }

    const permitted = await isAllowedByRobots(url, opts.userAgent, opts.logger);
    if (!permitted) {
        return emptyResult('robots_disallowed');
    }

    const htmlResult = await scrapeHtmlSignals(url, opts.userAgent, opts.logger);

    // The orchestrator launches Chromium and applies Linux-only container flags.
    let screenshotPath;
    try {
        screenshotPath = await captureLandingScreenshot(url, opts);
    } catch (error) {
        // A failed screenshot is not fatal: keep the HTML signals and report the cause.
        opts.logger.warn(`[crm-landing] Screenshot failed for ${url}: ${error.message}`);
        return {
            ...htmlResult,
            screenshotPath: null,
            error: `screenshot_failed: ${error.message}`,
        };
    }
    return { ...htmlResult, screenshotPath };
}
|
|
42
|
-
/**
 * Fetches `/robots.txt` for the URL's origin and reports whether crawling is
 * permitted. Fails open: an unbuildable robots URL, a non-2xx response, or a
 * network/timeout error all yield `true`.
 *
 * @param {string} url - Page URL whose origin is checked.
 * @param {string} userAgent - Value sent in the `User-Agent` request header.
 * @param {{ warn: Function }} logger - Receives a warning when the fetch throws.
 * @returns {Promise<boolean>} `false` only when robots.txt disallows everything.
 */
async function isAllowedByRobots(url, userAgent, logger) {
    let robotsUrl;
    try {
        robotsUrl = new URL('/robots.txt', url).toString();
    } catch {
        return true;
    }

    const requestInit = {
        headers: {
            Accept: 'text/plain,text/*,*/*',
            'User-Agent': userAgent,
        },
        signal: AbortSignal.timeout(15_000),
    };

    try {
        const response = await fetch(robotsUrl, requestInit);
        if (response.ok) {
            const body = await response.text();
            return !robotsDisallowsAll(body);
        }
        return true;
    } catch (error) {
        logger.warn(`[crm-landing] robots.txt fetch failed open for ${robotsUrl}: ${error.message}`);
        return true;
    }
}
|
|
68
|
-
/**
 * Decides whether a robots.txt body blocks the whole site (`Disallow: /`) for
 * this crawler.
 *
 * Groups naming `autokap-crawler` take precedence; the `*` groups are used only
 * when no group names the crawler. Rules from every matching group are combined.
 *
 * @param {string} text - Raw robots.txt content.
 * @returns {boolean} `true` when a matching group contains a bare `Disallow: /`.
 */
function robotsDisallowsAll(text) {
    const groups = [];
    let current = null;
    // True once the current group has seen an allow/disallow rule. The next
    // user-agent line then starts a new group instead of extending this one.
    let sawRule = false;

    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.replace(/#.*/, '').trim();
        if (!line) {
            continue;
        }
        const colon = line.indexOf(':');
        if (colon === -1) {
            continue;
        }
        const key = line.slice(0, colon).trim().toLowerCase();
        const value = line.slice(colon + 1).trim();

        if (key === 'user-agent') {
            if (!current || sawRule) {
                current = { agents: [], disallows: [] };
                groups.push(current);
                sawRule = false;
            }
            current.agents.push(value.toLowerCase());
        } else if ((key === 'allow' || key === 'disallow') && current) {
            // An Allow line closes the user-agent run too; otherwise an allow-only
            // group would absorb the agents and rules of the group that follows it.
            sawRule = true;
            if (key === 'disallow') {
                current.disallows.push(value);
            }
        }
    }

    const groupsFor = (agent) => groups.filter((group) => group.agents.includes(agent));
    const specific = groupsFor('autokap-crawler');
    const matched = specific.length > 0 ? specific : groupsFor('*');
    return matched.some((group) => group.disallows.some((rule) => rule === '/'));
}
|
|
96
|
-
/**
 * Downloads the page HTML and derives lightweight signals from it.
 *
 * Never throws: a non-2xx response or any error is logged as a warning and
 * produces the empty result `{ htmlSignals: {}, isWebApp: true, rawText: '' }`.
 *
 * @param {string} url - Page to fetch.
 * @param {string} userAgent - Value sent in the `User-Agent` request header.
 * @param {{ warn: Function }} logger - Receives warnings on failure.
 * @returns {Promise<{ htmlSignals: object, isWebApp: boolean, rawText: string }>}
 */
async function scrapeHtmlSignals(url, userAgent, logger) {
    const emptyResult = () => ({ htmlSignals: {}, isWebApp: true, rawText: '' });

    try {
        const requestInit = {
            headers: {
                Accept: 'text/html',
                'User-Agent': userAgent,
            },
            signal: AbortSignal.timeout(15_000),
        };
        const response = await fetch(url, requestInit);
        if (!response.ok) {
            logger.warn(`[crm-landing] HTML fetch returned HTTP ${response.status} for ${url}`);
            return emptyResult();
        }

        const markup = await response.text();
        const cheerio = await loadCheerio();
        const $ = cheerio.load(markup);

        // Remove non-visible content so the extracted text reflects what a reader sees.
        $('script, style, noscript, svg').remove();
        const collapsedText = $('body').text().replace(/\s+/g, ' ').trim();
        const rawText = collapsedText.slice(0, 8000);

        const htmlSignals = {
            n_screenshots_html: countContentImages($),
            has_video_html: hasVideo($),
            has_docs_html: hasDocs($),
            has_dashboard_html: hasDashboard(rawText),
        };
        const isWebApp = detectWebApp($, rawText);
        return { htmlSignals, isWebApp, rawText };
    } catch (error) {
        logger.warn(`[crm-landing] HTML fetch failed for ${url}: ${error.message}`);
        return emptyResult();
    }
}
|
|
131
|
-
/**
 * Loads the `cheerio` module on demand.
 *
 * The import is issued from a `new Function` body rather than a literal
 * `import()` expression.
 * NOTE(review): presumably this keeps build tooling from rewriting or resolving
 * the import statically — confirm before simplifying.
 *
 * @returns {Promise<object>} The cheerio module namespace.
 */
async function loadCheerio() {
    const dynamicImport = new Function('specifier', 'return import(specifier)');
    const cheerioModule = dynamicImport('cheerio');
    return cheerioModule;
}
|
|
135
|
-
/**
 * Counts `<img src>` elements that look like real content rather than logos or
 * icons, capped at 50.
 *
 * An image is skipped when its class mentions "logo"/"icon", its path mentions
 * "logo"/"favicon", or its declared width or height is 64 or less.
 *
 * @param {Function} $ - Cheerio root function for the loaded document.
 * @returns {number} Number of content images found, at most 50.
 */
function countContentImages($) {
    const MAX_COUNTED = 50;
    let count = 0;

    for (const img of $('img[src]').toArray()) {
        // Stop scanning once the cap is reached; the previous `.each()` callback
        // kept visiting every remaining <img> after the count could no longer change.
        if (count >= MAX_COUNTED) {
            break;
        }
        const node = $(img);
        const src = node.attr('src') ?? '';
        const className = node.attr('class') ?? '';
        const width = parseDimension(node.attr('width'));
        const height = parseDimension(node.attr('height'));

        // Resolve against a placeholder base so relative sources yield a pathname
        // and query strings do not take part in the logo/favicon check.
        let pathname = '';
        try {
            pathname = new URL(src, 'https://example.test/').pathname;
        } catch {
            pathname = src;
        }

        if (/logo|icon/i.test(className)) {
            continue;
        }
        if (/logo|favicon/i.test(pathname)) {
            continue;
        }
        if ((width !== null && width <= 64) || (height !== null && height <= 64)) {
            continue;
        }
        count += 1;
    }
    return count;
}
|
|
162
|
-
/**
 * Parses an HTML `width`/`height` attribute into whole pixels.
 *
 * Only plain non-negative numbers, optionally suffixed with `px`, are accepted.
 * Percentages and other units return `null`: reading `"50%"` as 50 pixels made
 * countContentImages() discard fluid-width images as if they were tiny icons.
 *
 * @param {string | undefined | null} value - Raw attribute value.
 * @returns {number | null} Integer pixel size, or `null` when absent or not a pixel value.
 */
function parseDimension(value) {
    if (!value) {
        return null;
    }
    const match = /^\s*(\d+)(?:\.\d+)?\s*(?:px)?\s*$/i.exec(String(value));
    if (!match) {
        return null;
    }
    const parsed = Number.parseInt(match[1], 10);
    return Number.isFinite(parsed) ? parsed : null;
}
|
|
168
|
-
/**
 * Reports whether the document embeds a video: either a `<video>` element or an
 * `<iframe>` whose `src` matches VIDEO_HOST_RE.
 *
 * @param {Function} $ - Cheerio root function for the loaded document.
 * @returns {boolean}
 */
function hasVideo($) {
    const nativeVideoCount = $('video').length;
    if (nativeVideoCount > 0) {
        return true;
    }
    for (const frame of $('iframe[src]').toArray()) {
        const source = $(frame).attr('src') ?? '';
        if (VIDEO_HOST_RE.test(source)) {
            return true;
        }
    }
    return false;
}
|
|
173
|
-
/**
 * Reports whether any link points at documentation: the href contains `/docs`
 * or `/documentation`, or the link text is exactly "docs" or "documentation"
 * (case-insensitive, whitespace-normalised).
 *
 * @param {Function} $ - Cheerio root function for the loaded document.
 * @returns {boolean}
 */
function hasDocs($) {
    for (const anchor of $('a[href]').toArray()) {
        const link = $(anchor);
        const target = (link.attr('href') ?? '').toLowerCase();
        const label = link.text().replace(/\s+/g, ' ').trim().toLowerCase();

        const hrefLooksLikeDocs = target.includes('/docs') || target.includes('/documentation');
        const labelLooksLikeDocs = label === 'docs' || label === 'documentation';
        if (hrefLooksLikeDocs || labelLooksLikeDocs) {
            return true;
        }
    }
    return false;
}
|
|
180
|
-
/**
 * Heuristic for "this product has a logged-in dashboard".
 *
 * True when the text contains the word "dashboard", or when it contains the
 * word "app" together with a sign-in phrase (sign in / log in / login / signup).
 *
 * @param {string} text - Visible page text.
 * @returns {boolean}
 */
function hasDashboard(text) {
    const mentionsDashboard = /\bdashboard\b/i.test(text);
    if (mentionsDashboard) {
        return true;
    }
    const mentionsApp = /\bapp\b/i.test(text);
    return mentionsApp && /\b(sign in|log in|login|signup)\b/i.test(text);
}
|
|
187
|
-
/**
 * Heuristic for "this product is usable as a web app".
 *
 * Returns false when the page text matches NOT_WEB_APP_RE, or when there is at
 * least one download-style link and the number of such links equals the number
 * of links mentioning an app store. Otherwise returns true.
 *
 * @param {Function} $ - Cheerio root function for the loaded document.
 * @param {string} rawText - Visible page text.
 * @returns {boolean}
 */
function detectWebApp($, rawText) {
    if (NOT_WEB_APP_RE.test(rawText)) {
        return false;
    }

    let storeLinkCount = 0; // text or href mentions "app store" / "google play"
    let downloadLinkCount = 0; // non-empty text mentions download/get/install/store
    for (const anchor of $('a[href]').toArray()) {
        const link = $(anchor);
        const label = link.text().replace(/\s+/g, ' ').trim();
        const href = link.attr('href') ?? '';

        if (/app store|google play/i.test(`${label} ${href}`)) {
            storeLinkCount += 1;
        }
        if (label && /download|get|install|app store|google play/i.test(label)) {
            downloadLinkCount += 1;
        }
    }

    return downloadLinkCount === 0 || downloadLinkCount !== storeLinkCount;
}
|
|
203
|
-
/**
 * Opens the URL in a fresh browser context and writes a full-page PNG under
 * `<os tmpdir>/autokap-crm`.
 *
 * Navigation first waits for `networkidle`; if that fails, it is retried once
 * waiting only for `load`. The context is always closed, even on failure.
 *
 * @param {string} url - Page to capture.
 * @param {object} opts - Uses `opts.browser`, `opts.userAgent` and `opts.logger`.
 * @returns {Promise<string>} Absolute path of the written screenshot.
 */
async function captureLandingScreenshot(url, opts) {
    const context = await opts.browser.newContext({
        userAgent: opts.userAgent,
        viewport: { width: 1280, height: 800 },
    });
    try {
        const page = await context.newPage();
        try {
            await page.goto(url, { waitUntil: 'networkidle', timeout: 30_000 });
        } catch (navigationError) {
            opts.logger.warn(`[crm-landing] networkidle goto failed for ${url}, retrying load: ${navigationError.message}`);
            await page.goto(url, { waitUntil: 'load', timeout: 30_000 });
        }

        const outputDir = path.join(os.tmpdir(), 'autokap-crm');
        await fs.mkdir(outputDir, { recursive: true });

        // Build a filesystem-safe name from the hostname; fall back to "landing".
        const { hostname } = new URL(url);
        const sanitizedHost = hostname.replace(/[^a-z0-9.-]+/gi, '-').replace(/^-+|-+$/g, '');
        const slug = sanitizedHost || 'landing';
        const fileName = `landing-${slug}-${Date.now()}.png`;
        const screenshotPath = path.join(outputDir, fileName);

        await page.screenshot({ path: screenshotPath, fullPage: true, type: 'png' });
        return screenshotPath;
    } finally {
        await context.close();
    }
}
|
|
229
|
-
/**
 * Validates that a value is an absolute http(s) URL and returns its normalised
 * string form.
 *
 * @param {string} value - Candidate URL.
 * @returns {string | null} Normalised URL, or `null` when it cannot be parsed or
 *   uses a protocol other than `http:` / `https:`.
 */
function normalizeHttpUrl(value) {
    let parsed;
    try {
        parsed = new URL(value);
    } catch {
        return null;
    }
    const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    return isHttp ? parsed.toString() : null;
}
|
|
240
|
-
//# sourceMappingURL=scrape-landing.js.map
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/** Inputs for {@link uploadLandingShot}. */
export interface UploadLandingShotOptions {
    /** Run identifier; becomes a segment of the upload endpoint path. */
    runId: string;
    /** URL of the captured page; sent as the `source_url` form field. */
    sourceUrl: string;
    /** PNG bytes; sent as the `file` form field with type `image/png`. */
    pngBuffer: Buffer;
    /** API origin; trailing slashes are stripped before the path is appended. */
    apiBaseUrl: string;
    /** Token sent as `Authorization: Bearer <runToken>`. */
    runToken: string;
}
|
|
8
|
-
/** Thrown by `uploadLandingShot` when the landing-shot endpoint answers HTTP 404. */
export declare class LandingShotEndpointMissingError extends Error {
    /** Takes no arguments; the message is fixed by the implementation. */
    constructor();
}
|
|
11
|
-
/**
 * Uploads a landing-page screenshot for a run.
 *
 * @param opts - Upload inputs; see {@link UploadLandingShotOptions}.
 * @returns The signed URL of the stored image and its lifetime in seconds.
 */
export declare function uploadLandingShot(
    opts: UploadLandingShotOptions,
): Promise<{ signedUrl: string; expiresInSec: number }>;
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
// TODO(parent): backend endpoint POST /api/cli/crm/runs/[runId]/landing-shot is referenced here but may not exist yet — see comment.
|
|
2
|
-
import { logger } from '../logger.js';
|
|
3
|
-
/**
 * Signals that the landing-shot upload endpoint does not exist on the backend
 * (uploadLandingShot throws it on HTTP 404). The message is always
 * `LANDING_SHOT_ENDPOINT_MISSING`.
 */
export class LandingShotEndpointMissingError extends Error {
    // Own property, set right after super() returns, so it overrides the
    // inherited Error name on each instance.
    name = 'LandingShotEndpointMissingError';

    constructor() {
        super('LANDING_SHOT_ENDPOINT_MISSING');
    }
}
|
|
9
|
-
/**
 * POSTs a landing-page screenshot as multipart form data to
 * `<apiBaseUrl>/api/cli/crm/runs/<runId>/landing-shot`.
 *
 * @param {object} opts - `runId`, `sourceUrl`, `pngBuffer`, `apiBaseUrl`, `runToken`.
 * @returns {Promise<{ signedUrl: string, expiresInSec: number }>} Values taken
 *   from the JSON response.
 * @throws {LandingShotEndpointMissingError} When the endpoint responds with 404.
 * @throws {Error} On any other non-2xx status, or when the response JSON lacks a
 *   string `signedUrl` or a numeric `expiresInSec`.
 */
export async function uploadLandingShot(opts) {
    // Copy the PNG into a fresh, exactly-sized ArrayBuffer before wrapping it in a Blob.
    const pngCopy = new Uint8Array(opts.pngBuffer.byteLength);
    pngCopy.set(opts.pngBuffer);

    const form = new FormData();
    form.set('source_url', opts.sourceUrl);
    form.set('file', new Blob([pngCopy.buffer], { type: 'image/png' }), 'landing.png');

    const baseUrl = opts.apiBaseUrl.replace(/\/+$/, '');
    const endpoint = `${baseUrl}/api/cli/crm/runs/${opts.runId}/landing-shot`;
    const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
            Authorization: `Bearer ${opts.runToken}`,
        },
        body: form,
        signal: AbortSignal.timeout(30_000),
    });

    // 404 gets its own error type so callers can tell it apart from other failures.
    if (response.status === 404) {
        logger.warn('[crm-upload] Landing-shot endpoint missing; continuing without vision');
        throw new LandingShotEndpointMissingError();
    }
    if (!response.ok) {
        const detail = await response.text().catch(() => response.statusText);
        throw new Error(`landing-shot upload failed: HTTP ${response.status} ${detail.slice(0, 300)}`);
    }

    const payload = await response.json().catch(() => null);
    const isValid = Boolean(payload)
        && typeof payload.signedUrl === 'string'
        && typeof payload.expiresInSec === 'number';
    if (!isValid) {
        throw new Error('landing-shot upload returned invalid JSON');
    }
    const { signedUrl, expiresInSec } = payload;
    return { signedUrl, expiresInSec };
}
|
|
40
|
-
//# sourceMappingURL=storage-upload.js.map
|