autokap 1.4.3 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-contract.d.ts +79 -0
- package/dist/cli-contract.js +1 -0
- package/dist/cli-doctor.d.ts +4 -0
- package/dist/cli-doctor.js +302 -0
- package/dist/cli-runner.js +318 -2
- package/dist/cli.js +122 -66
- package/dist/execution-types.d.ts +19 -2
- package/dist/mouse-animation.d.ts +6 -0
- package/dist/mouse-animation.js +8 -0
- package/dist/opcode-actions.d.ts +6 -0
- package/dist/opcode-actions.js +7 -3
- package/dist/opcode-runner.js +4 -0
- package/dist/types.d.ts +1 -1
- package/dist/version-check.d.ts +4 -0
- package/dist/version-check.js +102 -0
- package/dist/web-playwright-local.d.ts +6 -2
- package/dist/web-playwright-local.js +7 -7
- package/package.json +2 -3
- package/dist/crm/email-fallback.d.ts +0 -16
- package/dist/crm/email-fallback.js +0 -217
- package/dist/crm/run-campaign.d.ts +0 -28
- package/dist/crm/run-campaign.js +0 -405
- package/dist/crm/scrape-betalist.d.ts +0 -20
- package/dist/crm/scrape-betalist.js +0 -194
- package/dist/crm/scrape-landing.d.ts +0 -24
- package/dist/crm/scrape-landing.js +0 -240
- package/dist/crm/storage-upload.d.ts +0 -14
- package/dist/crm/storage-upload.js +0 -40
package/dist/opcode-actions.js
CHANGED
|
@@ -81,8 +81,12 @@ export async function executeOpcodeCoreAction(opcode, adapter, context = {}) {
|
|
|
81
81
|
? opcode.textByLocale[context.currentVariant.locale] ?? opcode.text
|
|
82
82
|
: opcode.text);
|
|
83
83
|
const text = substituteCredentialPlaceholders(rawText, context.credentials);
|
|
84
|
+
const keystrokeTimestampsMs = [];
|
|
85
|
+
const onKeystroke = (timestampMs) => {
|
|
86
|
+
keystrokeTimestampsMs.push(timestampMs);
|
|
87
|
+
};
|
|
84
88
|
try {
|
|
85
|
-
await adapter.type(opcode.selector, text, opcode.clearFirst);
|
|
89
|
+
await adapter.type(opcode.selector, text, opcode.clearFirst, { onKeystroke });
|
|
86
90
|
}
|
|
87
91
|
catch (error) {
|
|
88
92
|
if (!opcode.target || !adapter.typeByTarget)
|
|
@@ -91,9 +95,9 @@ export async function executeOpcodeCoreAction(opcode, adapter, context = {}) {
|
|
|
91
95
|
selector: opcode.selector,
|
|
92
96
|
target: opcode.target,
|
|
93
97
|
selectorAlternates: opcode.selectorAlternates,
|
|
94
|
-
}, text, opcode.clearFirst);
|
|
98
|
+
}, text, opcode.clearFirst, { onKeystroke });
|
|
95
99
|
}
|
|
96
|
-
|
|
100
|
+
return { success: true, keystrokeTimestampsMs };
|
|
97
101
|
}
|
|
98
102
|
case 'PRESS_KEY':
|
|
99
103
|
await adapter.pressKey(opcode.key);
|
package/dist/opcode-runner.js
CHANGED
|
@@ -287,6 +287,9 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
|
|
|
287
287
|
const result = await withTimeout(() => executeOpcodeAction(opcode, index, adapter, artifacts, telemetry, currentVariant, executionState, artifactPlan, mockDataGroups, options, credentials), actionBudgetMs);
|
|
288
288
|
logger.debug(`[opcode ${index}] action exec end — took ${Date.now() - actionStart}ms, success=${result.success}${result.error ? `, error=${result.error}` : ''}`);
|
|
289
289
|
if (preTiming) {
|
|
290
|
+
const keystrokeOffsetsMs = result.keystrokeTimestampsMs && result.keystrokeTimestampsMs.length > 0
|
|
291
|
+
? result.keystrokeTimestampsMs.map((t) => Math.max(0, t - preTiming.clipStartedAt))
|
|
292
|
+
: undefined;
|
|
290
293
|
opcodeTimings.push({
|
|
291
294
|
stepIndex: index,
|
|
292
295
|
stepId: opcode.stepId,
|
|
@@ -296,6 +299,7 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
|
|
|
296
299
|
timecodeStartMs: preTiming.timecodeStartMs,
|
|
297
300
|
timecodeEndMs: Math.max(0, Date.now() - preTiming.clipStartedAt),
|
|
298
301
|
bbox: preTiming.bbox,
|
|
302
|
+
...(keystrokeOffsetsMs ? { keystrokeOffsetsMs } : {}),
|
|
299
303
|
});
|
|
300
304
|
}
|
|
301
305
|
if (!result.success) {
|
package/dist/types.d.ts
CHANGED
|
@@ -583,7 +583,7 @@ export interface ClipOptions {
|
|
|
583
583
|
/** Usage metadata from a single OpenRouter API call */
|
|
584
584
|
export interface StepUsage {
|
|
585
585
|
stepNumber: number;
|
|
586
|
-
stepType: 'agent_iteration' | 'verification' | 'element_capture' | 'video_planning' | 'video_variant_classification' | 'video_step_verification' | 'video_step_fix' | 'assistant_chat' | 'studio_creation' | 'studio_iteration' | 'studio_capture_suggestion' | 'mock_data_generation' | 'page_identity_classification' | 'capture_verification' | 'alt_text_generation' | 'healer_invocation' | 'cron_feedback_classification' | 'tts_generation'
|
|
586
|
+
stepType: 'agent_iteration' | 'verification' | 'element_capture' | 'video_planning' | 'video_variant_classification' | 'video_step_verification' | 'video_step_fix' | 'assistant_chat' | 'studio_creation' | 'studio_iteration' | 'studio_capture_suggestion' | 'mock_data_generation' | 'page_identity_classification' | 'capture_verification' | 'alt_text_generation' | 'healer_invocation' | 'cron_feedback_classification' | 'tts_generation';
|
|
587
587
|
generationId: string | null;
|
|
588
588
|
modelRequested: string;
|
|
589
589
|
modelUsed: string | null;
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export declare function fetchLatestVersionFromRegistry(): Promise<string | null>;
|
|
2
|
+
export declare function isNewerVersion(latest: string, current: string): boolean;
|
|
3
|
+
export declare function getCachedOrFetchLatest(): Promise<string | null>;
|
|
4
|
+
export declare function displayNewVersionNoticeIfAvailable(currentVersion: string): Promise<void>;
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { logger } from './logger.js';
|
|
4
|
+
import { getConfigDir } from './cli-config.js';
|
|
5
|
+
const NPM_REGISTRY_URL = 'https://registry.npmjs.org/autokap';
|
|
6
|
+
const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
|
|
7
|
+
const FETCH_TIMEOUT_MS = 2500;
|
|
8
|
+
const TOTAL_BUDGET_MS = 1000;
|
|
9
|
+
/**
 * Location of the on-disk version-check cache: a `version-check.json`
 * file inside the CLI configuration directory.
 */
function getCachePath() {
    const configDir = getConfigDir();
    return path.join(configDir, 'version-check.json');
}
|
|
12
|
+
/**
 * Load the cached version-check entry from disk.
 *
 * Resolves to `{ latestVersion, checkedAt }` when the cache file exists,
 * parses as JSON and carries a string `latestVersion` plus a numeric
 * `checkedAt`. Any read error, parse error or malformed shape resolves
 * to `null`; this function never rejects.
 */
async function readCache() {
    try {
        const contents = await fs.readFile(getCachePath(), 'utf-8');
        // Destructuring a JSON `null` throws and lands in the catch below,
        // which is the same outcome as any other unreadable cache.
        const { latestVersion, checkedAt } = JSON.parse(contents);
        const wellFormed = typeof latestVersion === 'string' && typeof checkedAt === 'number';
        return wellFormed ? { latestVersion, checkedAt } : null;
    }
    catch {
        return null;
    }
}
|
|
25
|
+
/**
 * Persist a version-check entry as pretty-printed JSON, creating the
 * configuration directory first if it does not exist yet.
 *
 * Best-effort: every failure is swallowed, because a missing cache only
 * means the next run has nothing cached to read.
 *
 * @param entry Object to serialise into the cache file.
 */
async function writeCache(entry) {
    try {
        const configDir = getConfigDir();
        await fs.mkdir(configDir, { recursive: true });
        const target = getCachePath();
        const payload = JSON.stringify(entry, null, 2);
        await fs.writeFile(target, payload, 'utf-8');
    }
    catch {
        // Cache write failure is non-fatal; we just won't have a cache next time
    }
}
|
|
34
|
+
/**
 * Ask the npm registry which version the `latest` dist-tag points to.
 *
 * The request is aborted after FETCH_TIMEOUT_MS. Any failure (network
 * error, abort, non-2xx status, unparseable body, missing or non-string
 * tag) resolves to `null`; this function never rejects.
 *
 * @returns The `latest` version string, or `null` when it cannot be determined.
 */
export async function fetchLatestVersionFromRegistry() {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
    try {
        const res = await fetch(NPM_REGISTRY_URL, {
            signal: controller.signal,
            // Ask for the abbreviated packument: it still carries `dist-tags`
            // but is far smaller than the full document, which matters when
            // the whole check runs under a tight time budget.
            headers: {
                Accept: 'application/vnd.npm.install-v1+json; q=1.0, application/json; q=0.8, */*',
            },
        });
        if (!res.ok)
            return null;
        const data = await res.json();
        const latest = data?.['dist-tags']?.latest;
        // Only hand back a usable string so callers can rely on the
        // declared `string | null` contract.
        return typeof latest === 'string' && latest.length > 0 ? latest : null;
    }
    catch {
        return null;
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
51
|
+
/**
 * Parse the `major.minor.patch` core of a version string.
 *
 * Build metadata (`+...`) and pre-release tags (`-...`) are dropped before
 * parsing. Each component must consist of decimal digits only, so inputs
 * such as `1..3`, `1e1.0.0` or `0x1.0.0` are rejected instead of being
 * coerced by `Number()`.
 *
 * @param version Version string to parse.
 * @returns `[major, minor, patch]`, or `null` when the input is not a
 *          well-formed version.
 */
function parseSemver(version) {
    if (typeof version !== 'string')
        return null;
    // Strip build metadata first: it may itself contain '-' characters.
    const core = version.trim().split('+', 1)[0].split('-', 1)[0];
    const match = /^(\d+)\.(\d+)\.(\d+)$/.exec(core);
    if (!match)
        return null;
    const parts = [Number(match[1]), Number(match[2]), Number(match[3])];
    if (parts.some(n => !Number.isSafeInteger(n)))
        return null;
    return parts;
}
/**
 * Report whether `latest` is a strictly higher version than `current`.
 *
 * Only the numeric `major.minor.patch` triple is compared; pre-release
 * tags and build metadata are ignored. If either argument cannot be
 * parsed the answer is `false`.
 *
 * @param latest  Candidate newer version.
 * @param current Version to compare against.
 * @returns `true` only when `latest` is strictly greater than `current`.
 */
export function isNewerVersion(latest, current) {
    const a = parseSemver(latest);
    const b = parseSemver(current);
    if (!a || !b)
        return false;
    for (let i = 0; i < 3; i++) {
        if (a[i] !== b[i])
            return a[i] > b[i];
    }
    return false;
}
|
|
69
|
+
/**
 * Report whether a version string carries a pre-release tag
 * (for example `1.5.2-beta.1`).
 *
 * Build metadata is removed before looking for the hyphen, because build
 * identifiers may themselves contain `-` (`1.5.2+build-7` is a normal
 * release, not a pre-release).
 *
 * @param version Version string to inspect.
 * @returns `true` when the version has a pre-release component.
 */
function isPreRelease(version) {
    return version.split('+', 1)[0].includes('-');
}
|
|
72
|
+
/**
 * Resolve the latest published version, preferring the on-disk cache.
 *
 * 1. A cache entry younger than CACHE_TTL_MS is returned as-is.
 * 2. Otherwise the registry is queried; a successful answer is written
 *    back to the cache and returned.
 * 3. If the registry query fails, the stale cached value (if any) is
 *    returned, else `null`.
 *
 * @returns The latest version string, or `null` when none is known.
 */
export async function getCachedOrFetchLatest() {
    const cache = await readCache();
    if (cache) {
        const ageMs = Date.now() - cache.checkedAt;
        // A negative age means `checkedAt` lies in the future (clock change
        // or hand-edited file). Treat it as stale; otherwise the entry would
        // satisfy `< CACHE_TTL_MS` forever and never be refreshed.
        if (ageMs >= 0 && ageMs < CACHE_TTL_MS) {
            return cache.latestVersion;
        }
    }
    const latest = await fetchLatestVersionFromRegistry();
    if (latest) {
        await writeCache({ latestVersion: latest, checkedAt: Date.now() });
        return latest;
    }
    return cache?.latestVersion ?? null;
}
|
|
84
|
+
/**
 * Log a one-line upgrade hint when a newer release than `currentVersion`
 * is available.
 *
 * Nothing is checked when the running version is a pre-release. The
 * lookup is raced against TOTAL_BUDGET_MS; if the budget elapses first,
 * no notice is shown. Errors raised during the lookup or logging are
 * swallowed so the check cannot break the CLI.
 *
 * @param currentVersion Version of the running CLI.
 */
export async function displayNewVersionNoticeIfAvailable(currentVersion) {
    if (isPreRelease(currentVersion))
        return;
    let budgetTimer;
    try {
        const budget = new Promise(resolve => {
            budgetTimer = setTimeout(() => resolve(null), TOTAL_BUDGET_MS);
        });
        const latest = await Promise.race([getCachedOrFetchLatest(), budget]);
        if (!latest)
            return;
        if (!isNewerVersion(latest, currentVersion))
            return;
        logger.info(`A new version of autokap (${latest}) is available, run npm install -g autokap@latest to update`);
    }
    catch {
        // Silent failure — the version check must never block or break the CLI
    }
    finally {
        // Without this the pending budget timer keeps the event loop alive
        // for the rest of TOTAL_BUDGET_MS even when the lookup answered
        // immediately from cache.
        clearTimeout(budgetTimer);
    }
}
|
|
102
|
+
//# sourceMappingURL=version-check.js.map
|
|
@@ -38,7 +38,9 @@ export declare class WebPlaywrightLocal implements RuntimeAdapter {
|
|
|
38
38
|
/**
|
|
39
39
|
* Type into an element using semantic target resolution.
|
|
40
40
|
*/
|
|
41
|
-
typeByTarget(opts: ResolveOptions, text: string, clearFirst?: boolean
|
|
41
|
+
typeByTarget(opts: ResolveOptions, text: string, clearFirst?: boolean, typeOpts?: {
|
|
42
|
+
onKeystroke?: (timestampMs: number) => void;
|
|
43
|
+
}): Promise<void>;
|
|
42
44
|
/**
|
|
43
45
|
* Wait for an element using semantic target resolution.
|
|
44
46
|
*/
|
|
@@ -47,7 +49,9 @@ export declare class WebPlaywrightLocal implements RuntimeAdapter {
|
|
|
47
49
|
* Scroll an element into view using semantic target resolution.
|
|
48
50
|
*/
|
|
49
51
|
scrollIntoViewByTarget(opts: ResolveOptions): Promise<void>;
|
|
50
|
-
type(selector: string, text: string, clearFirst?: boolean
|
|
52
|
+
type(selector: string, text: string, clearFirst?: boolean, opts?: {
|
|
53
|
+
onKeystroke?: (timestampMs: number) => void;
|
|
54
|
+
}): Promise<void>;
|
|
51
55
|
pressKey(key: string): Promise<void>;
|
|
52
56
|
scroll(direction: 'up' | 'down' | 'left' | 'right', amount?: number): Promise<void>;
|
|
53
57
|
scrollIntoView(selector: string): Promise<void>;
|
|
@@ -152,14 +152,14 @@ export class WebPlaywrightLocal {
|
|
|
152
152
|
/**
|
|
153
153
|
* Type into an element using semantic target resolution.
|
|
154
154
|
*/
|
|
155
|
-
async typeByTarget(opts, text, clearFirst = true) {
|
|
155
|
+
async typeByTarget(opts, text, clearFirst = true, typeOpts) {
|
|
156
156
|
const page = await this.browser.currentPage;
|
|
157
157
|
const resolved = await resolveTarget(page, opts);
|
|
158
158
|
if (!resolved) {
|
|
159
159
|
throw new Error(`cannot find target for typing: ${describeResolveOptions(opts)}`);
|
|
160
160
|
}
|
|
161
161
|
if (this.clipCursor) {
|
|
162
|
-
await this.typeIntoLocator(resolved.locator, text, clearFirst);
|
|
162
|
+
await this.typeIntoLocator(resolved.locator, text, clearFirst, typeOpts?.onKeystroke);
|
|
163
163
|
return;
|
|
164
164
|
}
|
|
165
165
|
if (clearFirst) {
|
|
@@ -196,10 +196,10 @@ export class WebPlaywrightLocal {
|
|
|
196
196
|
}
|
|
197
197
|
await resolved.locator.scrollIntoViewIfNeeded({ timeout: 5000 });
|
|
198
198
|
}
|
|
199
|
-
async type(selector, text, clearFirst = true) {
|
|
199
|
+
async type(selector, text, clearFirst = true, opts) {
|
|
200
200
|
if (this.clipCursor) {
|
|
201
201
|
const page = await this.browser.currentPage;
|
|
202
|
-
await this.typeIntoLocator(page.locator(selector).first(), text, clearFirst);
|
|
202
|
+
await this.typeIntoLocator(page.locator(selector).first(), text, clearFirst, opts?.onKeystroke);
|
|
203
203
|
return;
|
|
204
204
|
}
|
|
205
205
|
await this.browser.typeText(text, { selector, clearFirst });
|
|
@@ -878,7 +878,7 @@ export class WebPlaywrightLocal {
|
|
|
878
878
|
async close() {
|
|
879
879
|
await this.browser.close();
|
|
880
880
|
}
|
|
881
|
-
async typeIntoLocator(locator, text, clearFirst) {
|
|
881
|
+
async typeIntoLocator(locator, text, clearFirst, onKeystroke) {
|
|
882
882
|
const page = await this.browser.currentPage;
|
|
883
883
|
await locator.waitFor({ state: 'visible', timeout: 5000 });
|
|
884
884
|
await locator.scrollIntoViewIfNeeded({ timeout: 5000 }).catch(() => undefined);
|
|
@@ -895,8 +895,8 @@ export class WebPlaywrightLocal {
|
|
|
895
895
|
}
|
|
896
896
|
await page.waitForTimeout(70);
|
|
897
897
|
await humanType(page, text, this.clipCursor
|
|
898
|
-
? { minDelayMs: 20, maxDelayMs: 45 }
|
|
899
|
-
:
|
|
898
|
+
? { minDelayMs: 20, maxDelayMs: 45, onKeystroke }
|
|
899
|
+
: { onKeystroke });
|
|
900
900
|
}
|
|
901
901
|
async seedClipCursor(position) {
|
|
902
902
|
if (!this.clipCursor)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "autokap",
|
|
3
|
-
"version": "1.4.3",
|
|
3
|
+
"version": "1.5.2",
|
|
4
4
|
"description": "AI-powered CLI tool for capturing clean screenshots of websites",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -231,8 +231,7 @@
|
|
|
231
231
|
"satori": "^0.26.0",
|
|
232
232
|
"wawoff2": "^2.0.1",
|
|
233
233
|
"ws": "^8.20.0",
|
|
234
|
-
"zod": "^4.3.6",
|
|
235
|
-
"cheerio": "^1.1.2"
|
|
234
|
+
"zod": "^4.3.6"
|
|
236
235
|
},
|
|
237
236
|
"devDependencies": {
|
|
238
237
|
"@types/node": "^25.3.3",
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
export interface EmailFallbackOptions {
|
|
2
|
-
betaListLaunchUrl: string;
|
|
3
|
-
productUrl: string | null;
|
|
4
|
-
logger: {
|
|
5
|
-
info(msg: string): void;
|
|
6
|
-
warn(msg: string): void;
|
|
7
|
-
error(msg: string): void;
|
|
8
|
-
};
|
|
9
|
-
}
|
|
10
|
-
export declare function findEmail(opts: EmailFallbackOptions): Promise<{
|
|
11
|
-
email: string | null;
|
|
12
|
-
handle: string | null;
|
|
13
|
-
lang: string | null;
|
|
14
|
-
}>;
|
|
15
|
-
export declare function extractEmailsFromText(text: string): string[];
|
|
16
|
-
export declare function pickBestEmail(emails: string[], productHostname: string | null): string | null;
|
|
@@ -1,217 +0,0 @@
|
|
|
1
|
-
const CRAWLER_UA = 'AutoKap-Crawler/1.0 (+https://autokap.app/crawler)';
|
|
2
|
-
const EMAIL_RE = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
|
|
3
|
-
const ROLE_PREFIXES = ['founder', 'contact', 'hello', 'team', 'support', 'info'];
|
|
4
|
-
const FREE_MAIL_DOMAINS = new Set(['gmail.com', 'outlook.com', 'hotmail.com', 'yahoo.com', 'protonmail.com', 'icloud.com', 'proton.me']);
|
|
5
|
-
const domainQueues = new Map();
|
|
6
|
-
export async function findEmail(opts) {
|
|
7
|
-
const visited = [];
|
|
8
|
-
const emails = new Set();
|
|
9
|
-
let handle = null;
|
|
10
|
-
let lang = null;
|
|
11
|
-
const productHostname = opts.productUrl ? hostnameOf(opts.productUrl) : null;
|
|
12
|
-
const visit = async (url) => {
|
|
13
|
-
const page = await fetchPage(url, opts.logger);
|
|
14
|
-
if (!page)
|
|
15
|
-
return;
|
|
16
|
-
visited.push(page);
|
|
17
|
-
const cheerio = await loadCheerio();
|
|
18
|
-
const $ = cheerio.load(page.html);
|
|
19
|
-
handle ??= extractHandle($);
|
|
20
|
-
if (!lang && productHostname && hostnameOf(page.url) === productHostname && page.status === 200) {
|
|
21
|
-
lang = extractLanguage($, page.text);
|
|
22
|
-
}
|
|
23
|
-
for (const email of extractEmailsFromText(extractMailtos($).join(' ')))
|
|
24
|
-
emails.add(email);
|
|
25
|
-
for (const email of extractEmailsFromText(page.text))
|
|
26
|
-
emails.add(email);
|
|
27
|
-
};
|
|
28
|
-
await visit(opts.betaListLaunchUrl);
|
|
29
|
-
let best = pickBestEmail([...emails], productHostname);
|
|
30
|
-
if (isHighRankEmail(best, productHostname) && handle) {
|
|
31
|
-
return { email: best, handle, lang };
|
|
32
|
-
}
|
|
33
|
-
const productUrls = buildProductUrls(opts.productUrl);
|
|
34
|
-
for (const url of productUrls) {
|
|
35
|
-
if (isHighRankEmail(best, productHostname) && handle)
|
|
36
|
-
break;
|
|
37
|
-
await visit(url);
|
|
38
|
-
best = pickBestEmail([...emails], productHostname);
|
|
39
|
-
}
|
|
40
|
-
if (!lang) {
|
|
41
|
-
const combinedText = visited.map((page) => page.text).join(' ');
|
|
42
|
-
lang = inferLanguageFromText(combinedText);
|
|
43
|
-
}
|
|
44
|
-
return {
|
|
45
|
-
email: best,
|
|
46
|
-
handle,
|
|
47
|
-
lang,
|
|
48
|
-
};
|
|
49
|
-
}
|
|
50
|
-
export function extractEmailsFromText(text) {
|
|
51
|
-
const matches = text.match(EMAIL_RE) ?? [];
|
|
52
|
-
return [...new Set(matches.map((email) => email.toLowerCase()).filter((email) => !isJunkEmail(email)))];
|
|
53
|
-
}
|
|
54
|
-
export function pickBestEmail(emails, productHostname) {
|
|
55
|
-
if (emails.length === 0)
|
|
56
|
-
return null;
|
|
57
|
-
const normalized = [...new Set(emails.map((email) => email.trim().toLowerCase()).filter(Boolean))];
|
|
58
|
-
if (normalized.length === 0)
|
|
59
|
-
return null;
|
|
60
|
-
return normalized.sort((a, b) => rankEmail(a, productHostname) - rankEmail(b, productHostname))[0] ?? null;
|
|
61
|
-
}
|
|
62
|
-
function isJunkEmail(email) {
|
|
63
|
-
const lower = email.toLowerCase();
|
|
64
|
-
return lower.includes('example.com')
|
|
65
|
-
|| lower.includes('sentry.io')
|
|
66
|
-
|| lower.includes('wixpress.com')
|
|
67
|
-
|| lower.includes('@2x')
|
|
68
|
-
|| lower.includes('png')
|
|
69
|
-
|| lower.includes('jpg')
|
|
70
|
-
|| lower.includes('svg');
|
|
71
|
-
}
|
|
72
|
-
function rankEmail(email, productHostname) {
|
|
73
|
-
const domain = email.split('@')[1]?.toLowerCase() ?? '';
|
|
74
|
-
const local = email.split('@')[0]?.toLowerCase() ?? '';
|
|
75
|
-
const sameDomain = productHostname ? domainsMatch(domain, productHostname) : false;
|
|
76
|
-
const roleRank = ROLE_PREFIXES.indexOf(local);
|
|
77
|
-
const isRole = roleRank !== -1;
|
|
78
|
-
const isFreeMail = FREE_MAIL_DOMAINS.has(domain);
|
|
79
|
-
if (sameDomain && isRole)
|
|
80
|
-
return roleRank;
|
|
81
|
-
if (sameDomain)
|
|
82
|
-
return 100;
|
|
83
|
-
if (isFreeMail && isRole)
|
|
84
|
-
return 200 + roleRank;
|
|
85
|
-
if (isFreeMail)
|
|
86
|
-
return 300;
|
|
87
|
-
return 400;
|
|
88
|
-
}
|
|
89
|
-
function isHighRankEmail(email, productHostname) {
|
|
90
|
-
return email !== null && rankEmail(email, productHostname) < 200;
|
|
91
|
-
}
|
|
92
|
-
function domainsMatch(emailDomain, productHostname) {
|
|
93
|
-
const normalizedHost = stripWww(productHostname);
|
|
94
|
-
const normalizedEmailDomain = stripWww(emailDomain);
|
|
95
|
-
return normalizedEmailDomain === normalizedHost || etldOne(normalizedEmailDomain) === etldOne(normalizedHost);
|
|
96
|
-
}
|
|
97
|
-
function etldOne(hostname) {
|
|
98
|
-
const parts = stripWww(hostname).split('.').filter(Boolean);
|
|
99
|
-
return parts.length <= 2 ? parts.join('.') : parts.slice(-2).join('.');
|
|
100
|
-
}
|
|
101
|
-
function stripWww(hostname) {
|
|
102
|
-
return hostname.toLowerCase().replace(/^www\./, '');
|
|
103
|
-
}
|
|
104
|
-
function buildProductUrls(productUrl) {
|
|
105
|
-
if (!productUrl)
|
|
106
|
-
return [];
|
|
107
|
-
try {
|
|
108
|
-
const base = new URL(productUrl);
|
|
109
|
-
const urls = [base.toString()];
|
|
110
|
-
for (const pathname of ['/contact', '/about', '/legal', '/mentions-legales']) {
|
|
111
|
-
const next = new URL(base.toString());
|
|
112
|
-
next.pathname = pathname;
|
|
113
|
-
next.search = '';
|
|
114
|
-
next.hash = '';
|
|
115
|
-
urls.push(next.toString());
|
|
116
|
-
}
|
|
117
|
-
return [...new Set(urls)];
|
|
118
|
-
}
|
|
119
|
-
catch {
|
|
120
|
-
return [];
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
async function fetchPage(url, logger) {
|
|
124
|
-
let parsed;
|
|
125
|
-
try {
|
|
126
|
-
parsed = new URL(url);
|
|
127
|
-
}
|
|
128
|
-
catch {
|
|
129
|
-
return null;
|
|
130
|
-
}
|
|
131
|
-
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:')
|
|
132
|
-
return null;
|
|
133
|
-
await waitForDomainTurn(parsed.hostname);
|
|
134
|
-
try {
|
|
135
|
-
const response = await fetch(parsed.toString(), {
|
|
136
|
-
headers: {
|
|
137
|
-
Accept: 'text/html',
|
|
138
|
-
'User-Agent': CRAWLER_UA,
|
|
139
|
-
},
|
|
140
|
-
signal: AbortSignal.timeout(15_000),
|
|
141
|
-
});
|
|
142
|
-
if (!response.ok) {
|
|
143
|
-
logger.warn(`[crm-email] Fetch returned HTTP ${response.status} for ${parsed.toString()}`);
|
|
144
|
-
return null;
|
|
145
|
-
}
|
|
146
|
-
const html = await response.text();
|
|
147
|
-
const cheerio = await loadCheerio();
|
|
148
|
-
const $ = cheerio.load(html);
|
|
149
|
-
$('script, style, noscript, svg').remove();
|
|
150
|
-
return {
|
|
151
|
-
html,
|
|
152
|
-
text: $('body').text().replace(/\s+/g, ' ').trim(),
|
|
153
|
-
url: response.url || parsed.toString(),
|
|
154
|
-
status: response.status,
|
|
155
|
-
};
|
|
156
|
-
}
|
|
157
|
-
catch (error) {
|
|
158
|
-
logger.warn(`[crm-email] Fetch failed for ${parsed.toString()}: ${error.message}`);
|
|
159
|
-
return null;
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
async function waitForDomainTurn(hostname) {
|
|
163
|
-
const previous = domainQueues.get(hostname) ?? Promise.resolve();
|
|
164
|
-
let release = () => { };
|
|
165
|
-
const current = previous.then(() => new Promise((resolve) => {
|
|
166
|
-
release = resolve;
|
|
167
|
-
}));
|
|
168
|
-
domainQueues.set(hostname, current);
|
|
169
|
-
await previous;
|
|
170
|
-
setTimeout(release, 1_000);
|
|
171
|
-
}
|
|
172
|
-
async function loadCheerio() {
|
|
173
|
-
const importer = new Function('specifier', 'return import(specifier)');
|
|
174
|
-
return importer('cheerio');
|
|
175
|
-
}
|
|
176
|
-
function extractMailtos($) {
|
|
177
|
-
return $('a[href^="mailto:"]').toArray().map((anchor) => {
|
|
178
|
-
const href = $(anchor).attr('href') ?? '';
|
|
179
|
-
return decodeURIComponent(href.replace(/^mailto:/i, '').split('?')[0] ?? '');
|
|
180
|
-
});
|
|
181
|
-
}
|
|
182
|
-
function extractHandle($) {
|
|
183
|
-
for (const anchor of $('a[href]').toArray()) {
|
|
184
|
-
const href = $(anchor).attr('href') ?? '';
|
|
185
|
-
const twitter = href.match(/(?:twitter\.com|x\.com)\/([A-Za-z0-9_]{1,20})(?:[/?#]|$)/i);
|
|
186
|
-
if (twitter)
|
|
187
|
-
return `@${twitter[1]}`;
|
|
188
|
-
const linkedin = href.match(/linkedin\.com\/in\/([^/?#]+)/i);
|
|
189
|
-
if (linkedin)
|
|
190
|
-
return linkedin[1] ?? null;
|
|
191
|
-
}
|
|
192
|
-
return null;
|
|
193
|
-
}
|
|
194
|
-
function extractLanguage($, text) {
|
|
195
|
-
const lang = $('html').attr('lang')?.trim().split(/[-_]/)[0]?.toLowerCase();
|
|
196
|
-
if (lang)
|
|
197
|
-
return lang;
|
|
198
|
-
return inferLanguageFromText(text);
|
|
199
|
-
}
|
|
200
|
-
function inferLanguageFromText(text) {
|
|
201
|
-
if (!text)
|
|
202
|
-
return null;
|
|
203
|
-
if (/\b(bonjour|merci|à propos|mentions légales)\b/i.test(text))
|
|
204
|
-
return 'fr';
|
|
205
|
-
if (/\b(the|and|contact|about|privacy|terms|login|sign in)\b/i.test(text))
|
|
206
|
-
return 'en';
|
|
207
|
-
return null;
|
|
208
|
-
}
|
|
209
|
-
function hostnameOf(value) {
|
|
210
|
-
try {
|
|
211
|
-
return new URL(value).hostname;
|
|
212
|
-
}
|
|
213
|
-
catch {
|
|
214
|
-
return null;
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
|
-
//# sourceMappingURL=email-fallback.js.map
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import { type Browser } from 'playwright';
|
|
2
|
-
import { findEmail } from './email-fallback.js';
|
|
3
|
-
import { scrapeBetaListLaunches } from './scrape-betalist.js';
|
|
4
|
-
import { scrapeLanding } from './scrape-landing.js';
|
|
5
|
-
export interface RunCampaignOptions {
|
|
6
|
-
runId: string;
|
|
7
|
-
lookbackDays: number;
|
|
8
|
-
apiBaseUrl: string;
|
|
9
|
-
runToken: string;
|
|
10
|
-
logger: {
|
|
11
|
-
info(msg: string): void;
|
|
12
|
-
warn(msg: string): void;
|
|
13
|
-
error(msg: string): void;
|
|
14
|
-
};
|
|
15
|
-
}
|
|
16
|
-
export interface RunCampaignDeps {
|
|
17
|
-
scrapeLaunches?: typeof scrapeBetaListLaunches;
|
|
18
|
-
scrapeLanding?: typeof scrapeLanding;
|
|
19
|
-
findEmail?: typeof findEmail;
|
|
20
|
-
fetch?: typeof fetch;
|
|
21
|
-
launchBrowser?: () => Promise<Browser>;
|
|
22
|
-
}
|
|
23
|
-
export declare function runCampaign(opts: RunCampaignOptions, deps?: RunCampaignDeps): Promise<{
|
|
24
|
-
scraped: number;
|
|
25
|
-
inserted: number;
|
|
26
|
-
disqualified: number;
|
|
27
|
-
skipped: number;
|
|
28
|
-
}>;
|