autokap 1.4.0 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/skill/SKILL.md +3 -0
- package/assets/skill/references/STANDARDS.md +236 -0
- package/dist/cli.js +73 -2
- package/dist/crm/email-fallback.d.ts +16 -0
- package/dist/crm/email-fallback.js +217 -0
- package/dist/crm/run-campaign.d.ts +28 -0
- package/dist/crm/run-campaign.js +405 -0
- package/dist/crm/scrape-betalist.d.ts +20 -0
- package/dist/crm/scrape-betalist.js +194 -0
- package/dist/crm/scrape-landing.d.ts +24 -0
- package/dist/crm/scrape-landing.js +240 -0
- package/dist/crm/storage-upload.d.ts +14 -0
- package/dist/crm/storage-upload.js +40 -0
- package/dist/mockup.d.ts +7 -0
- package/dist/mockup.js +52 -6
- package/dist/types.d.ts +1 -1
- package/package.json +3 -2
package/assets/skill/SKILL.md
CHANGED
|
@@ -19,6 +19,8 @@ Normal navigation and interaction stay deterministic. Runtime AI is limited to n
|
|
|
19
19
|
|
|
20
20
|
This installed skill is the **source of truth** for the AutoKap contract: opcode schema, login rules, variant handling, persistence, and validation. The copied prompt from the AutoKap dashboard is only the **preset-specific brief** (project URL, variants, template goal, mock data guidance, etc.).
|
|
21
21
|
|
|
22
|
+
> **Compatibility note.** The dashboard prompts now embed a mini AutoKap mental-model so they remain useful for assistants that don't have this skill installed (Cursor without the bundle, Codex, Copilot, plain Claude Code). When this skill IS installed, **the rules below override anything that contradicts them in the inline mental-model** — the inline version is necessarily a summary.
|
|
23
|
+
|
|
22
24
|
## When To Use This Skill
|
|
23
25
|
|
|
24
26
|
- User wants to capture screenshots or clips of their web app
|
|
@@ -54,6 +56,7 @@ Load these only when the request actually needs them:
|
|
|
54
56
|
- **Opcode parameters** — [OPCODE-REFERENCE.md](OPCODE-REFERENCE.md)
|
|
55
57
|
- **Mock data** — [references/mock-data.md](references/mock-data.md)
|
|
56
58
|
- **Complete examples** — [references/examples.md](references/examples.md)
|
|
59
|
+
- **Prompt charter** — [references/STANDARDS.md](references/STANDARDS.md) — defines the structure every dashboard-generated prompt now follows. Use it when authoring new prompts, when extending existing builders, or to understand why a copied prompt is shaped the way it is.
|
|
57
60
|
|
|
58
61
|
Keep the core `SKILL.md` for the non-negotiable contract. Reach for the
|
|
59
62
|
references only after you know which mode or advanced feature the user needs.
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# AutoKap Prompt Standards
|
|
2
|
+
|
|
3
|
+
This document defines the contract every user-facing prompt produced by AutoKap must follow. It is the single source of truth referenced by `web/lib/prompts/blocks/*` and the prompt builders that compose them.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Why this charter exists
|
|
8
|
+
|
|
9
|
+
AutoKap generates prompts that the user copy-pastes into their IDE so an AI assistant can:
|
|
10
|
+
|
|
11
|
+
- inspect the user's codebase and add `data-ak` attributes
|
|
12
|
+
- generate or edit a deterministic `ExecutionProgram`
|
|
13
|
+
- scaffold a proxy, embed assets, or build a video demo
|
|
14
|
+
|
|
15
|
+
The recipient assistant is rarely "AutoKap-aware":
|
|
16
|
+
|
|
17
|
+
- it may be Cursor, Codex, GitHub Copilot, or Claude Code without the `autokap-preset` skill installed
|
|
18
|
+
- it has no prior context about AutoKap's runtime model, opcode contract, or CLI
|
|
19
|
+
|
|
20
|
+
When prompts are inconsistent, the assistant invents selectors, generates programs before inspecting the codebase, drops the CLI, and ignores the user's brief. We fix that by enforcing a **single, predictable structure across every prompt**.
|
|
21
|
+
|
|
22
|
+
Two design principles flow from this:
|
|
23
|
+
|
|
24
|
+
1. **Self-sufficient** — every prompt embeds a mini mental-model so the assistant can succeed even without the skill installed.
|
|
25
|
+
2. **Predictable** — every prompt uses the same nine blocks in the same order. The assistant learns the shape once and applies it everywhere.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## The nine blocks
|
|
30
|
+
|
|
31
|
+
Every user-facing prompt is composed from the following blocks, in this order. Blocks are skipped only when explicitly marked optional.
|
|
32
|
+
|
|
33
|
+
### 1. Header (required)
|
|
34
|
+
|
|
35
|
+
3-5 lines. Names the product, the runtime model in one sentence, the task. Mentions the optional skill.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
You are working on AutoKap, a screenshot-and-video automation tool. AutoKap presets
|
|
39
|
+
are deterministic JSON programs (opcodes for Playwright) that the AutoKap CLI runs
|
|
40
|
+
locally — there is no LLM at capture runtime. Your job here is to <one-line task>.
|
|
41
|
+
If a skill named `autokap-preset` is installed in your environment, treat it as the
|
|
42
|
+
source of truth — its rules override anything below that contradicts them.
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
The header replaces the implicit assumption that the assistant already knows AutoKap. It is the same in every prompt; only the `<one-line task>` changes.
|
|
46
|
+
|
|
47
|
+
### 2. Before you write anything (required for any prompt that touches code)
|
|
48
|
+
|
|
49
|
+
A numbered checklist of 3-5 actions the assistant must perform before generating the artifact. Always includes:
|
|
50
|
+
|
|
51
|
+
- inspect the codebase
|
|
52
|
+
- plan
|
|
53
|
+
- ask the user when ambiguous
|
|
54
|
+
- never invent UI details, selectors, or copy
|
|
55
|
+
- search for existing `data-ak` before adding new ones (when applicable)
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
## Before you write anything
|
|
59
|
+
|
|
60
|
+
1. Inspect the codebase: routes, auth, theme/locale system, components you may need to tag.
|
|
61
|
+
2. Plan in your head (or in your planning tool if you have one) before generating.
|
|
62
|
+
3. If anything is ambiguous (auth flow, target route, data shape), STOP and ask the user.
|
|
63
|
+
Do not invent UI details, selectors, or copy.
|
|
64
|
+
4. Search for existing `data-ak` attributes first; only add a new one where no suitable attribute already exists on the element you need.
|
|
65
|
+
5. If your environment supports parallel subagents, you may run codebase inspection
|
|
66
|
+
and program drafting in parallel — but do not skip step 3.
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
This block sits **immediately after the header**, before any technical detail. The assistant must read it before being tempted by the project context or the brief.
|
|
70
|
+
|
|
71
|
+
### 3. User guidance (required and visible)
|
|
72
|
+
|
|
73
|
+
The user's brief, isolated in its own section. The section name varies by task: `## User guidance`, `## What the user wants`, `## Preset brief`, `## What the demo should show`. The content is always the user's free-form input.
|
|
74
|
+
|
|
75
|
+
When the user provided no guidance:
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
## User guidance
|
|
79
|
+
|
|
80
|
+
_The user has not provided specific guidance for this task. Use sensible defaults
|
|
81
|
+
based on the project context above._
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
When the user provided partial guidance (e.g. mock data guidance only), each piece appears in its own clearly-labelled subsection.
|
|
85
|
+
|
|
86
|
+
This block sits **immediately after "Before you write anything"**, before any constraint. The assistant has just been told to inspect; the brief is the next thing it sees so it shapes inspection.
|
|
87
|
+
|
|
88
|
+
### 4. Project context (required when applicable)
|
|
89
|
+
|
|
90
|
+
Compact key/value list. Project name, project ID, base URL, credentials account ID, locale defaults. No prose.
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
## Project context
|
|
94
|
+
|
|
95
|
+
- **Name**: Acme Dashboard
|
|
96
|
+
- **ID**: `proj_abc123`
|
|
97
|
+
- **Base URL**: `https://acme.example.com`
|
|
98
|
+
- **Credentials account ID**: `cred_xyz789`
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### 5. Hard constraints (required)
|
|
102
|
+
|
|
103
|
+
Non-negotiable values: viewport variants, capture mode, mediaMode, baseUrl rules, etc. The "Variants (use these EXACTLY)" pattern is the canonical example. Use **bold** for emphasis on critical numbers and `code` for identifiers.
|
|
104
|
+
|
|
105
|
+
### 6. Specific reminders (required when applicable)
|
|
106
|
+
|
|
107
|
+
Auth, mock data, locale/theme handling, etc. — only the reminders that apply to the current mode. Anti-patterns are presented as `Don't / Do instead` tables, never bullets:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
| Don't | Do instead |
|
|
111
|
+
|---|---|
|
|
112
|
+
| Use a CSS selector you guessed (`.btn-primary`) | Add a `data-ak="login-btn"` attribute and target `[data-ak="login-btn"]` |
|
|
113
|
+
| Hardcode the auth cookie | Use `credentialsId` and let the runtime inject the right session |
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 7. How to persist (required for prompts that produce an artifact)
|
|
117
|
+
|
|
118
|
+
CLI commands first. Useful flags listed (`--dry`, `--headed`, `--output`). Fallbacks (JSON file, dashboard import) explicitly labelled "fallback only".
|
|
119
|
+
|
|
120
|
+
### 8. If you get stuck (required for any prompt that touches code)
|
|
121
|
+
|
|
122
|
+
3 concrete scenarios minimum. Tells the assistant what to do at the failure points where it would otherwise improvise:
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
## If you get stuck
|
|
126
|
+
|
|
127
|
+
- If you can't find a stable selector → ask the user before guessing.
|
|
128
|
+
- If a CLI command fails → run with `--dry` first to validate, then re-run without `--dry`.
|
|
129
|
+
- If the program runs but captures the wrong screen → report back with the AutoKap run log;
|
|
130
|
+
don't try to patch by adding more opcodes blindly.
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### 9. Subagents hint (optional, multi-phase prompts only)
|
|
134
|
+
|
|
135
|
+
A suggestion (never a requirement) for assistants that support parallel subagents. Always qualified with "if your environment supports..." so single-agent assistants are not derailed.
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
## If you have parallel subagents available
|
|
139
|
+
|
|
140
|
+
This prompt has multiple phases (inspect → tag → generate → persist → integrate). If
|
|
141
|
+
your environment supports subagents (Claude Code, multi-agent Cursor, etc.), consider
|
|
142
|
+
parallelizing: agent 1 inspects the codebase and tags `data-ak`; agent 2 drafts the
|
|
143
|
+
`ExecutionProgram` from the tagged components; you merge their outputs.
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Cross-cutting rules
|
|
149
|
+
|
|
150
|
+
### Tone
|
|
151
|
+
|
|
152
|
+
Direct imperative: `Do`, `Use`, `Never`, `Ask`. Never "you may want to consider", "perhaps", "if you wish". The assistant should know exactly what the prompt expects.
|
|
153
|
+
|
|
154
|
+
### Markdown
|
|
155
|
+
|
|
156
|
+
- `**bold**` for emphasis on critical values
|
|
157
|
+
- `` `code` `` for identifiers, file paths, CLI flags
|
|
158
|
+
- Tables for matrices and anti-patterns
|
|
159
|
+
- Fenced code blocks (with language tag) for examples
|
|
160
|
+
- `## Section` / `### Subsection` for hierarchy — the assistant should be able to refer to sections by name
|
|
161
|
+
|
|
162
|
+
### No emojis
|
|
163
|
+
|
|
164
|
+
Anywhere in any prompt. They look unprofessional and clutter terminal output when the prompt is shown verbatim.
|
|
165
|
+
|
|
166
|
+
### Examples are concrete
|
|
167
|
+
|
|
168
|
+
Every prompt that asks the assistant to produce an artifact contains at least one fully-formed example: a JSON snippet for opcodes, a bash one-liner for CLI commands, a JSX block for `data-ak` placement.
|
|
169
|
+
|
|
170
|
+
### Anti-patterns are tables, not bullets
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
| Don't | Why / Do instead |
|
|
174
|
+
|---|---|
|
|
175
|
+
| ... | ... |
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Section names are stable
|
|
179
|
+
|
|
180
|
+
The assistant should be able to refer to "the User guidance section" or "the Hard constraints section" and have it mean the same thing across all prompts.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Block composition pattern
|
|
185
|
+
|
|
186
|
+
Each block exposes a `make*` function returning `string[]` (lines). Builders compose blocks by concatenating arrays and joining with `"\n"` at the end:
|
|
187
|
+
|
|
188
|
+
```typescript
|
|
189
|
+
import { joinBlocks } from "@/lib/prompts";
|
|
190
|
+
import { makeHeader } from "@/lib/prompts/blocks/header";
|
|
191
|
+
import { makeBeforeAnythingChecklist } from "@/lib/prompts/blocks/before-anything";
|
|
192
|
+
import { makeUserGuidanceSection } from "@/lib/prompts/blocks/user-guidance";
|
|
193
|
+
|
|
194
|
+
export function buildPresetPrompt(input: PresetPromptInput): string {
|
|
195
|
+
return joinBlocks([
|
|
196
|
+
makeHeader({ task: "generate an ExecutionProgram for this preset" }),
|
|
197
|
+
makeBeforeAnythingChecklist({ touchesCode: true, supportsSubagents: true }),
|
|
198
|
+
makeUserGuidanceSection({ brief: input.userBrief, label: "Preset brief" }),
|
|
199
|
+
// ... other blocks
|
|
200
|
+
]);
|
|
201
|
+
}
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Blocks must:
|
|
205
|
+
|
|
206
|
+
- accept typed parameters; never read environment variables or globals
|
|
207
|
+
- be deterministic (same input → same output)
|
|
208
|
+
- handle all "missing optional" cases internally (empty guidance, no credentials, etc.)
|
|
209
|
+
- never add a trailing newline (the joiner handles spacing)
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## When the prompt does NOT touch code
|
|
214
|
+
|
|
215
|
+
Some prompts do not produce code (e.g. an image-generation prompt for the Studio composition flow). For those:
|
|
216
|
+
|
|
217
|
+
- Header is still required
|
|
218
|
+
- Block 2 ("Before you write anything") is skipped
|
|
219
|
+
- Block 8 ("If you get stuck") is skipped
|
|
220
|
+
- Block 9 ("Subagents hint") is skipped
|
|
221
|
+
- The remaining blocks apply if relevant
|
|
222
|
+
|
|
223
|
+
These prompts are out of scope for the current refactor and follow a lighter contract.
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## Versioning and divergence
|
|
228
|
+
|
|
229
|
+
This document is mirrored in `assets/skill/references/STANDARDS.md` so the installed skill has access to the same charter. When the charter changes:
|
|
230
|
+
|
|
231
|
+
1. Update `web/lib/prompts/STANDARDS.md` first.
|
|
232
|
+
2. Mirror the change to `assets/skill/references/STANDARDS.md`.
|
|
233
|
+
3. Update the affected blocks in `web/lib/prompts/blocks/`.
|
|
234
|
+
4. Re-run the snapshot tests on the builders to surface any drift.
|
|
235
|
+
|
|
236
|
+
The blocks are the source of truth for what the prompts emit. This document is the source of truth for what the blocks should look like.
|
package/dist/cli.js
CHANGED
|
@@ -6,7 +6,7 @@ import fs from 'node:fs/promises';
|
|
|
6
6
|
const require = createRequire(import.meta.url);
|
|
7
7
|
const { version } = require('../package.json');
|
|
8
8
|
import { logger } from './logger.js';
|
|
9
|
-
import { writeConfig, deleteConfig, requireConfig, getConfigPath, DEFAULT_API_BASE_URL, getDefaultApiBaseUrl, getDefaultWsUrl, LOCAL_API_BASE_URL, LOCAL_WS_URL, API_KEY_ENV_VAR, API_BASE_URL_ENV_VAR, WS_URL_ENV_VAR, } from './cli-config.js';
|
|
9
|
+
import { writeConfig, readConfig, deleteConfig, requireConfig, getConfigPath, DEFAULT_API_BASE_URL, getDefaultApiBaseUrl, getDefaultWsUrl, LOCAL_API_BASE_URL, LOCAL_WS_URL, API_KEY_ENV_VAR, API_BASE_URL_ENV_VAR, WS_URL_ENV_VAR, } from './cli-config.js';
|
|
10
10
|
import { renderSkillSingleFile, writeSkillExport } from './skill-packaging.js';
|
|
11
11
|
// ── Program definition ──────────────────────────────────────────────
|
|
12
12
|
export const program = new Command();
|
|
@@ -276,6 +276,7 @@ program
|
|
|
276
276
|
.option('--allow-upload-failure', 'Keep a successful capture exit code even if artifact upload fails', false)
|
|
277
277
|
.option('--debug', 'Verbose logging: per-substep timing, opcode dumps, recovery strategy traces', false)
|
|
278
278
|
.option('--cloud', 'Cloud runner mode: signals 4+ vCPU available, unblocks the conservative Linux FPS default (8 → 30)', false)
|
|
279
|
+
.option('--preset-ids <ids>', 'Comma-separated preset IDs to capture. When omitted, captures all presets with auto_recapture_enabled=true.')
|
|
279
280
|
.action(async (opts) => {
|
|
280
281
|
if (opts.debug) {
|
|
281
282
|
const { setDebugEnabled } = await import('./logger.js');
|
|
@@ -387,7 +388,12 @@ program
|
|
|
387
388
|
// Fetch the presets list with a hard timeout. Without this, a slow or
|
|
388
389
|
// unreachable dashboard would leave the CLI hanging forever — the
|
|
389
390
|
// dashboard would stay stuck at "machine started" with no error surfaced.
|
|
390
|
-
|
|
391
|
+
// When `--preset-ids` is provided, restrict the run to that subset
|
|
392
|
+
// (per-preset recapture launched from the dashboard).
|
|
393
|
+
const presetIdsArg = opts.presetIds?.trim();
|
|
394
|
+
const presetsPath = presetIdsArg
|
|
395
|
+
? `/api/cli/projects/${opts.project}/auto-recapture-presets?preset_ids=${encodeURIComponent(presetIdsArg)}`
|
|
396
|
+
: `/api/cli/projects/${opts.project}/auto-recapture-presets`;
|
|
391
397
|
let data;
|
|
392
398
|
try {
|
|
393
399
|
const response = await fetch(buildApiUrl(config, presetsPath), {
|
|
@@ -543,6 +549,71 @@ program
|
|
|
543
549
|
});
|
|
544
550
|
process.exit(0);
|
|
545
551
|
});
|
|
552
|
+
// ── crm-run command ────────────────────────────────────────────────
|
|
553
|
+
// `autokap crm-run`: resolves the run id, run token and API base URL from
// flags / environment / saved config, then hands off to `runCampaign`.
// Exits 0 on success and 1 on any failure or when the self-kill timer fires.
program
    .command('crm-run')
    .description('Scrape BetaList launches and feed the CRM (AUT-109 Phase B)')
    .option('--runId <id>', 'CRM run id (defaults to AUTOKAP_RUN_ID env)')
    .option('--lookback-days <n>', 'How far back to look for launches', '1')
    .option('--debug', 'Verbose logging', false)
    .action(async (opts) => {
    // Log immediately on boot — before any other call — so Cloud Run logs
    // show evidence the new command is reachable. If the next line silently
    // crashes (missing dep, bad import) we at least see the boot.
    logger.info(`[crm-run] booted — autokap CLI ${version}`);
    if (opts.debug) {
        const { setDebugEnabled } = await import('./logger.js');
        setDebugEnabled(true);
        logger.info('[crm-run] Debug mode enabled — verbose logging on');
    }
    // Self-kill after 20 min so a hung scraper exits before the backend
    // reconcile cron (25 min) gets to it. Ensures the Cloud Run instance
    // doesn't leak and the run row gets force-failed quickly.
    const SELF_TIMEOUT_MS = 20 * 60 * 1000;
    const selfTimeout = setTimeout(() => {
        logger.error(`[crm-run] Self-kill: scraper exceeded ${SELF_TIMEOUT_MS / 60000}min budget — exiting non-zero`);
        process.exit(1);
    }, SELF_TIMEOUT_MS);
    // unref() so the watchdog timer alone never keeps the process alive.
    selfTimeout.unref?.();
    const runToken = process.env.AUTOKAP_RUN_TOKEN?.trim();
    // The CLI flag wins over the environment variable.
    const runId = opts.runId?.trim() || process.env.AUTOKAP_RUN_ID?.trim();
    const config = await readConfig();
    // Env override (trailing slashes stripped) wins over the saved config.
    const apiBaseUrl = process.env.AUTOKAP_API_BASE_URL?.trim().replace(/\/+$/, '') || config?.apiBaseUrl;
    // NOTE(review): this writes the first 12 characters of the run token to the
    // logs — confirm that prefix is not sensitive, or log only its presence.
    logger.info(`[crm-run] env check — runId=${runId ?? '<missing>'} ` +
        `apiBaseUrl=${apiBaseUrl ?? '<missing>'} ` +
        `runToken=${runToken ? `${runToken.slice(0, 12)}…` : '<missing>'}`);
    // `fatal` is defined elsewhere in this file; presumably it logs and exits.
    if (!runToken) {
        fatal('[crm-run] Missing AUTOKAP_RUN_TOKEN');
    }
    if (!runId) {
        fatal('[crm-run] Missing CRM run id. Set AUTOKAP_RUN_ID or pass --runId <id>.');
    }
    if (!apiBaseUrl) {
        fatal('[crm-run] Missing API base URL. Set AUTOKAP_API_BASE_URL or run autokap init.');
    }
    // Clamp to 1..7 days; anything unparsable falls back to 1.
    const parsedLookback = Number.parseInt(opts.lookbackDays, 10);
    const lookbackDays = Math.max(1, Math.min(7, Number.isFinite(parsedLookback) ? parsedLookback : 1));
    logger.info(`[crm-run] starting campaign — lookbackDays=${lookbackDays}`);
    try {
        // Loaded lazily so the CRM modules are only imported when this command runs.
        const { runCampaign } = await import('./crm/run-campaign.js');
        const result = await runCampaign({
            runId,
            lookbackDays,
            apiBaseUrl,
            runToken,
            logger,
        });
        clearTimeout(selfTimeout);
        logger.success(`[crm-run] Done — scraped=${result.scraped} inserted=${result.inserted} ` +
            `disqualified=${result.disqualified} skipped=${result.skipped}`);
        process.exit(0);
    }
    catch (error) {
        clearTimeout(selfTimeout);
        // A thrown value is not guaranteed to be an Error; avoid logging "undefined".
        const reason = error instanceof Error ? error.message : String(error);
        logger.error(`[crm-run] Failed: ${reason}`);
        process.exit(1);
    }
});
|
|
546
617
|
// ── project commands ───────────────────────────────────────────────
|
|
547
618
|
const projectCmd = program
|
|
548
619
|
.command('project')
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/** Inputs for {@link findEmail}. */
export interface EmailFallbackOptions {
    /** BetaList launch page; it is the first page visited. */
    betaListLaunchUrl: string;
    /** Product website URL, or `null` when unknown (then only the launch page is visited). */
    productUrl: string | null;
    /** Sink for diagnostics emitted while fetching pages. */
    logger: {
        info(msg: string): void;
        warn(msg: string): void;
        error(msg: string): void;
    };
}

/**
 * Looks for a contact email, a social handle and a language code for a launch.
 * Each field resolves to `null` when nothing was found.
 */
export declare function findEmail(opts: EmailFallbackOptions): Promise<{
    email: string | null;
    handle: string | null;
    lang: string | null;
}>;

/** Returns the unique, lower-cased, non-junk email addresses found in `text`. */
export declare function extractEmailsFromText(text: string): string[];

/** Returns the best-ranked address in `emails`, or `null` when there is none. */
export declare function pickBestEmail(emails: string[], productHostname: string | null): string | null;
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
/** User-Agent header sent with every crawler request. */
const CRAWLER_UA = 'AutoKap-Crawler/1.0 (+https://autokap.app/crawler)';

/** Global matcher for email-looking tokens in free text. */
const EMAIL_RE = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

/** Role mailboxes, best first; the array index is used as the rank. */
const ROLE_PREFIXES = [
    'founder',
    'contact',
    'hello',
    'team',
    'support',
    'info',
];

/** Consumer mailbox providers. */
const FREE_MAIL_DOMAINS = new Set([
    'gmail.com',
    'outlook.com',
    'hotmail.com',
    'yahoo.com',
    'protonmail.com',
    'icloud.com',
    'proton.me',
]);

/** Per-hostname promise chain used to space out requests to the same host. */
const domainQueues = new Map();
|
|
6
|
+
/**
 * Finds a contact email, a social handle and a language code for a launch.
 *
 * The BetaList launch page is visited first. If that does not already yield a
 * high-rank email together with a handle, the product pages returned by
 * `buildProductUrls` are visited one by one until it does or the list runs out.
 *
 * @param opts launch URL, optional product URL and a logger
 * @returns `{ email, handle, lang }`, each `null` when nothing was found
 */
export async function findEmail(opts) {
    const productHostname = opts.productUrl ? hostnameOf(opts.productUrl) : null;
    const pages = [];
    const found = new Set();
    let handle = null;
    let lang = null;
    // "Done" means a high-rank email AND a handle are both in hand.
    const isSatisfied = (candidate) => isHighRankEmail(candidate, productHostname) && handle;
    // Fetches one page and folds whatever it contains into the accumulators above.
    async function visit(url) {
        const page = await fetchPage(url, opts.logger);
        if (!page) {
            return;
        }
        pages.push(page);
        const cheerio = await loadCheerio();
        const $ = cheerio.load(page.html);
        handle ??= extractHandle($);
        // The language is only read from a 200 response served by the product's own host.
        if (!lang && productHostname && hostnameOf(page.url) === productHostname && page.status === 200) {
            lang = extractLanguage($, page.text);
        }
        // mailto: links first, then the visible text.
        for (const source of [extractMailtos($).join(' '), page.text]) {
            for (const email of extractEmailsFromText(source)) {
                found.add(email);
            }
        }
    }
    await visit(opts.betaListLaunchUrl);
    let best = pickBestEmail([...found], productHostname);
    if (isSatisfied(best)) {
        // Early exit: no language inference is attempted on this path.
        return { email: best, handle, lang };
    }
    for (const url of buildProductUrls(opts.productUrl)) {
        await visit(url);
        best = pickBestEmail([...found], productHostname);
        if (isSatisfied(best)) {
            break;
        }
    }
    if (!lang) {
        // Last resort: keyword-based guess over every page fetched so far.
        lang = inferLanguageFromText(pages.map((page) => page.text).join(' '));
    }
    return { email: best, handle, lang };
}
|
|
50
|
+
/**
 * Collects the email addresses that appear in `text`.
 *
 * @param text free text to scan
 * @returns unique lower-cased addresses in first-seen order, with junk matches removed
 */
export function extractEmailsFromText(text) {
    const unique = new Set();
    for (const match of text.match(EMAIL_RE) ?? []) {
        const email = match.toLowerCase();
        if (!isJunkEmail(email)) {
            unique.add(email);
        }
    }
    return [...unique];
}
|
|
54
|
+
/**
 * Picks the best candidate according to `rankEmail` (lowest rank wins).
 *
 * @param emails candidate addresses; trimmed, lower-cased and de-duplicated first
 * @param productHostname hostname of the product site, or `null`
 * @returns the winning address, or `null` when no usable candidate remains
 */
export function pickBestEmail(emails, productHostname) {
    const candidates = new Set();
    for (const raw of emails) {
        const email = raw.trim().toLowerCase();
        if (email) {
            candidates.add(email);
        }
    }
    if (candidates.size === 0) {
        return null;
    }
    const ranked = [...candidates].sort((left, right) => rankEmail(left, productHostname) - rankEmail(right, productHostname));
    return ranked[0] ?? null;
}
|
|
62
|
+
/**
 * Tells whether an email-shaped match is a known false positive.
 *
 * Rejected: placeholder and telemetry domains (including their subdomains),
 * retina asset names such as `logo@2x.png`, and anything whose "domain" ends
 * in an image file extension.
 *
 * Matching is done on the domain part and on the file extension rather than on
 * raw substrings, so real addresses such as `hello@svgator.com` are kept.
 *
 * @param email candidate address
 * @returns `true` when the candidate should be discarded
 */
function isJunkEmail(email) {
    const lower = email.toLowerCase();
    const at = lower.lastIndexOf('@');
    const domain = at === -1 ? lower : lower.slice(at + 1);
    const junkDomains = ['example.com', 'example.org', 'example.net', 'sentry.io', 'wixpress.com'];
    if (junkDomains.some((junk) => domain === junk || domain.endsWith(`.${junk}`))) {
        return true;
    }
    // Density suffixes in asset names: icon@2x.png, sprite@1.5x-dark.svg, ...
    if (/@\d+(?:\.\d+)?x(?:[._-]|$)/.test(lower)) {
        return true;
    }
    // "name@something.png" is a file name, not a mailbox.
    return /\.(?:png|jpe?g|gif|svg|webp|avif|ico|bmp)$/.test(domain);
}
|
|
72
|
+
/**
 * Scores an address; lower is better.
 *
 * - product domain + role mailbox: index in `ROLE_PREFIXES`
 * - product domain, other mailbox: 100
 * - free-mail domain + role mailbox: 200 + index in `ROLE_PREFIXES`
 * - free-mail domain, other mailbox: 300
 * - anything else: 400
 *
 * @param email address to score
 * @param productHostname hostname of the product site, or `null`
 * @returns the numeric rank
 */
function rankEmail(email, productHostname) {
    const [localPart, domainPart] = email.split('@');
    const local = localPart?.toLowerCase() ?? '';
    const domain = domainPart?.toLowerCase() ?? '';
    // Role detection is an exact match on the local part.
    const roleRank = ROLE_PREFIXES.indexOf(local);
    const isRole = roleRank >= 0;
    if (productHostname && domainsMatch(domain, productHostname)) {
        return isRole ? roleRank : 100;
    }
    if (FREE_MAIL_DOMAINS.has(domain)) {
        return isRole ? 200 + roleRank : 300;
    }
    return 400;
}
|
|
89
|
+
/**
 * Tells whether `email` ranks below 200, i.e. sits on the product's own domain.
 *
 * @param email address to test, or `null`
 * @param productHostname hostname of the product site, or `null`
 * @returns `false` for `null`, otherwise whether the rank is under 200
 */
function isHighRankEmail(email, productHostname) {
    if (email === null) {
        return false;
    }
    return rankEmail(email, productHostname) < 200;
}
|
|
92
|
+
/**
 * Tells whether an email domain belongs to the same site as `productHostname`.
 * Both sides are compared without a leading `www.`, first exactly and then by
 * registrable domain, so `mail.acme.com` matches `app.acme.com`.
 *
 * @param emailDomain domain part of an email address
 * @param productHostname hostname of the product site
 * @returns `true` when both resolve to the same registrable domain
 */
function domainsMatch(emailDomain, productHostname) {
    const host = stripWww(productHostname);
    const mailDomain = stripWww(emailDomain);
    return mailDomain === host || etldOne(mailDomain) === etldOne(host);
}
/**
 * Generic second-level labels that sit directly under a two-letter country
 * TLD (co.uk, com.au, org.nz, ...). Heuristic only: this is not the full
 * Public Suffix List.
 */
const SECOND_LEVEL_SUFFIX_LABELS = new Set(['co', 'com', 'org', 'net', 'gov', 'edu', 'ac']);
/**
 * Approximates the registrable domain (eTLD+1) of a hostname.
 *
 * Normally the last two labels are kept. When the hostname ends in a generic
 * second-level label under a two-letter country TLD, three labels are kept, so
 * that `foo.co.uk` and `bar.co.uk` are not both reduced to `co.uk` and
 * mistaken for the same site.
 *
 * @param hostname hostname to reduce
 * @returns the approximated registrable domain
 */
function etldOne(hostname) {
    const labels = stripWww(hostname).split('.').filter(Boolean);
    if (labels.length <= 2) {
        return labels.join('.');
    }
    const tld = labels[labels.length - 1] ?? '';
    const secondLevel = labels[labels.length - 2] ?? '';
    const keep = tld.length === 2 && SECOND_LEVEL_SUFFIX_LABELS.has(secondLevel) ? 3 : 2;
    return labels.slice(-keep).join('.');
}
/**
 * Lower-cases a hostname and drops one leading `www.`.
 *
 * @param hostname hostname to normalise
 * @returns the normalised hostname
 */
function stripWww(hostname) {
    return hostname.toLowerCase().replace(/^www\./, '');
}
|
|
104
|
+
/**
 * Lists the product pages worth scanning for contact details: the product URL
 * itself, then `/contact`, `/about`, `/legal` and `/mentions-legales` on the
 * same origin (query string and fragment removed).
 *
 * @param productUrl product website URL, or a falsy value
 * @returns de-duplicated URLs in visiting order; `[]` when the URL is missing or invalid
 */
function buildProductUrls(productUrl) {
    if (!productUrl) {
        return [];
    }
    try {
        const home = new URL(productUrl).toString();
        const extraPages = ['/contact', '/about', '/legal', '/mentions-legales'].map((pathname) => {
            const page = new URL(home);
            page.pathname = pathname;
            page.search = '';
            page.hash = '';
            return page.toString();
        });
        // The Set drops a well-known page that equals the product URL itself.
        return [...new Set([home, ...extraPages])];
    }
    catch {
        return [];
    }
}
|
|
123
|
+
/**
 * Downloads one page and returns its raw HTML together with its visible text.
 *
 * @param url page to fetch; anything that is not a valid http(s) URL is ignored
 * @param logger receives a warning on a non-2xx response or a failed request
 * @returns `{ html, text, url, status }`, or `null` when the page could not be fetched
 */
async function fetchPage(url, logger) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        return null;
    }
    const isWeb = target.protocol === 'http:' || target.protocol === 'https:';
    if (!isWeb) {
        return null;
    }
    // Take a slot in the per-host queue before touching the network.
    await waitForDomainTurn(target.hostname);
    const href = target.toString();
    try {
        const response = await fetch(href, {
            headers: {
                Accept: 'text/html',
                'User-Agent': CRAWLER_UA,
            },
            // Abort the request after 15 seconds.
            signal: AbortSignal.timeout(15_000),
        });
        if (!response.ok) {
            logger.warn(`[crm-email] Fetch returned HTTP ${response.status} for ${href}`);
            return null;
        }
        const html = await response.text();
        const cheerio = await loadCheerio();
        const $ = cheerio.load(html);
        // Strip non-visible markup so `text` only holds readable content.
        $('script, style, noscript, svg').remove();
        const text = $('body').text().replace(/\s+/g, ' ').trim();
        return {
            html,
            text,
            // Prefer the final URL reported by the response (e.g. after redirects).
            url: response.url || href,
            status: response.status,
        };
    }
    catch (error) {
        logger.warn(`[crm-email] Fetch failed for ${href}: ${error.message}`);
        return null;
    }
}
|
|
162
|
+
/**
 * Serialises callers per hostname. Resolves once every earlier caller for the
 * same host has released its slot; this caller's own slot is released one
 * second after it was granted.
 *
 * @param hostname host about to be requested
 * @returns a promise that resolves when the caller may proceed
 */
async function waitForDomainTurn(hostname) {
    // Gate that the next caller for this host has to wait on.
    let openGate = () => { };
    const gate = new Promise((resolve) => {
        openGate = resolve;
    });
    const tail = domainQueues.get(hostname) ?? Promise.resolve();
    // New tail of the queue: settles after everything queued before us AND our own gate.
    domainQueues.set(hostname, tail.then(() => gate));
    await tail;
    // We hold the slot now; hand it to the next caller in one second.
    setTimeout(openGate, 1_000);
}
|
|
172
|
+
/**
 * Loads the `cheerio` module on demand.
 *
 * The `import()` call is built at runtime through `new Function` — presumably
 * so the compiler or bundler does not rewrite or statically resolve it.
 *
 * @returns a promise for the cheerio module namespace
 */
async function loadCheerio() {
    const dynamicImport = new Function('specifier', 'return import(specifier)');
    const cheerioModule = dynamicImport('cheerio');
    return cheerioModule;
}
|
|
176
|
+
function extractMailtos($) {
|
|
177
|
+
return $('a[href^="mailto:"]').toArray().map((anchor) => {
|
|
178
|
+
const href = $(anchor).attr('href') ?? '';
|
|
179
|
+
return decodeURIComponent(href.replace(/^mailto:/i, '').split('?')[0] ?? '');
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
function extractHandle($) {
|
|
183
|
+
for (const anchor of $('a[href]').toArray()) {
|
|
184
|
+
const href = $(anchor).attr('href') ?? '';
|
|
185
|
+
const twitter = href.match(/(?:twitter\.com|x\.com)\/([A-Za-z0-9_]{1,20})(?:[/?#]|$)/i);
|
|
186
|
+
if (twitter)
|
|
187
|
+
return `@${twitter[1]}`;
|
|
188
|
+
const linkedin = href.match(/linkedin\.com\/in\/([^/?#]+)/i);
|
|
189
|
+
if (linkedin)
|
|
190
|
+
return linkedin[1] ?? null;
|
|
191
|
+
}
|
|
192
|
+
return null;
|
|
193
|
+
}
|
|
194
|
+
/**
 * Determines the page language: the primary subtag of `<html lang>` when it is
 * present (`fr-FR` becomes `fr`), otherwise a keyword-based guess from `text`.
 *
 * @param $ cheerio root for the page
 * @param text visible text of the page, used for the fallback guess
 * @returns a lower-case language code, or whatever `inferLanguageFromText` returns
 */
function extractLanguage($, text) {
    const declared = $('html').attr('lang');
    const primarySubtag = declared?.trim().split(/[-_]/)[0]?.toLowerCase();
    return primarySubtag ? primarySubtag : inferLanguageFromText(text);
}
|
|
200
|
+
/**
 * Guesses the language of `text` from a few marker words. French markers are
 * checked before English ones.
 *
 * Word edges are tested with Unicode-aware lookarounds rather than `\b`.
 * `\b` only understands ASCII word characters, so it can never match in front
 * of "à propos" and it wrongly accepts "the" inside "thé".
 *
 * @param text text to inspect
 * @returns `'fr'`, `'en'`, or `null` when the text is empty or has no marker
 */
function inferLanguageFromText(text) {
    if (!text) {
        return null;
    }
    const frenchMarkers = /(?<![\p{L}\p{N}_])(?:bonjour|merci|à propos|mentions légales)(?![\p{L}\p{N}_])/iu;
    if (frenchMarkers.test(text)) {
        return 'fr';
    }
    const englishMarkers = /(?<![\p{L}\p{N}_])(?:the|and|contact|about|privacy|terms|login|sign in)(?![\p{L}\p{N}_])/iu;
    if (englishMarkers.test(text)) {
        return 'en';
    }
    return null;
}
|
|
209
|
+
/**
 * Extracts the hostname from a URL string.
 *
 * @param value absolute URL
 * @returns the hostname as normalised by `URL`, or `null` when `value` does not parse
 */
function hostnameOf(value) {
    let parsed;
    try {
        parsed = new URL(value);
    }
    catch {
        return null;
    }
    return parsed.hostname;
}
|
|
217
|
+
//# sourceMappingURL=email-fallback.js.map
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { type Browser } from 'playwright';
|
|
2
|
+
import { findEmail } from './email-fallback.js';
|
|
3
|
+
import { scrapeBetaListLaunches } from './scrape-betalist.js';
|
|
4
|
+
import { scrapeLanding } from './scrape-landing.js';
|
|
5
|
+
/** Inputs for {@link runCampaign}. */
export interface RunCampaignOptions {
    /** Identifier of the CRM run being executed. */
    runId: string;
    /** How many days back to look for launches. */
    lookbackDays: number;
    /** Base URL of the AutoKap API. */
    apiBaseUrl: string;
    /** Run token; presumably used to authenticate calls to the API — confirm in run-campaign.js. */
    runToken: string;
    /** Sink for progress and error messages. */
    logger: {
        info(msg: string): void;
        warn(msg: string): void;
        error(msg: string): void;
    };
}

/**
 * Optional replacements for the collaborators `runCampaign` uses.
 * Presumably intended for tests; every member may be omitted.
 */
export interface RunCampaignDeps {
    scrapeLaunches?: typeof scrapeBetaListLaunches;
    scrapeLanding?: typeof scrapeLanding;
    findEmail?: typeof findEmail;
    fetch?: typeof fetch;
    launchBrowser?: () => Promise<Browser>;
}

/**
 * Runs one CRM campaign and reports its counters.
 *
 * @param opts run parameters
 * @param deps optional collaborator overrides
 * @returns counts of launches scraped, inserted, disqualified and skipped
 */
export declare function runCampaign(opts: RunCampaignOptions, deps?: RunCampaignDeps): Promise<{
    scraped: number;
    inserted: number;
    disqualified: number;
    skipped: number;
}>;
|