@canivel/ralph 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/ralph/PROMPT_build.md +126 -0
- package/.agents/ralph/agents.sh +15 -0
- package/.agents/ralph/config.sh +25 -0
- package/.agents/ralph/log-activity.sh +15 -0
- package/.agents/ralph/loop.sh +1001 -0
- package/.agents/ralph/references/CONTEXT_ENGINEERING.md +126 -0
- package/.agents/ralph/references/GUARDRAILS.md +174 -0
- package/AGENTS.md +20 -0
- package/README.md +266 -0
- package/bin/ralph +766 -0
- package/diagram.svg +55 -0
- package/examples/commands.md +46 -0
- package/package.json +39 -0
- package/ralph.webp +0 -0
- package/skills/commit/SKILL.md +219 -0
- package/skills/commit/references/commit_examples.md +292 -0
- package/skills/dev-browser/SKILL.md +211 -0
- package/skills/dev-browser/bun.lock +443 -0
- package/skills/dev-browser/package-lock.json +2988 -0
- package/skills/dev-browser/package.json +31 -0
- package/skills/dev-browser/references/scraping.md +155 -0
- package/skills/dev-browser/scripts/start-relay.ts +32 -0
- package/skills/dev-browser/scripts/start-server.ts +117 -0
- package/skills/dev-browser/server.sh +24 -0
- package/skills/dev-browser/src/client.ts +474 -0
- package/skills/dev-browser/src/index.ts +287 -0
- package/skills/dev-browser/src/relay.ts +731 -0
- package/skills/dev-browser/src/snapshot/__tests__/snapshot.test.ts +223 -0
- package/skills/dev-browser/src/snapshot/browser-script.ts +877 -0
- package/skills/dev-browser/src/snapshot/index.ts +14 -0
- package/skills/dev-browser/src/snapshot/inject.ts +13 -0
- package/skills/dev-browser/src/types.ts +34 -0
- package/skills/dev-browser/tsconfig.json +36 -0
- package/skills/dev-browser/vitest.config.ts +12 -0
- package/skills/prd/SKILL.md +235 -0
- package/tests/agent-loops.mjs +79 -0
- package/tests/agent-ping.mjs +39 -0
- package/tests/audit.md +56 -0
- package/tests/cli-smoke.mjs +47 -0
- package/tests/real-agents.mjs +127 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dev-browser",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"imports": {
|
|
6
|
+
"@/*": "./src/*"
|
|
7
|
+
},
|
|
8
|
+
"scripts": {
|
|
9
|
+
"start-server": "npx tsx scripts/start-server.ts",
|
|
10
|
+
"start-extension": "npx tsx scripts/start-relay.ts",
|
|
11
|
+
"dev": "npx tsx --watch src/index.ts",
|
|
12
|
+
"test": "vitest run",
|
|
13
|
+
"test:watch": "vitest"
|
|
14
|
+
},
|
|
15
|
+
"dependencies": {
|
|
16
|
+
"@hono/node-server": "^1.19.7",
|
|
17
|
+
"@hono/node-ws": "^1.2.0",
|
|
18
|
+
"express": "^4.21.0",
|
|
19
|
+
"hono": "^4.11.1",
|
|
20
|
+
"playwright": "^1.49.0"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {
|
|
23
|
+
"@types/express": "^5.0.0",
|
|
24
|
+
"tsx": "^4.21.0",
|
|
25
|
+
"typescript": "^5.0.0",
|
|
26
|
+
"vitest": "^2.1.0"
|
|
27
|
+
},
|
|
28
|
+
"optionalDependencies": {
|
|
29
|
+
"@rollup/rollup-linux-x64-gnu": "^4.0.0"
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Data Scraping Guide
|
|
2
|
+
|
|
3
|
+
For large datasets (followers, posts, search results), **intercept and replay network requests** rather than scrolling and parsing the DOM. This is faster, more reliable, and handles pagination automatically.
|
|
4
|
+
|
|
5
|
+
## Why Not Scroll?
|
|
6
|
+
|
|
7
|
+
Scrolling is slow, unreliable, and wastes time. APIs return structured data with pagination built in. Always prefer API replay.
|
|
8
|
+
|
|
9
|
+
## Start Small, Then Scale
|
|
10
|
+
|
|
11
|
+
**Don't try to automate everything at once.** Work incrementally:
|
|
12
|
+
|
|
13
|
+
1. **Capture one request** - verify you're intercepting the right endpoint
|
|
14
|
+
2. **Inspect one response** - understand the schema before writing extraction code
|
|
15
|
+
3. **Extract a few items** - make sure your parsing logic works
|
|
16
|
+
4. **Then scale up** - add pagination loop only after the basics work
|
|
17
|
+
|
|
18
|
+
This prevents wasting time debugging a complex script when the issue is a simple path like `data.user.timeline` vs `data.user.result.timeline`.
|
|
19
|
+
|
|
20
|
+
## Step-by-Step Workflow
|
|
21
|
+
|
|
22
|
+
### 1. Capture Request Details
|
|
23
|
+
|
|
24
|
+
First, intercept a request to understand URL structure and required headers:
|
|
25
|
+
|
|
26
|
+
```typescript
|
|
27
|
+
import { connect, waitForPageLoad } from "@/client.js";
|
|
28
|
+
import * as fs from "node:fs";
|
|
29
|
+
|
|
30
|
+
const client = await connect();
|
|
31
|
+
const page = await client.page("site");
|
|
32
|
+
|
|
33
|
+
let capturedRequest = null;
|
|
34
|
+
page.on("request", (request) => {
|
|
35
|
+
const url = request.url();
|
|
36
|
+
// Look for API endpoints (adjust pattern for your target site)
|
|
37
|
+
if (url.includes("/api/") || url.includes("/graphql/")) {
|
|
38
|
+
capturedRequest = {
|
|
39
|
+
url: url,
|
|
40
|
+
headers: request.headers(),
|
|
41
|
+
method: request.method(),
|
|
42
|
+
};
|
|
43
|
+
fs.writeFileSync("tmp/request-details.json", JSON.stringify(capturedRequest, null, 2));
|
|
44
|
+
console.log("Captured request:", url.substring(0, 80) + "...");
|
|
45
|
+
}
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
await page.goto("https://example.com/profile");
|
|
49
|
+
await waitForPageLoad(page);
|
|
50
|
+
await page.waitForTimeout(3000);
|
|
51
|
+
|
|
52
|
+
await client.disconnect();
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### 2. Capture Response to Understand Schema
|
|
56
|
+
|
|
57
|
+
Save a raw response to inspect the data structure:
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
page.on("response", async (response) => {
|
|
61
|
+
const url = response.url();
|
|
62
|
+
if (url.includes("UserTweets") || url.includes("/api/data")) {
|
|
63
|
+
const json = await response.json();
|
|
64
|
+
fs.writeFileSync("tmp/api-response.json", JSON.stringify(json, null, 2));
|
|
65
|
+
console.log("Captured response");
|
|
66
|
+
}
|
|
67
|
+
});
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Then analyze the structure to find:
|
|
71
|
+
|
|
72
|
+
- Where the data array lives (e.g., `data.user.result.timeline.instructions[].entries`)
|
|
73
|
+
- Where pagination cursors are (e.g., `cursor-bottom` entries)
|
|
74
|
+
- What fields you need to extract
|
|
75
|
+
|
|
76
|
+
### 3. Replay API with Pagination
|
|
77
|
+
|
|
78
|
+
Once you understand the schema, replay requests directly:
|
|
79
|
+
|
|
80
|
+
```typescript
|
|
81
|
+
import { connect } from "@/client.js";
|
|
82
|
+
import * as fs from "node:fs";
|
|
83
|
+
|
|
84
|
+
const client = await connect();
|
|
85
|
+
const page = await client.page("site");
|
|
86
|
+
|
|
87
|
+
const results = new Map(); // Use Map for deduplication
|
|
88
|
+
const headers = JSON.parse(fs.readFileSync("tmp/request-details.json", "utf8")).headers;
|
|
89
|
+
const baseUrl = "https://example.com/api/data";
|
|
90
|
+
|
|
91
|
+
let cursor = null;
|
|
92
|
+
let hasMore = true;
|
|
93
|
+
|
|
94
|
+
while (hasMore) {
|
|
95
|
+
// Build URL with pagination cursor
|
|
96
|
+
const params = { count: 20 };
|
|
97
|
+
if (cursor) params.cursor = cursor;
|
|
98
|
+
const url = `${baseUrl}?params=${encodeURIComponent(JSON.stringify(params))}`;
|
|
99
|
+
|
|
100
|
+
// Execute fetch in browser context (has auth cookies/headers)
|
|
101
|
+
const response = await page.evaluate(
|
|
102
|
+
async ({ url, headers }) => {
|
|
103
|
+
const res = await fetch(url, { headers });
|
|
104
|
+
return res.json();
|
|
105
|
+
},
|
|
106
|
+
{ url, headers }
|
|
107
|
+
);
|
|
108
|
+
|
|
109
|
+
// Extract data and cursor (adjust paths for your API)
|
|
110
|
+
const entries = response?.data?.entries || [];
|
|
111
|
+
for (const entry of entries) {
|
|
112
|
+
if (entry.type === "cursor-bottom") {
|
|
113
|
+
cursor = entry.value;
|
|
114
|
+
} else if (entry.id && !results.has(entry.id)) {
|
|
115
|
+
results.set(entry.id, {
|
|
116
|
+
id: entry.id,
|
|
117
|
+
text: entry.content,
|
|
118
|
+
timestamp: entry.created_at,
|
|
119
|
+
});
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
console.log(`Fetched page, total: ${results.size}`);
|
|
124
|
+
|
|
125
|
+
// Check stop conditions
|
|
126
|
+
if (!cursor || entries.length === 0) hasMore = false;
|
|
127
|
+
|
|
128
|
+
// Rate limiting - be respectful
|
|
129
|
+
await new Promise((r) => setTimeout(r, 500));
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// Export results
|
|
133
|
+
const data = Array.from(results.values());
|
|
134
|
+
fs.writeFileSync("tmp/results.json", JSON.stringify(data, null, 2));
|
|
135
|
+
console.log(`Saved ${data.length} items`);
|
|
136
|
+
|
|
137
|
+
await client.disconnect();
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Key Patterns
|
|
141
|
+
|
|
142
|
+
| Pattern | Description |
|
|
143
|
+
| ----------------------- | ------------------------------------------------------ |
|
|
144
|
+
| `page.on('request')` | Capture outgoing request URL + headers |
|
|
145
|
+
| `page.on('response')` | Capture response data to understand schema |
|
|
146
|
+
| `page.evaluate(fetch)` | Replay requests in browser context (inherits auth) |
|
|
147
|
+
| `Map` for deduplication | APIs often return overlapping data across pages |
|
|
148
|
+
| Cursor-based pagination | Look for `cursor`, `next_token`, `offset` in responses |
|
|
149
|
+
|
|
150
|
+
## Tips
|
|
151
|
+
|
|
152
|
+
- **Extension mode**: `page.context().cookies()` doesn't work - capture auth headers from intercepted requests instead
|
|
153
|
+
- **Rate limiting**: Add 500ms+ delays between requests to avoid blocks
|
|
154
|
+
- **Stop conditions**: Check for empty results, missing cursor, or reaching a date/ID threshold
|
|
155
|
+
- **GraphQL APIs**: URL params often include `variables` and `features` JSON objects - capture and reuse them
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Start the CDP relay server for Chrome extension mode
|
|
3
|
+
*
|
|
4
|
+
* Usage: npm run start-extension
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { serveRelay } from "@/relay.js";
|
|
8
|
+
|
|
9
|
+
const PORT = parseInt(process.env.PORT || "9222", 10);
|
|
10
|
+
const HOST = process.env.HOST || "127.0.0.1";
|
|
11
|
+
|
|
12
|
+
async function main() {
|
|
13
|
+
const server = await serveRelay({
|
|
14
|
+
port: PORT,
|
|
15
|
+
host: HOST,
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// Handle shutdown
|
|
19
|
+
const shutdown = async () => {
|
|
20
|
+
console.log("\nShutting down relay server...");
|
|
21
|
+
await server.stop();
|
|
22
|
+
process.exit(0);
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
process.on("SIGINT", shutdown);
|
|
26
|
+
process.on("SIGTERM", shutdown);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
main().catch((err) => {
|
|
30
|
+
console.error("Failed to start relay server:", err);
|
|
31
|
+
process.exit(1);
|
|
32
|
+
});
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { serve } from "@/index.js";
|
|
2
|
+
import { execSync } from "child_process";
|
|
3
|
+
import { mkdirSync, existsSync, readdirSync } from "fs";
|
|
4
|
+
import { join, dirname } from "path";
|
|
5
|
+
import { fileURLToPath } from "url";
|
|
6
|
+
|
|
7
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
8
|
+
const tmpDir = join(__dirname, "..", "tmp");
|
|
9
|
+
const profileDir = join(__dirname, "..", "profiles");
|
|
10
|
+
|
|
11
|
+
// Create tmp and profile directories if they don't exist
|
|
12
|
+
console.log("Creating tmp directory...");
|
|
13
|
+
mkdirSync(tmpDir, { recursive: true });
|
|
14
|
+
console.log("Creating profiles directory...");
|
|
15
|
+
mkdirSync(profileDir, { recursive: true });
|
|
16
|
+
|
|
17
|
+
// Install Playwright browsers if not already installed
|
|
18
|
+
console.log("Checking Playwright browser installation...");
|
|
19
|
+
|
|
20
|
+
function findPackageManager(): { name: string; command: string } | null {
|
|
21
|
+
const managers = [
|
|
22
|
+
{ name: "bun", command: "bunx playwright install chromium" },
|
|
23
|
+
{ name: "pnpm", command: "pnpm exec playwright install chromium" },
|
|
24
|
+
{ name: "npm", command: "npx playwright install chromium" },
|
|
25
|
+
];
|
|
26
|
+
|
|
27
|
+
for (const manager of managers) {
|
|
28
|
+
try {
|
|
29
|
+
execSync(`which ${manager.name}`, { stdio: "ignore" });
|
|
30
|
+
return manager;
|
|
31
|
+
} catch {
|
|
32
|
+
// Package manager not found, try next
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return null;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
function isChromiumInstalled(): boolean {
|
|
39
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || "";
|
|
40
|
+
const playwrightCacheDir = join(homeDir, ".cache", "ms-playwright");
|
|
41
|
+
|
|
42
|
+
if (!existsSync(playwrightCacheDir)) {
|
|
43
|
+
return false;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Check for chromium directories (e.g., chromium-1148, chromium_headless_shell-1148)
|
|
47
|
+
try {
|
|
48
|
+
const entries = readdirSync(playwrightCacheDir);
|
|
49
|
+
return entries.some((entry) => entry.startsWith("chromium"));
|
|
50
|
+
} catch {
|
|
51
|
+
return false;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
try {
|
|
56
|
+
if (!isChromiumInstalled()) {
|
|
57
|
+
console.log("Playwright Chromium not found. Installing (this may take a minute)...");
|
|
58
|
+
|
|
59
|
+
const pm = findPackageManager();
|
|
60
|
+
if (!pm) {
|
|
61
|
+
throw new Error("No package manager found (tried bun, pnpm, npm)");
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
console.log(`Using ${pm.name} to install Playwright...`);
|
|
65
|
+
execSync(pm.command, { stdio: "inherit" });
|
|
66
|
+
console.log("Chromium installed successfully.");
|
|
67
|
+
} else {
|
|
68
|
+
console.log("Playwright Chromium already installed.");
|
|
69
|
+
}
|
|
70
|
+
} catch (error) {
|
|
71
|
+
console.error("Failed to install Playwright browsers:", error);
|
|
72
|
+
console.log("You may need to run: npx playwright install chromium");
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Check if server is already running
|
|
76
|
+
console.log("Checking for existing servers...");
|
|
77
|
+
try {
|
|
78
|
+
const res = await fetch("http://localhost:9222", {
|
|
79
|
+
signal: AbortSignal.timeout(1000),
|
|
80
|
+
});
|
|
81
|
+
if (res.ok) {
|
|
82
|
+
console.log("Server already running on port 9222");
|
|
83
|
+
process.exit(0);
|
|
84
|
+
}
|
|
85
|
+
} catch {
|
|
86
|
+
// Server not running, continue to start
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Clean up stale CDP port if HTTP server isn't running (crash recovery)
|
|
90
|
+
// This handles the case where Node crashed but Chrome is still running on 9223
|
|
91
|
+
try {
|
|
92
|
+
const pid = execSync("lsof -ti:9223", { encoding: "utf-8" }).trim();
|
|
93
|
+
if (pid) {
|
|
94
|
+
console.log(`Cleaning up stale Chrome process on CDP port 9223 (PID: ${pid})`);
|
|
95
|
+
execSync(`kill -9 ${pid}`);
|
|
96
|
+
}
|
|
97
|
+
} catch {
|
|
98
|
+
// No process on CDP port, which is expected
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
console.log("Starting dev browser server...");
|
|
102
|
+
const headless = process.env.HEADLESS === "true";
|
|
103
|
+
const server = await serve({
|
|
104
|
+
port: 9222,
|
|
105
|
+
headless,
|
|
106
|
+
profileDir,
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
console.log(`Dev browser server started`);
|
|
110
|
+
console.log(` WebSocket: ${server.wsEndpoint}`);
|
|
111
|
+
console.log(` Tmp directory: ${tmpDir}`);
|
|
112
|
+
console.log(` Profile directory: ${profileDir}`);
|
|
113
|
+
console.log(`\nReady`);
|
|
114
|
+
console.log(`\nPress Ctrl+C to stop`);
|
|
115
|
+
|
|
116
|
+
// Keep the process running
|
|
117
|
+
await new Promise(() => {});
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/bin/bash

# Launch the dev-browser server from this skill's directory.
# Usage: server.sh [--headless]

# Run everything relative to the directory containing this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$SCRIPT_DIR"

# Parse command line flags; anything unrecognized is a hard error.
HEADLESS=false
for arg in "$@"; do
  case "$arg" in
    --headless) HEADLESS=true ;;
    *) echo "Unknown parameter: $arg"; exit 1 ;;
  esac
done

echo "Installing dependencies..."
npm install

echo "Starting dev-browser server..."
# start-server.ts reads HEADLESS from the environment.
export HEADLESS
npx tsx scripts/start-server.ts
|