orangeslice 1.4.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/browser.d.ts +68 -0
- package/dist/browser.js +114 -0
- package/dist/index.d.ts +14 -2
- package/dist/index.js +11 -2
- package/docs/AGENTS.md +107 -4
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,7 +14,8 @@ This copies documentation to `./orangeslice-docs/` and installs the package. Poi
|
|
|
14
14
|
|----------|------------|
|
|
15
15
|
| `b2b` | Query 1B+ LinkedIn profiles, companies, funding, jobs |
|
|
16
16
|
| `serp` | Google search for news, articles, reviews |
|
|
17
|
-
| `firecrawl` | Scrape websites, extract social URLs |
|
|
17
|
+
| `firecrawl` | Scrape static websites, extract social URLs |
|
|
18
|
+
| `browser` | Playwright automation for dynamic/JS sites |
|
|
18
19
|
|
|
19
20
|
## Quick Example
|
|
20
21
|
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/** Shape of the JSON body that `execute()`, `snapshot()` and `text()` resolve to. */
export interface BrowserResponse {
|
|
2
|
+
/** `true` when the submitted code ran and `result` is populated; `false` when `error` is set instead. */
success: boolean;
|
|
3
|
+
/** Whatever the executed snippet returned; its shape depends entirely on the snippet, hence `any`. */
result?: any;
|
|
4
|
+
/** Error message reported when `success` is `false`. */
error?: string;
|
|
5
|
+
/** NOTE(review): presumably a URL for watching the remote browser session live; not described anywhere in this package, confirm against the API. */
browser_live_view_url?: string;
|
|
6
|
+
}
|
|
7
|
+
/** Optional settings for `execute()`; every field that is present is sent alongside `code` in the request body. */
export interface BrowserOptions {
|
|
8
|
+
/** Browser pool ID to draw from; omit to use the default pre-warmed pool */
pool?: string;
|
|
10
|
+
/** Execution timeout in seconds (the usage notes state default 60, max 180) */
timeout_sec?: number;
|
|
12
|
+
/** Timeout, in seconds, for acquiring a browser from the pool */
acquire_timeout_seconds?: number;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Execute Playwright code with `page` in scope.
|
|
17
|
+
* Browser is automatically acquired from a pre-warmed pool and released when done.
|
|
18
|
+
*
|
|
19
|
+
* @param code - Playwright code to execute (has `page` in scope)
|
|
20
|
+
* @param options - Optional settings for timeout and pool
|
|
21
|
+
*
|
|
22
|
+
* @example
|
|
23
|
+
* // Get page snapshot for analysis
|
|
24
|
+
* const response = await browser.execute(`
|
|
25
|
+
* await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
26
|
+
* return await page._snapshotForAI();
|
|
27
|
+
* `);
|
|
28
|
+
*
|
|
29
|
+
* @example
|
|
30
|
+
* // Extract data from page
|
|
31
|
+
* const response = await browser.execute(`
|
|
32
|
+
* await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
33
|
+
* return await page.evaluate(() => {
|
|
34
|
+
* return [...document.querySelectorAll('.item')].map(el => ({
|
|
35
|
+
* title: el.querySelector('h2')?.textContent?.trim(),
|
|
36
|
+
* url: el.querySelector('a')?.href
|
|
37
|
+
* }));
|
|
38
|
+
* });
|
|
39
|
+
* `);
|
|
40
|
+
* // response = { success: true, result: [...] }
|
|
41
|
+
*/
|
|
42
|
+
export declare function execute(code: string, options?: BrowserOptions): Promise<BrowserResponse>;
|
|
43
|
+
/**
|
|
44
|
+
* Get a page snapshot for AI analysis.
|
|
45
|
+
* Useful for discovering selectors before extraction.
|
|
46
|
+
*
|
|
47
|
+
* @param url - URL to navigate to
|
|
48
|
+
*
|
|
49
|
+
* @example
|
|
50
|
+
* const snapshot = await browser.snapshot("https://example.com/products");
|
|
51
|
+
* // Returns page HTML structure for selector discovery
|
|
52
|
+
*/
|
|
53
|
+
export declare function snapshot(url: string): Promise<BrowserResponse>;
|
|
54
|
+
/**
|
|
55
|
+
* Extract text content from a URL.
|
|
56
|
+
*
|
|
57
|
+
* @param url - URL to navigate to
|
|
58
|
+
*
|
|
59
|
+
* @example
|
|
60
|
+
* const response = await browser.text("https://example.com");
|
|
61
|
+
* // response.result = page text content
|
|
62
|
+
*/
|
|
63
|
+
export declare function text(url: string): Promise<BrowserResponse>;
|
|
64
|
+
export declare const browser: {
|
|
65
|
+
execute: typeof execute;
|
|
66
|
+
snapshot: typeof snapshot;
|
|
67
|
+
text: typeof text;
|
|
68
|
+
};
|
package/dist/browser.js
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.browser = void 0;
|
|
4
|
+
exports.execute = execute;
|
|
5
|
+
exports.snapshot = snapshot;
|
|
6
|
+
exports.text = text;
|
|
7
|
+
const queue_1 = require("./queue");
|
|
8
|
+
// Endpoint every browser request is POSTed to. ORANGESLICE_API_URL overrides the
// default; because of `||`, an unset OR empty value falls back to the default URL.
const API_URL = process.env.ORANGESLICE_API_URL || "https://orangeslice.ai/api/function?functionId=browser";
|
|
9
|
+
// Shared queue for browser requests (limit concurrent browser sessions).
// Created with a limit of 2; the usage notes list `browser` as "2 concurrent".
|
|
10
|
+
const queue = (0, queue_1.createQueue)(2);
|
|
11
|
+
const rateLimiter = (0, queue_1.createRateLimiter)(500); // 500ms between requests
|
|
12
|
+
/**
|
|
13
|
+
* Helper to make POST request, handling redirects manually
|
|
14
|
+
* (Node.js fetch has issues with POST body on redirects)
|
|
15
|
+
*/
|
|
16
|
+
async function fetchWithRedirect(url, body) {
|
|
17
|
+
let response = await fetch(url, {
|
|
18
|
+
method: "POST",
|
|
19
|
+
headers: { "Content-Type": "application/json" },
|
|
20
|
+
body,
|
|
21
|
+
redirect: "manual",
|
|
22
|
+
});
|
|
23
|
+
// Handle redirect manually - re-POST to the new location
|
|
24
|
+
if (response.status >= 300 && response.status < 400) {
|
|
25
|
+
const location = response.headers.get("location");
|
|
26
|
+
if (location) {
|
|
27
|
+
response = await fetch(location, {
|
|
28
|
+
method: "POST",
|
|
29
|
+
headers: { "Content-Type": "application/json" },
|
|
30
|
+
body,
|
|
31
|
+
});
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
return response;
|
|
35
|
+
}
|
|
36
|
+
/**
 * Run a snippet of Playwright code remotely; the snippet has `page` in scope.
 *
 * The call first waits for a slot in the shared `queue`, then passes through
 * `rateLimiter`, and only then POSTs `{ code, ...options }` as JSON to `API_URL`.
 * Per the package notes the browser comes from a pre-warmed pool and is released
 * when the run finishes; nothing in this function manages that lifecycle.
 *
 * The snippet is sent as plain text, so local variables are NOT visible inside it:
 * inline any values (such as URLs) into the code string.
 *
 * @param code - Playwright code to execute (has `page` in scope)
 * @param options - Optional settings for timeout and pool, merged into the request body
 * @returns The parsed JSON response body
 * @throws Error `Browser request failed: <status> <statusText>` when the HTTP response is not ok
 *
 * @example
 * // Get page snapshot for analysis
 * const response = await browser.execute(`
 *   await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
 *   return await page._snapshotForAI();
 * `);
 *
 * @example
 * // Extract data from page
 * const response = await browser.execute(`
 *   await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
 *   return await page.evaluate(() => {
 *     return [...document.querySelectorAll('.item')].map(el => ({
 *       title: el.querySelector('h2')?.textContent?.trim(),
 *       url: el.querySelector('a')?.href
 *     }));
 *   });
 * `);
 * // response = { success: true, result: [...] }
 */
async function execute(code, options = {}) {
    // Innermost step: a single POST to the browser endpoint.
    const sendRequest = async () => {
        const payload = JSON.stringify({ code, ...options });
        const res = await fetchWithRedirect(API_URL, payload);
        if (res.ok) {
            return (await res.json());
        }
        throw new Error(`Browser request failed: ${res.status} ${res.statusText}`);
    };
    // Concurrency slot first, rate-limit spacing second.
    const rateLimited = async () => rateLimiter(sendRequest);
    return queue(rateLimited);
}
|
|
76
|
+
/**
|
|
77
|
+
* Get a page snapshot for AI analysis.
|
|
78
|
+
* Useful for discovering selectors before extraction.
|
|
79
|
+
*
|
|
80
|
+
* @param url - URL to navigate to
|
|
81
|
+
*
|
|
82
|
+
* @example
|
|
83
|
+
* const snapshot = await browser.snapshot("https://example.com/products");
|
|
84
|
+
* // Returns page HTML structure for selector discovery
|
|
85
|
+
*/
|
|
86
|
+
async function snapshot(url) {
// Build the script that is sent for remote execution. JSON.stringify(url) emits a
// quoted, escaped string literal, so the URL is embedded as data inside the script.
|
|
87
|
+
const code = `
|
|
88
|
+
await page.goto(${JSON.stringify(url)}, { waitUntil: 'domcontentloaded' });
|
|
89
|
+
return await page._snapshotForAI();
|
|
90
|
+
`;
|
|
91
|
+
// Delegates to execute() with default options and returns its promise unchanged.
return execute(code);
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Extract text content from a URL.
|
|
95
|
+
*
|
|
96
|
+
* @param url - URL to navigate to
|
|
97
|
+
*
|
|
98
|
+
* @example
|
|
99
|
+
* const response = await browser.text("https://example.com");
|
|
100
|
+
* // response.result = page text content
|
|
101
|
+
*/
|
|
102
|
+
async function text(url) {
// Build the script that is sent for remote execution. JSON.stringify(url) emits a
// quoted, escaped string literal, so the URL is embedded as data inside the script.
|
|
103
|
+
const code = `
|
|
104
|
+
await page.goto(${JSON.stringify(url)}, { waitUntil: 'domcontentloaded' });
|
|
105
|
+
return await page.evaluate(() => document.body.innerText);
|
|
106
|
+
`;
|
|
107
|
+
// Delegates to execute() with default options; the page text arrives in `result` of the response.
return execute(code);
|
|
108
|
+
}
|
|
109
|
+
// Export as namespace
|
|
110
|
+
// Bundles the three functions (each also exported individually as
// exports.execute / exports.snapshot / exports.text) into a single object.
exports.browser = {
|
|
111
|
+
execute,
|
|
112
|
+
snapshot,
|
|
113
|
+
text,
|
|
114
|
+
};
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { b2b } from "./b2b";
|
|
2
2
|
import { serp } from "./serp";
|
|
3
3
|
import { firecrawl } from "./firecrawl";
|
|
4
|
-
|
|
4
|
+
import { browser } from "./browser";
|
|
5
|
+
export { b2b, serp, firecrawl, browser };
|
|
5
6
|
/**
|
|
6
7
|
* Main orangeslice namespace - AI sales agent toolkit
|
|
7
8
|
*
|
|
@@ -14,9 +15,15 @@ export { b2b, serp, firecrawl };
|
|
|
14
15
|
* // Google Search
|
|
15
16
|
* const results = await orangeslice.serp.search("best CRM software 2024");
|
|
16
17
|
*
|
|
17
|
-
* // Website Scraping
|
|
18
|
+
* // Website Scraping (simple)
|
|
18
19
|
* const page = await orangeslice.firecrawl.scrape("https://stripe.com/about");
|
|
19
20
|
*
|
|
21
|
+
* // Browser Automation (Playwright)
|
|
22
|
+
* const data = await orangeslice.browser.execute(`
|
|
23
|
+
* await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
24
|
+
* return await page.evaluate(() => document.title);
|
|
25
|
+
* `);
|
|
26
|
+
*
|
|
20
27
|
* // All calls are automatically rate-limited and queued
|
|
21
28
|
*/
|
|
22
29
|
export declare const orangeslice: {
|
|
@@ -34,5 +41,10 @@ export declare const orangeslice: {
|
|
|
34
41
|
markdown: typeof import("./firecrawl").markdown;
|
|
35
42
|
socials: typeof import("./firecrawl").socials;
|
|
36
43
|
};
|
|
44
|
+
browser: {
|
|
45
|
+
execute: typeof import("./browser").execute;
|
|
46
|
+
snapshot: typeof import("./browser").snapshot;
|
|
47
|
+
text: typeof import("./browser").text;
|
|
48
|
+
};
|
|
37
49
|
};
|
|
38
50
|
export default orangeslice;
|
package/dist/index.js
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.orangeslice = exports.firecrawl = exports.serp = exports.b2b = void 0;
|
|
3
|
+
exports.orangeslice = exports.browser = exports.firecrawl = exports.serp = exports.b2b = void 0;
|
|
4
4
|
const b2b_1 = require("./b2b");
|
|
5
5
|
Object.defineProperty(exports, "b2b", { enumerable: true, get: function () { return b2b_1.b2b; } });
|
|
6
6
|
const serp_1 = require("./serp");
|
|
7
7
|
Object.defineProperty(exports, "serp", { enumerable: true, get: function () { return serp_1.serp; } });
|
|
8
8
|
const firecrawl_1 = require("./firecrawl");
|
|
9
9
|
Object.defineProperty(exports, "firecrawl", { enumerable: true, get: function () { return firecrawl_1.firecrawl; } });
|
|
10
|
+
const browser_1 = require("./browser");
|
|
11
|
+
Object.defineProperty(exports, "browser", { enumerable: true, get: function () { return browser_1.browser; } });
|
|
10
12
|
/**
|
|
11
13
|
* Main orangeslice namespace - AI sales agent toolkit
|
|
12
14
|
*
|
|
@@ -19,14 +21,21 @@ Object.defineProperty(exports, "firecrawl", { enumerable: true, get: function ()
|
|
|
19
21
|
* // Google Search
|
|
20
22
|
* const results = await orangeslice.serp.search("best CRM software 2024");
|
|
21
23
|
*
|
|
22
|
-
* // Website Scraping
|
|
24
|
+
* // Website Scraping (simple)
|
|
23
25
|
* const page = await orangeslice.firecrawl.scrape("https://stripe.com/about");
|
|
24
26
|
*
|
|
27
|
+
* // Browser Automation (Playwright)
|
|
28
|
+
* const data = await orangeslice.browser.execute(`
|
|
29
|
+
* await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
30
|
+
* return await page.evaluate(() => document.title);
|
|
31
|
+
* `);
|
|
32
|
+
*
|
|
25
33
|
* // All calls are automatically rate-limited and queued
|
|
26
34
|
*/
|
|
27
35
|
exports.orangeslice = {
|
|
28
36
|
b2b: b2b_1.b2b,
|
|
29
37
|
serp: serp_1.serp,
|
|
30
38
|
firecrawl: firecrawl_1.firecrawl,
|
|
39
|
+
browser: browser_1.browser,
|
|
31
40
|
};
|
|
32
41
|
exports.default = exports.orangeslice;
|
package/docs/AGENTS.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
You are a B2B sales research agent with access to:
|
|
4
4
|
- **1.15 billion LinkedIn profiles** and millions of companies
|
|
5
5
|
- **Google Search** (SERP API)
|
|
6
|
-
- **Website scraping** (Firecrawl)
|
|
6
|
+
- **Website scraping** (Firecrawl + Browser automation)
|
|
7
7
|
|
|
8
8
|
## What You Can Do
|
|
9
9
|
|
|
@@ -14,8 +14,8 @@ You are a B2B sales research agent with access to:
|
|
|
14
14
|
| **Employee lookup** | `b2b` | Search employees by title, role, or department |
|
|
15
15
|
| **Funding intelligence** | `b2b` | Find recently funded companies and their investors |
|
|
16
16
|
| **Google search** | `serp` | Search for company news, press releases, reviews |
|
|
17
|
-
| **Website scraping** | `firecrawl` | Extract content from
|
|
18
|
-
|
|
17
|
+
| **Website scraping** | `firecrawl` | Extract content from static websites |
|
|
18
|
+
| **Browser automation** | `browser` | Scrape dynamic/JS sites with Playwright |
|
|
19
19
|
|
|
20
20
|
## Quick Start
|
|
21
21
|
|
|
@@ -31,8 +31,14 @@ const company = await orangeslice.b2b.sql(`
|
|
|
31
31
|
// 2. Google Search - Find news, articles, reviews
|
|
32
32
|
const news = await orangeslice.serp.search("Stripe funding 2024");
|
|
33
33
|
|
|
34
|
-
// 3. Website Scraping -
|
|
34
|
+
// 3. Website Scraping (simple) - Static pages
|
|
35
35
|
const about = await orangeslice.firecrawl.scrape("https://stripe.com/about");
|
|
36
|
+
|
|
37
|
+
// 4. Browser Automation (advanced) - Dynamic/JS pages
|
|
38
|
+
const data = await orangeslice.browser.execute(`
|
|
39
|
+
await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
40
|
+
return await page.evaluate(() => document.title);
|
|
41
|
+
`);
|
|
36
42
|
```
|
|
37
43
|
|
|
38
44
|
All calls are automatically rate-limited. Fire away freely.
|
|
@@ -184,6 +190,102 @@ const site = await orangeslice.firecrawl.scrape("https://company.com", 5);
|
|
|
184
190
|
|
|
185
191
|
---
|
|
186
192
|
|
|
193
|
+
## Browser Automation (Playwright)
|
|
194
|
+
|
|
195
|
+
Execute Playwright code with `page` in scope. Use for dynamic/JS-rendered pages that Firecrawl can't handle.
|
|
196
|
+
|
|
197
|
+
**When to use Browser vs Firecrawl:**
|
|
198
|
+
- `firecrawl` - Static pages, simple content extraction
|
|
199
|
+
- `browser` - Dynamic pages, JS rendering, complex interactions, bot-protected sites
|
|
200
|
+
|
|
201
|
+
```typescript
|
|
202
|
+
// Execute Playwright code - page is already available
|
|
203
|
+
const response = await orangeslice.browser.execute(`
|
|
204
|
+
await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
205
|
+
return await page.evaluate(() => {
|
|
206
|
+
return [...document.querySelectorAll('.item')].map(el => ({
|
|
207
|
+
title: el.querySelector('h2')?.textContent?.trim(),
|
|
208
|
+
url: el.querySelector('a')?.href
|
|
209
|
+
}));
|
|
210
|
+
});
|
|
211
|
+
`);
|
|
212
|
+
// response = { success: true, result: [...] } or { success: false, error: "..." }
|
|
213
|
+
|
|
214
|
+
// Get page snapshot for selector discovery
|
|
215
|
+
const snapshot = await orangeslice.browser.snapshot("https://example.com");
|
|
216
|
+
|
|
217
|
+
// Just get text content
|
|
218
|
+
const text = await orangeslice.browser.text("https://example.com");
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### Workflow: Analyze → Extract
|
|
222
|
+
|
|
223
|
+
**Step 1: Discover selectors first**
|
|
224
|
+
```typescript
|
|
225
|
+
const response = await orangeslice.browser.execute(`
|
|
226
|
+
await page.goto("https://example.com/products", { waitUntil: 'domcontentloaded' });
|
|
227
|
+
return await page._snapshotForAI(); // Get page structure
|
|
228
|
+
`);
|
|
229
|
+
// Analyze snapshot to find selectors
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**Step 2: Extract with discovered selectors**
|
|
233
|
+
```typescript
|
|
234
|
+
const response = await orangeslice.browser.execute(`
|
|
235
|
+
await page.goto("https://example.com/products", { waitUntil: 'domcontentloaded' });
|
|
236
|
+
return await page.evaluate(() => {
|
|
237
|
+
return [...document.querySelectorAll('.discovered-selector')].map(e => ({
|
|
238
|
+
name: e.querySelector('h2')?.textContent?.trim(),
|
|
239
|
+
price: e.querySelector('.price')?.textContent?.trim()
|
|
240
|
+
}));
|
|
241
|
+
});
|
|
242
|
+
`);
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Bot Protection
|
|
246
|
+
|
|
247
|
+
For bot-protected sites, use single-session navigation:
|
|
248
|
+
|
|
249
|
+
```typescript
|
|
250
|
+
const response = await orangeslice.browser.execute(`
|
|
251
|
+
// Navigate to entry page (passes bot check once)
|
|
252
|
+
await page.goto("https://example.com/listing", { waitUntil: 'domcontentloaded' });
|
|
253
|
+
|
|
254
|
+
// Get all URLs to visit
|
|
255
|
+
const urls = await page.evaluate(() =>
|
|
256
|
+
[...document.querySelectorAll('a.link')].map(a => a.href)
|
|
257
|
+
);
|
|
258
|
+
|
|
259
|
+
// Visit each IN THE SAME SESSION
|
|
260
|
+
const results = [];
|
|
261
|
+
for (const url of urls.slice(0, 10)) {
|
|
262
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
263
|
+
const data = await page.evaluate(() => ({
|
|
264
|
+
title: document.querySelector('h1')?.textContent?.trim()
|
|
265
|
+
}));
|
|
266
|
+
results.push(data);
|
|
267
|
+
}
|
|
268
|
+
return results;
|
|
269
|
+
`);
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Options
|
|
273
|
+
|
|
274
|
+
| Option | Type | Description |
|
|
275
|
+
|--------|------|-------------|
|
|
276
|
+
| `timeout_sec` | number | Execution timeout in seconds (default 60, max 180) |
|
|
277
|
+
| `acquire_timeout_seconds` | number | Timeout in seconds for acquiring a browser from the pool |
|
|
278
|
+
|
|
279
|
+
### Rules
|
|
280
|
+
|
|
281
|
+
1. **Always use `{ waitUntil: 'domcontentloaded' }`** - Prevents hanging
|
|
282
|
+
2. **Check `response.success`** - Don't just destructure `result`
|
|
283
|
+
3. **Analyze before extracting** - Use `_snapshotForAI()` to find selectors
|
|
284
|
+
4. **Return objects, not HTML** - Use `page.evaluate()` to extract structured data
|
|
285
|
+
5. **3 minute hard limit** - Plan multi-page scrapes accordingly
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
187
289
|
## Key Tables
|
|
188
290
|
|
|
189
291
|
| Table | Records | Use For |
|
|
@@ -271,6 +373,7 @@ The `orangeslice` package automatically handles rate limiting:
|
|
|
271
373
|
| `b2b` | 2 concurrent | 100ms |
|
|
272
374
|
| `serp` | 2 concurrent | 200ms |
|
|
273
375
|
| `firecrawl` | 2 concurrent | 500ms |
|
|
376
|
+
| `browser` | 2 concurrent | 500ms |
|
|
274
377
|
|
|
275
378
|
You can fire off many calls - they'll be queued automatically.
|
|
276
379
|
|