orangeslice 1.4.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,7 +14,8 @@ This copies documentation to `./orangeslice-docs/` and installs the package. Poi
14
14
  |----------|------------|
15
15
  | `b2b` | Query 1B+ LinkedIn profiles, companies, funding, jobs |
16
16
  | `serp` | Google search for news, articles, reviews |
17
- | `firecrawl` | Scrape websites, extract social URLs |
17
+ | `firecrawl` | Scrape static websites, extract social URLs |
18
+ | `browser` | Playwright automation for dynamic/JS sites |
18
19
 
19
20
  ## Quick Example
20
21
 
@@ -0,0 +1,68 @@
1
+ export interface BrowserResponse {
2
+ success: boolean;
3
+ result?: any;
4
+ error?: string;
5
+ browser_live_view_url?: string;
6
+ }
7
+ export interface BrowserOptions {
8
+ /** Browser pool ID (default: pre-warmed pool) */
9
+ pool?: string;
10
+ /** Execution timeout in seconds */
11
+ timeout_sec?: number;
12
+ /** Timeout for acquiring browser from pool */
13
+ acquire_timeout_seconds?: number;
14
+ }
15
+ /**
16
+ * Execute Playwright code with `page` in scope.
17
+ * Browser is automatically acquired from a pre-warmed pool and released when done.
18
+ *
19
+ * @param code - Playwright code to execute (has `page` in scope)
20
+ * @param options - Optional settings for timeout and pool
21
+ *
22
+ * @example
23
+ * // Get page snapshot for analysis
24
+ * const response = await browser.execute(`
25
+ * await page.goto(url, { waitUntil: 'domcontentloaded' });
26
+ * return await page._snapshotForAI();
27
+ * `);
28
+ *
29
+ * @example
30
+ * // Extract data from page
31
+ * const response = await browser.execute(`
32
+ * await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
33
+ * return await page.evaluate(() => {
34
+ * return [...document.querySelectorAll('.item')].map(el => ({
35
+ * title: el.querySelector('h2')?.textContent?.trim(),
36
+ * url: el.querySelector('a')?.href
37
+ * }));
38
+ * });
39
+ * `);
40
+ * // response = { success: true, result: [...] }
41
+ */
42
+ export declare function execute(code: string, options?: BrowserOptions): Promise<BrowserResponse>;
43
+ /**
44
+ * Get a page snapshot for AI analysis.
45
+ * Useful for discovering selectors before extraction.
46
+ *
47
+ * @param url - URL to navigate to
48
+ *
49
+ * @example
50
+ * const snapshot = await browser.snapshot("https://example.com/products");
51
+ * // Returns page HTML structure for selector discovery
52
+ */
53
+ export declare function snapshot(url: string): Promise<BrowserResponse>;
54
+ /**
55
+ * Extract text content from a URL.
56
+ *
57
+ * @param url - URL to navigate to
58
+ *
59
+ * @example
60
+ * const response = await browser.text("https://example.com");
61
+ * // response.result = page text content
62
+ */
63
+ export declare function text(url: string): Promise<BrowserResponse>;
64
+ export declare const browser: {
65
+ execute: typeof execute;
66
+ snapshot: typeof snapshot;
67
+ text: typeof text;
68
+ };
@@ -0,0 +1,114 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.browser = void 0;
4
+ exports.execute = execute;
5
+ exports.snapshot = snapshot;
6
+ exports.text = text;
7
+ const queue_1 = require("./queue");
8
+ const API_URL = process.env.ORANGESLICE_API_URL || "https://orangeslice.ai/api/function?functionId=browser";
9
+ // Shared queue for browser requests (limit concurrent browser sessions)
10
+ const queue = (0, queue_1.createQueue)(2);
11
+ const rateLimiter = (0, queue_1.createRateLimiter)(500); // 500ms between requests
12
+ /**
13
+ * Helper to make POST request, handling redirects manually
14
+ * (Node.js fetch has issues with POST body on redirects)
15
+ */
16
+ async function fetchWithRedirect(url, body) {
17
+ let response = await fetch(url, {
18
+ method: "POST",
19
+ headers: { "Content-Type": "application/json" },
20
+ body,
21
+ redirect: "manual",
22
+ });
23
+ // Handle redirect manually - re-POST to the new location
24
+ if (response.status >= 300 && response.status < 400) {
25
+ const location = response.headers.get("location");
26
+ if (location) {
27
+ response = await fetch(location, {
28
+ method: "POST",
29
+ headers: { "Content-Type": "application/json" },
30
+ body,
31
+ });
32
+ }
33
+ }
34
+ return response;
35
+ }
36
+ /**
37
+ * Execute Playwright code with `page` in scope.
38
+ * Browser is automatically acquired from a pre-warmed pool and released when done.
39
+ *
40
+ * @param code - Playwright code to execute (has `page` in scope)
41
+ * @param options - Optional settings for timeout and pool
42
+ *
43
+ * @example
44
+ * // Get page snapshot for analysis
45
+ * const response = await browser.execute(`
46
+ * await page.goto(url, { waitUntil: 'domcontentloaded' });
47
+ * return await page._snapshotForAI();
48
+ * `);
49
+ *
50
+ * @example
51
+ * // Extract data from page
52
+ * const response = await browser.execute(`
53
+ * await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
54
+ * return await page.evaluate(() => {
55
+ * return [...document.querySelectorAll('.item')].map(el => ({
56
+ * title: el.querySelector('h2')?.textContent?.trim(),
57
+ * url: el.querySelector('a')?.href
58
+ * }));
59
+ * });
60
+ * `);
61
+ * // response = { success: true, result: [...] }
62
+ */
63
+ async function execute(code, options = {}) {
64
+ return queue(async () => {
65
+ return rateLimiter(async () => {
66
+ const body = JSON.stringify({ code, ...options });
67
+ const response = await fetchWithRedirect(API_URL, body);
68
+ if (!response.ok) {
69
+ throw new Error(`Browser request failed: ${response.status} ${response.statusText}`);
70
+ }
71
+ const data = (await response.json());
72
+ return data;
73
+ });
74
+ });
75
+ }
76
+ /**
77
+ * Get a page snapshot for AI analysis.
78
+ * Useful for discovering selectors before extraction.
79
+ *
80
+ * @param url - URL to navigate to
81
+ *
82
+ * @example
83
+ * const snapshot = await browser.snapshot("https://example.com/products");
84
+ * // Returns page HTML structure for selector discovery
85
+ */
86
+ async function snapshot(url) {
87
+ const code = `
88
+ await page.goto(${JSON.stringify(url)}, { waitUntil: 'domcontentloaded' });
89
+ return await page._snapshotForAI();
90
+ `;
91
+ return execute(code);
92
+ }
93
+ /**
94
+ * Extract text content from a URL.
95
+ *
96
+ * @param url - URL to navigate to
97
+ *
98
+ * @example
99
+ * const response = await browser.text("https://example.com");
100
+ * // response.result = page text content
101
+ */
102
+ async function text(url) {
103
+ const code = `
104
+ await page.goto(${JSON.stringify(url)}, { waitUntil: 'domcontentloaded' });
105
+ return await page.evaluate(() => document.body.innerText);
106
+ `;
107
+ return execute(code);
108
+ }
109
+ // Export as namespace
110
+ exports.browser = {
111
+ execute,
112
+ snapshot,
113
+ text,
114
+ };
package/dist/index.d.ts CHANGED
@@ -1,7 +1,8 @@
1
1
  import { b2b } from "./b2b";
2
2
  import { serp } from "./serp";
3
3
  import { firecrawl } from "./firecrawl";
4
- export { b2b, serp, firecrawl };
4
+ import { browser } from "./browser";
5
+ export { b2b, serp, firecrawl, browser };
5
6
  /**
6
7
  * Main orangeslice namespace - AI sales agent toolkit
7
8
  *
@@ -14,9 +15,15 @@ export { b2b, serp, firecrawl };
14
15
  * // Google Search
15
16
  * const results = await orangeslice.serp.search("best CRM software 2024");
16
17
  *
17
- * // Website Scraping
18
+ * // Website Scraping (simple)
18
19
  * const page = await orangeslice.firecrawl.scrape("https://stripe.com/about");
19
20
  *
21
+ * // Browser Automation (Playwright)
22
+ * const data = await orangeslice.browser.execute(`
23
+ * await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
24
+ * return await page.evaluate(() => document.title);
25
+ * `);
26
+ *
20
27
  * // All calls are automatically rate-limited and queued
21
28
  */
22
29
  export declare const orangeslice: {
@@ -34,5 +41,10 @@ export declare const orangeslice: {
34
41
  markdown: typeof import("./firecrawl").markdown;
35
42
  socials: typeof import("./firecrawl").socials;
36
43
  };
44
+ browser: {
45
+ execute: typeof import("./browser").execute;
46
+ snapshot: typeof import("./browser").snapshot;
47
+ text: typeof import("./browser").text;
48
+ };
37
49
  };
38
50
  export default orangeslice;
package/dist/index.js CHANGED
@@ -1,12 +1,14 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.orangeslice = exports.firecrawl = exports.serp = exports.b2b = void 0;
3
+ exports.orangeslice = exports.browser = exports.firecrawl = exports.serp = exports.b2b = void 0;
4
4
  const b2b_1 = require("./b2b");
5
5
  Object.defineProperty(exports, "b2b", { enumerable: true, get: function () { return b2b_1.b2b; } });
6
6
  const serp_1 = require("./serp");
7
7
  Object.defineProperty(exports, "serp", { enumerable: true, get: function () { return serp_1.serp; } });
8
8
  const firecrawl_1 = require("./firecrawl");
9
9
  Object.defineProperty(exports, "firecrawl", { enumerable: true, get: function () { return firecrawl_1.firecrawl; } });
10
+ const browser_1 = require("./browser");
11
+ Object.defineProperty(exports, "browser", { enumerable: true, get: function () { return browser_1.browser; } });
10
12
  /**
11
13
  * Main orangeslice namespace - AI sales agent toolkit
12
14
  *
@@ -19,14 +21,21 @@ Object.defineProperty(exports, "firecrawl", { enumerable: true, get: function ()
19
21
  * // Google Search
20
22
  * const results = await orangeslice.serp.search("best CRM software 2024");
21
23
  *
22
- * // Website Scraping
24
+ * // Website Scraping (simple)
23
25
  * const page = await orangeslice.firecrawl.scrape("https://stripe.com/about");
24
26
  *
27
+ * // Browser Automation (Playwright)
28
+ * const data = await orangeslice.browser.execute(`
29
+ * await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
30
+ * return await page.evaluate(() => document.title);
31
+ * `);
32
+ *
25
33
  * // All calls are automatically rate-limited and queued
26
34
  */
27
35
  exports.orangeslice = {
28
36
  b2b: b2b_1.b2b,
29
37
  serp: serp_1.serp,
30
38
  firecrawl: firecrawl_1.firecrawl,
39
+ browser: browser_1.browser,
31
40
  };
32
41
  exports.default = exports.orangeslice;
package/docs/AGENTS.md CHANGED
@@ -1,300 +1,445 @@
1
- # Sales Agent
1
+ # Sales Research Agent
2
2
 
3
3
  You are a B2B sales research agent with access to:
4
4
  - **1.15 billion LinkedIn profiles** and millions of companies
5
5
  - **Google Search** (SERP API)
6
- - **Website scraping** (Firecrawl)
7
-
8
- ## What You Can Do
9
-
10
- | Capability | Tool | Example |
11
- |------------|------|---------|
12
- | **Company research** | `b2b` | Look up any company by domain, name, or LinkedIn URL |
13
- | **Find decision makers** | `b2b` | Find C-suite, VPs, Directors at target companies |
14
- | **Employee lookup** | `b2b` | Search employees by title, role, or department |
15
- | **Funding intelligence** | `b2b` | Find recently funded companies and their investors |
16
- | **Google search** | `serp` | Search for company news, press releases, reviews |
17
- | **Website scraping** | `firecrawl` | Extract content from company websites |
6
+ - **Website scraping** (Firecrawl + Browser automation)
18
7
 
8
+ ---
19
9
 
20
- ## Quick Start
10
+ ## Tools
21
11
 
22
12
  ```typescript
23
13
  import { orangeslice } from 'orangeslice';
24
14
 
25
- // 1. B2B Database - Company & people research
26
- const company = await orangeslice.b2b.sql(`
27
- SELECT company_name, domain, employee_count, description
28
- FROM linkedin_company WHERE domain = 'stripe.com'
29
- `);
15
+ // B2B Database - 1.15B profiles, millions of companies
16
+ orangeslice.b2b.sql(query)
17
+
18
+ // Google Search
19
+ orangeslice.serp.search(query, options?)
30
20
 
31
- // 2. Google Search - Find news, articles, reviews
32
- const news = await orangeslice.serp.search("Stripe funding 2024");
21
+ // Website Scraping (simple)
22
+ orangeslice.firecrawl.scrape(url, limit?)
33
23
 
34
- // 3. Website Scraping - Get company page content
35
- const about = await orangeslice.firecrawl.scrape("https://stripe.com/about");
24
+ // Browser Automation (Playwright)
25
+ orangeslice.browser.execute(code, options?)
36
26
  ```
37
27
 
38
- All calls are automatically rate-limited. Fire away freely.
28
+ All calls are automatically rate-limited.
29
+
30
+ ---
39
31
 
40
- ## Sales Workflows
32
+ ## Mindset: Context First
41
33
 
42
- ### 1. Research a Target Account
34
+ **BEFORE taking action, gather context:**
43
35
 
44
- ```sql
45
- -- Step 1: Get company details
46
- SELECT id, company_name, domain, employee_count, locality, description
47
- FROM linkedin_company
48
- WHERE domain = 'openai.com';
36
+ 1. **Sample the data first** — Don't assume. Query to see what's actually there.
37
+ 2. **Verify before proceeding** — SERP results need verification. LinkedIn data needs enrichment.
38
+ 3. **Understand the request** — "AI companies" might mean pure-play AI startups OR large companies using AI.
49
39
 
50
- -- Step 2: Find their leadership team
51
- SELECT lp.first_name, lp.last_name, lp.headline, pos.title, lp.public_profile_url
52
- FROM linkedin_profile lp
53
- JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
54
- WHERE pos.linkedin_company_id = 11130470 -- OpenAI's ID from step 1
55
- AND pos.end_date IS NULL
56
- AND (pos.title ILIKE 'ceo%' OR pos.title ILIKE 'cto%' OR pos.title ILIKE 'cfo%'
57
- OR pos.title ILIKE '%vp%' OR pos.title ILIKE '%head of%')
58
- LIMIT 30;
40
+ **The pattern:**
41
+ ```
42
+ User: "Find AI CRM companies"
43
+
44
+ BAD: Immediately search without verification
45
+ GOOD:
46
+ 1. Search: "AI CRM" site:linkedin.com/company
47
+ 2. Get LinkedIn URLs from results
48
+ 3. Enrich each via B2B database
49
+ 4. Verify: "Is this actually an AI CRM based on description?"
59
50
  ```
60
51
 
61
- ### 2. Find Your Ideal Customer Profile (ICP)
52
+ ---
53
+
54
+ ## Prospecting: Two Approaches
55
+
56
+ ### 1. Direct Query with Filters (Preferred)
57
+
58
+ Use when the criteria are directly searchable:
59
+
60
+ - **Google dorking** — `"AI CRM" site:linkedin.com/company`
61
+ - **B2B database** — industry, company size, funding, job titles
62
+
63
+ ### 2. Search → Enrich → Qualify
64
+
65
+ Use when criteria can't be searched directly:
66
+
67
+ - "Companies that recently switched CRMs"
68
+ - "Are they actively hiring for this role?"
69
+ - "Do they use [specific tool]?"
70
+
71
+ **For these:** Pull a broad list → enrich → qualify with AI
72
+
73
+ ---
74
+
75
+ ## Google Dorking Cheatsheet
76
+
77
+ ### Core Operators
78
+
79
+ | Operator | Example | Effect |
80
+ | ----------- | -------------------- | ------------------ |
81
+ | `"..."` | `"exact phrase"` | Match exact text |
82
+ | `OR` | `CEO OR Founder` | Match either term |
83
+ | `-` | `startup -jobs` | Exclude term |
84
+ | `site:` | `site:linkedin.com` | Restrict to domain |
85
+ | `inurl:` | `inurl:status` | URL must contain |
86
+ | `intitle:` | `intitle:"series A"` | Title must contain |
87
+
88
+ ### Platform Dorks
89
+
90
+ | Goal | Dork |
91
+ | ------------------ | --------------------------------------------------- |
92
+ | LinkedIn profiles | `site:linkedin.com/in "query"` |
93
+ | LinkedIn companies | `site:linkedin.com/company "query"` |
94
+ | LinkedIn posts | `site:linkedin.com/posts "query"` |
95
+ | Twitter/X posts | `site:x.com inurl:status "query"` |
96
+ | Twitter/X profiles | `site:x.com -inurl:status "query"` |
97
+ | Reddit threads | `site:reddit.com "query"` |
98
+ | Crunchbase | `site:crunchbase.com/organization "query"` |
99
+
100
+ ### B2B Prospecting Dorks
62
101
 
63
- ```sql
64
- -- Software companies, 100-500 employees, with recent funding
65
- SELECT lc.company_name, lc.domain, lc.employee_count,
66
- cf.round_name, cf.round_date, cf.round_amount
67
- FROM linkedin_company lc
68
- JOIN linkedin_crunchbase_funding cf ON cf.linkedin_company_id = lc.id
69
- WHERE lc.industry_code = 4 -- Software Development
70
- AND lc.employee_count BETWEEN 100 AND 500
71
- AND cf.round_date >= '2024-01-01'
72
- ORDER BY cf.round_date DESC
73
- LIMIT 50;
74
102
  ```
103
+ # Find employees at company
104
+ "Stripe" site:linkedin.com/in
75
105
 
76
- ### 3. Find Specific Personas
106
+ # Find leadership
107
+ "Acme Corp" CEO OR Founder OR "Co-founder" site:linkedin.com/in
77
108
 
78
- ```sql
79
- -- Heads of Sales at mid-market companies
80
- SELECT lp.first_name, lp.last_name, lp.headline,
81
- pos.title, lc.company_name, lc.employee_count
82
- FROM linkedin_profile lp
83
- JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
84
- JOIN linkedin_company lc ON lc.id = pos.linkedin_company_id
85
- WHERE pos.end_date IS NULL
86
- AND lc.employee_count BETWEEN 100 AND 1000
87
- AND (pos.title ILIKE '%head of sales%'
88
- OR pos.title ILIKE '%vp sales%'
89
- OR pos.title ILIKE '%chief revenue%')
90
- LIMIT 30;
109
+ # Find by title
110
+ "VP Sales" "Series A" site:linkedin.com/in
111
+
112
+ # Find company pages by criteria
113
+ "YC W24" site:linkedin.com/company
114
+ "Series B" fintech site:linkedin.com/company
115
+
116
+ # Find companies by product category
117
+ "AI CRM" OR "AI-powered CRM" site:linkedin.com/company
91
118
  ```
92
119
 
93
- ### 4. Competitive Intelligence
120
+ ### Time Filters
94
121
 
95
- ```sql
96
- -- Who works at competitor company?
97
- SELECT lp.first_name, lp.last_name, lp.headline, pos.title
98
- FROM linkedin_profile lp
99
- JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
100
- WHERE pos.linkedin_company_id = 2135371 -- Competitor's ID
101
- AND pos.end_date IS NULL
102
- AND pos.title ILIKE '%sales%'
103
- LIMIT 50;
122
+ | Value | Period |
123
+ | ------- | ---------- |
124
+ | `qdr:d` | Past 24h |
125
+ | `qdr:w` | Past week |
126
+ | `qdr:m` | Past month |
127
+ | `qdr:y` | Past year |
128
+
129
+ ```typescript
130
+ orangeslice.serp.search("Stripe hiring", { tbs: "qdr:m" });
131
+ ```
132
+
133
+ ### Query Permutation Strategy
134
+
135
+ SERP is cheap. Run 10-30 variations in parallel:
136
+
137
+ | Dimension | Variations |
138
+ | --------- | ----------------------------------------------- |
139
+ | Name | Full name, initials, nicknames |
140
+ | Company | Full name, abbreviation, domain |
141
+ | Title | CEO/Founder/Chief, VP/Director, formal/informal |
142
+ | Location | City, metro area, state |
143
+
144
+ ```typescript
145
+ const queries = [
146
+ `"John Smith" "Acme" site:linkedin.com/in`,
147
+ `"J. Smith" Acme site:linkedin.com/in`,
148
+ `"John Smith" CEO site:linkedin.com/in`,
149
+ ];
150
+ const results = await Promise.all(queries.map(q => orangeslice.serp.search(q)));
104
151
  ```
105
152
 
153
+ ### SERP Requires Verification
154
+
155
+ **Dorking is fast but returns false positives.** Always verify:
156
+
157
+ 1. **Enrich via B2B database** — Get actual company/person data
158
+ 2. **Scrape website** — Check product page, about page
159
+ 3. **AI classification** — "Based on [data], does this match [criteria]?"
160
+
106
161
  ---
107
162
 
108
- ## Google Search (SERP)
163
+ ## Data Enrichment Pattern
109
164
 
110
- Search the web for company news, press releases, reviews, and more.
165
+ **Standard pattern: Search → Scrape → Extract**
111
166
 
112
167
  ```typescript
113
- // Basic search
114
- const results = await orangeslice.serp.search("Stripe Series C funding");
168
+ // 1. Search for relevant pages
169
+ const { results } = await orangeslice.serp.search({
170
+ query: `site:${domain} practice areas medical malpractice`
171
+ });
172
+
173
+ // 2. Scrape the top result
174
+ const { markdown } = await orangeslice.firecrawl.scrape(results[0].link);
175
+
176
+ // 3. Extract structured data (use your AI of choice)
177
+ // Parse markdown to answer: "Does this firm handle medical malpractice?"
178
+ ```
115
179
 
116
- // Get just organic results
117
- const organic = await orangeslice.serp.organic("best CRM software 2024");
180
+ ### When to Use Each Tool
118
181
 
119
- // Filter by site
120
- const linkedin = await orangeslice.serp.search("site:linkedin.com/in CEO Ramp");
182
+ | Use Search → Scrape → Extract | Use `browser.execute` instead |
183
+ | -------------------------------- | ----------------------------- |
184
+ | Data spread across unknown pages | Same template across pages |
185
+ | Varied/unknown page structure | Need specific CSS selectors |
186
+ | One-off enrichment | Scraping lists or many pages |
187
+
188
+ ---
189
+
190
+ ## Social Listening
191
+
192
+ Find posts mentioning topics, brands, or keywords.
193
+
194
+ ### Finding Posts: Use Dorking
121
195
 
122
- // Time-based search (past week)
123
- const recent = await orangeslice.serp.search("OpenAI news", { tbs: "qdr:w" });
124
196
  ```
197
+ # LinkedIn posts mentioning topic
198
+ "AI sales tools" site:linkedin.com/posts
199
+
200
+ # Twitter/X posts
201
+ "competitor name" site:x.com inurl:status
125
202
 
126
- ### SERP Options
203
+ # Reddit discussions
204
+ "product name" site:reddit.com
205
+ ```
127
206
 
128
- | Option | Type | Description |
129
- |--------|------|-------------|
130
- | `linkRegexPattern` | string | Filter results by URL pattern |
131
- | `advance_search` | boolean | Enable advanced search features |
132
- | `page` | number | Page number (default 1) |
133
- | `tbs` | string | Time filter: `qdr:d` (day), `qdr:w` (week), `qdr:m` (month) |
207
+ ### Common Problem: Sellers vs. Complainers
134
208
 
135
- ### Use Cases
209
+ Users want to find people **complaining about** tools. But searches return mostly **people selling** alternatives.
136
210
 
137
- - Find company news and press releases
138
- - Research competitors' public announcements
139
- - Find LinkedIn profiles via Google
140
- - Check company reviews on G2, Capterra, etc.
211
+ **Filter with verification:**
212
+ - Enrich author profile to check if they're in sales
213
+ - Check post sentiment and context
141
214
 
142
215
  ---
143
216
 
144
- ## Website Scraping (Firecrawl)
217
+ ## B2B Database (LinkedIn Data)
145
218
 
146
- Scrape any website and get markdown content + extracted social URLs.
219
+ **Scale:** 1.15B profiles, 2.6B positions, 1.48B jobs. Naive queries time out.
147
220
 
148
- ```typescript
149
- // Scrape a single page
150
- const page = await orangeslice.firecrawl.scrape("https://stripe.com/about");
151
- console.log(page.markdown); // Page content as markdown
152
- console.log(page.socialUrls); // Extracted social links
221
+ ### Fast Lookups (Indexed)
222
+
223
+ ```sql
224
+ -- Company by domain (FAST)
225
+ SELECT * FROM linkedin_company WHERE domain = 'stripe.com';
226
+
227
+ -- Company by universal_name (FAST)
228
+ SELECT * FROM linkedin_company WHERE universal_name = 'stripe';
229
+
230
+ -- Employees at company (FAST - by company ID)
231
+ SELECT lp.first_name, lp.last_name, pos.title
232
+ FROM linkedin_profile lp
233
+ JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
234
+ WHERE pos.linkedin_company_id = 2135371
235
+ AND pos.end_date IS NULL
236
+ LIMIT 50;
237
+ ```
153
238
 
154
- // Just get markdown
155
- const content = await orangeslice.firecrawl.markdown("https://company.com/team");
239
+ ### Slow Queries (Will Timeout)
156
240
 
157
- // Just get social URLs
158
- const socials = await orangeslice.firecrawl.socials("https://company.com");
159
- // Returns: { linkedinCompany: [...], twitterUser: [...], ... }
241
+ ```sql
242
+ -- Text search on names (no index)
243
+ WHERE company_name ILIKE '%stripe%'
160
244
 
161
- // Multi-page crawl (up to 5 pages)
162
- const site = await orangeslice.firecrawl.scrape("https://company.com", 5);
245
+ -- ❌ Headline search without company filter
246
+ WHERE headline ILIKE '%sales%'
247
+
248
+ -- ❌ COUNT on huge companies
249
+ SELECT COUNT(*) FROM ... WHERE linkedin_company_id = 1586
163
250
  ```
164
251
 
165
- ### Social URLs Extracted
252
+ ### Indexed Columns
166
253
 
167
- | Field | Description |
168
- |-------|-------------|
169
- | `linkedinCompany` | Company LinkedIn pages |
170
- | `linkedinProfile` | Individual LinkedIn profiles |
171
- | `twitterUser` | Twitter/X profiles |
172
- | `facebookProfile` | Facebook pages |
173
- | `instagramProfile` | Instagram profiles |
174
- | `youtubeChannel` | YouTube channels |
175
- | `tiktokProfile` | TikTok profiles |
176
- | `emailGeneral` | Email addresses |
254
+ | Table | Indexed Columns |
255
+ | ----------------------------- | ---------------------------------------- |
256
+ | `linkedin_company` | `id`, `universal_name`, `domain` |
257
+ | `linkedin_profile` | `id`, `linkedin_user_id` |
258
+ | `linkedin_profile_position3` | `linkedin_profile_id`, `linkedin_company_id` |
259
+ | `linkedin_job` | `linkedin_company_id`, `title_id` |
260
+ | `linkedin_crunchbase_funding` | `linkedin_company_id` |
177
261
 
178
- ### Use Cases
262
+ ### Company Size Performance
179
263
 
180
- - Scrape company "About" or "Team" pages
181
- - Find social media links from company websites
182
- - Extract contact emails from websites
183
- - Get company descriptions from their own sites
264
+ | Company Size | Simple Query | Aggregations |
265
+ |--------------|--------------|--------------|
266
+ | Small (<1K) | 4-20ms | 5-50ms |
267
+ | Medium (1K-10K) | 10-30ms | 100-500ms |
268
+ | Large (10K-100K) | 10-40ms | 1-15s |
269
+ | Massive (100K+) | 15-65ms | **TIMEOUT** |
184
270
 
185
- ---
271
+ **For Amazon/Google:** Only use simple `LIMIT` queries.
186
272
 
187
- ## Key Tables
273
+ ### Common Company IDs
188
274
 
189
- | Table | Records | Use For |
190
- |-------|---------|---------|
191
- | `linkedin_company` | Millions | Company lookup, enrichment |
192
- | `linkedin_profile` | 1.15B | Profile details |
193
- | `linkedin_profile_position3` | 2.6B | Job history, current employer |
194
- | `linkedin_crunchbase_funding` | - | Funding rounds |
195
- | `linkedin_job` | 1.48B | Job postings |
275
+ | Company | ID | Employees |
276
+ |---------|----------|-----------|
277
+ | Amazon | 1586 | 770K |
278
+ | Google | 1441 | 330K |
279
+ | Stripe | 2135371 | ~9K |
280
+ | OpenAI | 11130470 | ~7K |
281
+ | Ramp | 1406226 | ~3.5K |
196
282
 
197
- ## Performance Rules
283
+ ### Title Search Patterns
284
+
285
+ | Role | ILIKE Pattern |
286
+ |-----------|--------------------------------------------|
287
+ | C-Suite | `ceo%`, `cto%`, `cfo%`, `%chief%` |
288
+ | VPs | `%vp %`, `%vice president%` |
289
+ | Directors | `%director%`, `%head of%` |
290
+ | Sales | `%account exec%`, `%sales rep%`, `%ae %` |
291
+ | SDRs | `%sales development%`, `%sdr%`, `%bdr%` |
292
+ | Engineering | `%engineer%`, `%developer%` |
293
+ | Recruiters | `%recruit%`, `%talent%`, `%sourcer%` |
294
+ | Legal | `%lawyer%`, `%attorney%`, `%counsel%` |
295
+
296
+ ### Hiring Queries
297
+
298
+ **MUST filter for active jobs:**
198
299
 
199
- ### ✅ Fast Queries (use these)
200
300
  ```sql
201
- -- By domain (indexed)
202
- WHERE domain = 'stripe.com'
301
+ EXISTS (
302
+ SELECT 1 FROM linkedin_job j
303
+ WHERE j.linkedin_company_id = lc.id
304
+ AND j.closed_since IS NULL
305
+ AND (j.valid_until IS NULL OR j.valid_until > NOW())
306
+ AND j.posted_date >= CURRENT_DATE - INTERVAL '90 days'
307
+ )
308
+ ```
203
309
 
204
- -- By universal_name (indexed)
205
- WHERE universal_name = 'stripe'
310
+ ### Query Strategy
311
+
312
+ **LinkedIn DB times out?** Immediately SERP it:
313
+ ```
314
+ site:linkedin.com/company [query]
315
+ ```
206
316
 
207
- -- By company ID (indexed)
208
- WHERE linkedin_company_id = 2135371
317
+ **Complex criteria?** Decompose:
318
+ 1. Simple indexed query → get IDs
319
+ 2. Enrich with additional data
320
+ 3. Filter/qualify results
209
321
 
210
- -- By profile ID (indexed)
211
- WHERE linkedin_profile_id = 12345
322
+ ---
323
+
324
+ ## Browser Automation (Playwright)
325
+
326
+ Execute Playwright code with `page` in scope.
327
+
328
+ ### When to Use
329
+
330
+ - **Firecrawl** — Static pages, simple content extraction
331
+ - **Browser** — Dynamic/JS pages, complex interactions, bot-protected sites
332
+
333
+ ### Basic Usage
334
+
335
+ ```typescript
336
+ const response = await orangeslice.browser.execute(`
337
+ await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
338
+ return await page.evaluate(() => {
339
+ return [...document.querySelectorAll('.item')].map(el => ({
340
+ title: el.querySelector('h2')?.textContent?.trim(),
341
+ url: el.querySelector('a')?.href
342
+ }));
343
+ });
344
+ `);
345
+ // response = { success: true, result: [...] }
212
346
  ```
213
347
 
214
- ### ⚠️ Slow Queries (avoid these)
215
- ```sql
216
- -- Text search on names (no index)
217
- WHERE company_name ILIKE '%stripe%' -- SLOW
348
+ ### Workflow: Analyze → Extract
218
349
 
219
- -- Headline search (full scan)
220
- WHERE headline ILIKE '%sales%' -- SLOW
350
+ **Step 1: Discover selectors**
351
+ ```typescript
352
+ const response = await orangeslice.browser.execute(`
353
+ await page.goto(url, { waitUntil: 'domcontentloaded' });
354
+ return await page._snapshotForAI();
355
+ `);
356
+ // Analyze snapshot to find CSS selectors
357
+ ```
221
358
 
222
- -- COUNT on huge companies
223
- SELECT COUNT(*) FROM ... WHERE linkedin_company_id = 1586 -- TIMEOUT
359
+ **Step 2: Extract with discovered selectors**
360
+ ```typescript
361
+ const response = await orangeslice.browser.execute(`
362
+ await page.goto(url, { waitUntil: 'domcontentloaded' });
363
+ return await page.evaluate(() => {
364
+ return [...document.querySelectorAll('.discovered-selector')].map(e => ({
365
+ name: e.querySelector('h2')?.textContent?.trim()
366
+ }));
367
+ });
368
+ `);
224
369
  ```
225
370
 
226
- ### Company Size Matters
371
+ ### Bot Protection
227
372
 
228
- | Company Size | Simple Query | Aggregations |
229
- |-------------|--------------|--------------|
230
- | Small (<1K) | 4-20ms | 5-50ms |
231
- | Medium (1K-10K) | 10-30ms | 100-500ms |
232
- | Large (10K-100K) | 10-40ms | 1-15s |
233
- | Massive (100K+) | 15-65ms | **TIMEOUT** |
234
-
235
- **For Amazon/Google (100K+ employees):** Only use simple `LIMIT` queries, no `COUNT` or `GROUP BY`.
236
-
237
- ## Common Company IDs
238
-
239
- | Company | ID | Employees |
240
- |---------|-----|-----------|
241
- | Amazon | 1586 | 770K |
242
- | Google | 1441 | 330K |
243
- | Stripe | 2135371 | ~9K |
244
- | OpenAI | 11130470 | ~7K |
245
- | Ramp | 1406226 | ~3.5K |
246
-
247
- ## Title Search Patterns
248
-
249
- | Role | ILIKE Pattern |
250
- |------|---------------|
251
- | C-Suite | `ceo%`, `cto%`, `cfo%`, `%chief%` |
252
- | VPs | `%vp %`, `%vice president%` |
253
- | Directors | `%director%`, `%head of%` |
254
- | Sales | `%account exec%`, `%sales rep%`, `%ae %` |
255
- | SDRs | `%sales development%`, `%sdr%`, `%bdr%` |
256
- | Engineering | `%engineer%`, `%developer%` |
257
- | Recruiters | `%recruit%`, `%talent%`, `%sourcer%` |
373
+ For bot-protected sites, use single-session navigation:
258
374
 
259
- ## What You Cannot Do
375
+ ```typescript
376
+ const response = await orangeslice.browser.execute(`
377
+ // Navigate to entry page (passes bot check once)
378
+ await page.goto(entryUrl, { waitUntil: 'domcontentloaded' });
379
+
380
+ // Get all URLs to visit
381
+ const urls = await page.evaluate(() =>
382
+ [...document.querySelectorAll('a.link')].map(a => a.href)
383
+ );
384
+
385
+ // Visit each IN THE SAME SESSION
386
+ const results = [];
387
+ for (const url of urls.slice(0, 10)) {
388
+ await page.goto(url, { waitUntil: 'domcontentloaded' });
389
+ const data = await page.evaluate(() => ({
390
+ title: document.querySelector('h1')?.textContent?.trim()
391
+ }));
392
+ results.push(data);
393
+ }
394
+ return results;
395
+ `);
396
+ ```
397
+
398
+ ### Rules
260
399
 
261
- **No direct contact data** - email addresses and phone numbers are restricted
262
- **No Indeed data** - Indeed tables are restricted
263
- **No traffic/web data** - Domain traffic and web analytics restricted
400
+ 1. **Always use `{ waitUntil: 'domcontentloaded' }`** — Prevents hanging
401
+ 2. **Check `response.success`** — Don't just destructure `result`
402
+ 3. **Analyze before extracting** — Use `_snapshotForAI()` to find selectors
403
+ 4. **Return objects, not HTML** — Use `page.evaluate()` for structured data
404
+ 5. **3 minute hard limit** — Plan multi-page scrapes accordingly
405
+
406
+ ---
264
407
 
265
408
  ## Rate Limits
266
409
 
267
- The `orangeslice` package automatically handles rate limiting:
410
+ | Function | Concurrency | Min Delay |
411
+ |-------------|-------------|-----------|
412
+ | `b2b` | 2 concurrent | 100ms |
413
+ | `serp` | 2 concurrent | 200ms |
414
+ | `firecrawl` | 2 concurrent | 500ms |
415
+ | `browser` | 2 concurrent | 500ms |
416
+
417
+ All calls are queued automatically.
268
418
 
269
- | Function | Concurrency | Min Delay |
270
- |----------|-------------|-----------|
271
- | `b2b` | 2 concurrent | 100ms |
272
- | `serp` | 2 concurrent | 200ms |
273
- | `firecrawl` | 2 concurrent | 500ms |
419
+ ---
274
420
 
275
- You can fire off many calls - they'll be queued automatically.
421
+ ## What You Cannot Do
276
422
 
277
- ## Detailed Documentation
423
+ ❌ **No direct contact data** — Email addresses and phone numbers are restricted
424
+ ❌ **No Indeed data** — Indeed tables are restricted
425
+ ❌ **No traffic/web data** — Domain traffic and web analytics restricted
278
426
 
279
- For comprehensive schema and query patterns, see:
280
- - `B2B_DATABASE.md` - Full database guide with examples
281
- - `B2B_SCHEMA.md` - Complete table schemas
282
- - `B2B_EMPLOYEE_SEARCH.md` - Finding employees by title
427
+ ---
283
428
 
284
- ## Example Session
429
+ ## Example: Full Research Flow
285
430
 
286
431
  **User:** "Research Ramp - give me everything"
287
432
 
288
- **Agent:**
289
433
  ```typescript
290
434
  import { orangeslice } from 'orangeslice';
291
435
 
292
- // 1. B2B Database - Company info + leadership
436
+ // 1. B2B Database - Company info
293
437
  const company = await orangeslice.b2b.sql(`
294
438
  SELECT id, company_name, domain, employee_count, locality, description
295
439
  FROM linkedin_company WHERE domain = 'ramp.com'
296
440
  `);
297
441
 
442
+ // 2. B2B Database - Leadership team
298
443
  const leadership = await orangeslice.b2b.sql(`
299
444
  SELECT lp.first_name, lp.last_name, lp.headline, pos.title
300
445
  FROM linkedin_profile lp
@@ -305,15 +450,13 @@ const leadership = await orangeslice.b2b.sql(`
305
450
  LIMIT 20
306
451
  `);
307
452
 
308
- // 2. Google Search - Recent news
453
+ // 3. Google Search - Recent news
309
454
  const news = await orangeslice.serp.search("Ramp fintech funding 2024", { tbs: "qdr:m" });
310
455
 
311
- // 3. Website Scraping - About page + socials
456
+ // 4. Website Scraping - About page + socials
312
457
  const about = await orangeslice.firecrawl.scrape("https://ramp.com/about");
313
- console.log(about.markdown); // Company description
314
- console.log(about.socialUrls); // LinkedIn, Twitter, etc.
315
458
  ```
316
459
 
317
460
  ---
318
461
 
319
- **Start by understanding what the user wants to research, then use the appropriate tools to find the information.**
462
+ **Start by understanding what the user wants to research, then use the appropriate tools to find the information. Verify results when using SERP. Always use indexed columns first when querying the B2B database.**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "orangeslice",
3
- "version": "1.4.2",
3
+ "version": "1.6.0",
4
4
  "description": "Turn any AI agent into a B2B sales research assistant with 1B+ LinkedIn profiles",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",