orangeslice 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -0
- package/dist/apify.d.ts +57 -0
- package/dist/apify.js +126 -0
- package/dist/cli.js +18 -7
- package/dist/generateObject.d.ts +34 -0
- package/dist/generateObject.js +85 -0
- package/dist/geo.d.ts +50 -0
- package/dist/geo.js +91 -0
- package/dist/index.d.ts +32 -3
- package/dist/index.js +24 -3
- package/dist/serp.d.ts +4 -1
- package/dist/serp.js +2 -2
- package/docs/AGENTS.md +94 -384
- package/docs/apify.md +133 -0
- package/docs/b2b.md +178 -0
- package/docs/browser.md +173 -0
- package/docs/serp.md +167 -0
- package/docs/strategies.md +250 -0
- package/package.json +2 -2
- package/docs/B2B_CROSS_TABLE_TEST_FINDINGS.md +0 -255
- package/docs/B2B_DATABASE.md +0 -314
- package/docs/B2B_DATABASE_TEST_FINDINGS.md +0 -476
- package/docs/B2B_EMPLOYEE_SEARCH.md +0 -697
- package/docs/B2B_GENERALIZATION_RULES.md +0 -220
- package/docs/B2B_NLP_QUERY_MAPPINGS.md +0 -240
- package/docs/B2B_NORMALIZED_VS_DENORMALIZED.md +0 -952
- package/docs/B2B_SCHEMA.md +0 -1042
- package/docs/B2B_SQL_COMPREHENSIVE_TEST_FINDINGS.md +0 -301
- package/docs/B2B_TABLE_INDICES.ts +0 -496
package/docs/apify.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Apify Actors
|
|
2
|
+
|
|
3
|
+
Pre-built web scrapers for social media, Google Maps, e-commerce, and more.
|
|
4
|
+
|
|
5
|
+
```typescript
|
|
6
|
+
import { orangeslice } from 'orangeslice';
|
|
7
|
+
|
|
8
|
+
// Run an actor
|
|
9
|
+
const results = await orangeslice.apify.run("username/actor-name", { input: "params" });
|
|
10
|
+
|
|
11
|
+
// Search for actors
|
|
12
|
+
const { actors } = await orangeslice.apify.search("linkedin scraper");
|
|
13
|
+
|
|
14
|
+
// Get actor input schema (what params it accepts)
|
|
15
|
+
const schema = await orangeslice.apify.getInputSchema("apify/web-scraper");
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Workflow
|
|
21
|
+
|
|
22
|
+
1. **Search** for an actor that does what you need
|
|
23
|
+
2. **Get input schema** to understand required params
|
|
24
|
+
3. **Run** the actor with your inputs
|
|
25
|
+
4. Results are returned when the actor completes
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Searching for Actors
|
|
30
|
+
|
|
31
|
+
```typescript
|
|
32
|
+
const { actors, total } = await orangeslice.apify.search("google maps reviews", 10);
|
|
33
|
+
|
|
34
|
+
// actors = [{
|
|
35
|
+
// actorId: "compass/crawler-google-places",
|
|
36
|
+
// title: "Google Maps Scraper",
|
|
37
|
+
// description: "Scrape Google Maps...",
|
|
38
|
+
// stats: { totalRuns: 1000000 },
|
|
39
|
+
// pricing: { ... }
|
|
40
|
+
// }, ...]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Getting Input Schema
|
|
46
|
+
|
|
47
|
+
Before running, check what params the actor needs:
|
|
48
|
+
|
|
49
|
+
```typescript
|
|
50
|
+
const schema = await orangeslice.apify.getInputSchema("compass/crawler-google-places");
|
|
51
|
+
|
|
52
|
+
console.log(schema.inputProperties);
|
|
53
|
+
// {
|
|
54
|
+
// searchStringsArray: { type: "array", description: "Search queries" },
|
|
55
|
+
// maxReviews: { type: "integer", description: "Max reviews per place" },
|
|
56
|
+
// language: { type: "string", default: "en" },
|
|
57
|
+
// ...
|
|
58
|
+
// }
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Running Actors
|
|
64
|
+
|
|
65
|
+
```typescript
|
|
66
|
+
// Google Maps reviews
|
|
67
|
+
const reviews = await orangeslice.apify.run("compass/crawler-google-places", {
|
|
68
|
+
searchStringsArray: ["restaurants in NYC"],
|
|
69
|
+
maxReviews: 20,
|
|
70
|
+
language: "en"
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
// With dataset params (limit results)
|
|
74
|
+
const results = await orangeslice.apify.run("apify/web-scraper",
|
|
75
|
+
{ startUrls: [{ url: "https://example.com" }] },
|
|
76
|
+
{ limit: 100 } // Only return first 100 items
|
|
77
|
+
);
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Popular Actors
|
|
83
|
+
|
|
84
|
+
| Use Case | Actor | Example Input |
|
|
85
|
+
|----------|-------|---------------|
|
|
86
|
+
| Google Maps | `compass/crawler-google-places` | `{ searchStringsArray: ["cafes SF"] }` |
|
|
87
|
+
| Google Search | `apify/google-search-scraper` | `{ queries: "site:linkedin.com CEO" }` |
|
|
88
|
+
| Instagram | `apify/instagram-scraper` | `{ directUrls: ["https://instagram.com/user"] }` |
|
|
89
|
+
| Twitter/X | `apidojo/tweet-scraper` | `{ searchTerms: ["#startup"] }` |
|
|
90
|
+
| LinkedIn | `anchor/linkedin-profile-scraper` | `{ profileUrls: [...] }` |
|
|
91
|
+
| YouTube | `streamers/youtube-scraper` | `{ searchKeywords: ["tech reviews"] }` |
|
|
92
|
+
| TikTok | `clockworks/tiktok-scraper` | `{ profiles: ["@username"] }` |
|
|
93
|
+
| Amazon | `junglee/amazon-scraper` | `{ keyword: "laptop stand" }` |
|
|
94
|
+
| Yelp | `yin/yelp-scraper` | `{ searchTerms: "plumber", location: "NYC" }` |
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Dataset Params
|
|
99
|
+
|
|
100
|
+
Control the results returned:
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
await orangeslice.apify.run(actor, input, {
|
|
104
|
+
limit: 100, // Max items to return
|
|
105
|
+
offset: 0, // Skip first N items
|
|
106
|
+
clean: true, // Remove empty fields
|
|
107
|
+
fields: ["name", "url"], // Only these fields
|
|
108
|
+
unwind: "reviews" // Flatten nested array
|
|
109
|
+
});
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Timeouts
|
|
115
|
+
|
|
116
|
+
- Actors run asynchronously and are polled for completion
|
|
117
|
+
- Max wait: 5 minutes
|
|
118
|
+
- Large scrapes may time out — use smaller batches
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Response Format
|
|
123
|
+
|
|
124
|
+
Returns the dataset items directly as an array:
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
const results = await orangeslice.apify.run(...);
|
|
128
|
+
// results = [
|
|
129
|
+
// { name: "...", address: "...", rating: 4.5 },
|
|
130
|
+
// { name: "...", address: "...", rating: 4.2 },
|
|
131
|
+
// ...
|
|
132
|
+
// ]
|
|
133
|
+
```
|
package/docs/b2b.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# B2B Database
|
|
2
|
+
|
|
3
|
+
**Scale:** 1.15B profiles, 2.6B positions, millions of companies. Naive queries time out.
|
|
4
|
+
|
|
5
|
+
```typescript
|
|
6
|
+
import { orangeslice } from 'orangeslice';
|
|
7
|
+
|
|
8
|
+
// Returns rows directly
|
|
9
|
+
const companies = await orangeslice.b2b.sql("SELECT * FROM linkedin_company WHERE domain = 'stripe.com'");
|
|
10
|
+
|
|
11
|
+
// Returns { rows, rowCount, duration_ms }
|
|
12
|
+
const result = await orangeslice.b2b.query("SELECT * FROM linkedin_company WHERE domain = 'stripe.com'");
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Fast Lookups (Indexed)
|
|
18
|
+
|
|
19
|
+
```sql
|
|
20
|
+
-- Company by domain (FAST)
|
|
21
|
+
SELECT * FROM linkedin_company WHERE domain = 'stripe.com';
|
|
22
|
+
|
|
23
|
+
-- Company by universal_name (FAST)
|
|
24
|
+
SELECT * FROM linkedin_company WHERE universal_name = 'stripe';
|
|
25
|
+
|
|
26
|
+
-- Employees at company (FAST - filter by company ID)
|
|
27
|
+
SELECT lp.first_name, lp.last_name, pos.title
|
|
28
|
+
FROM linkedin_profile lp
|
|
29
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
30
|
+
WHERE pos.linkedin_company_id = 2135371
|
|
31
|
+
AND pos.end_date IS NULL
|
|
32
|
+
LIMIT 50;
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Slow Queries (Will Timeout)
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
-- ❌ Text search on names (no index)
|
|
41
|
+
WHERE company_name ILIKE '%stripe%'
|
|
42
|
+
|
|
43
|
+
-- ❌ Headline search without company filter
|
|
44
|
+
WHERE headline ILIKE '%sales%'
|
|
45
|
+
|
|
46
|
+
-- ❌ COUNT on huge companies
|
|
47
|
+
SELECT COUNT(*) FROM ... WHERE linkedin_company_id = 1586
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Indexed Columns
|
|
53
|
+
|
|
54
|
+
| Table | Indexed Columns |
|
|
55
|
+
|-------|-----------------|
|
|
56
|
+
| `linkedin_company` | `id`, `universal_name`, `domain` |
|
|
57
|
+
| `linkedin_profile` | `id`, `linkedin_user_id` |
|
|
58
|
+
| `linkedin_profile_position3` | `linkedin_profile_id`, `linkedin_company_id` |
|
|
59
|
+
| `linkedin_job` | `linkedin_company_id`, `title_id` |
|
|
60
|
+
| `linkedin_crunchbase_funding` | `linkedin_company_id` |
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Company Size Performance
|
|
65
|
+
|
|
66
|
+
| Company Size | Simple Query | Aggregations |
|
|
67
|
+
|--------------|--------------|--------------|
|
|
68
|
+
| Small (<1K) | 4-20ms | 5-50ms |
|
|
69
|
+
| Medium (1K-10K) | 10-30ms | 100-500ms |
|
|
70
|
+
| Large (10K-100K) | 10-40ms | 1-15s |
|
|
71
|
+
| Massive (100K+) | 15-65ms | **TIMEOUT** |
|
|
72
|
+
|
|
73
|
+
**For Amazon/Google/Meta:** Only use simple `LIMIT` queries. No aggregations.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Common Company IDs
|
|
78
|
+
|
|
79
|
+
| Company | ID | Employees |
|
|
80
|
+
|---------|----------|-----------|
|
|
81
|
+
| Amazon | 1586 | 770K |
|
|
82
|
+
| Google | 1441 | 330K |
|
|
83
|
+
| Meta | 10667 | 70K |
|
|
84
|
+
| Stripe | 2135371 | ~9K |
|
|
85
|
+
| OpenAI | 11130470 | ~7K |
|
|
86
|
+
| Ramp | 1406226 | ~3.5K |
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Title Search Patterns
|
|
91
|
+
|
|
92
|
+
| Role | ILIKE Pattern |
|
|
93
|
+
|------|---------------|
|
|
94
|
+
| C-Suite | `ceo%`, `cto%`, `cfo%`, `%chief%` |
|
|
95
|
+
| VPs | `%vp %`, `%vice president%` |
|
|
96
|
+
| Directors | `%director%`, `%head of%` |
|
|
97
|
+
| Sales | `%account exec%`, `%sales rep%`, `%ae %` |
|
|
98
|
+
| SDRs | `%sales development%`, `%sdr%`, `%bdr%` |
|
|
99
|
+
| Engineering | `%engineer%`, `%developer%` |
|
|
100
|
+
| Recruiters | `%recruit%`, `%talent%`, `%sourcer%` |
|
|
101
|
+
| Legal | `%lawyer%`, `%attorney%`, `%counsel%` |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Check If Company Is Hiring
|
|
106
|
+
|
|
107
|
+
```sql
|
|
108
|
+
-- Filter for companies with active job postings
|
|
109
|
+
EXISTS (
|
|
110
|
+
SELECT 1 FROM linkedin_job j
|
|
111
|
+
WHERE j.linkedin_company_id = lc.id
|
|
112
|
+
AND j.closed_since IS NULL
|
|
113
|
+
AND (j.valid_until IS NULL OR j.valid_until > NOW())
|
|
114
|
+
AND j.posted_date >= CURRENT_DATE - INTERVAL '90 days'
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Query Strategy
|
|
121
|
+
|
|
122
|
+
**Timeout?** Immediately fall back to SERP:
|
|
123
|
+
```
|
|
124
|
+
site:linkedin.com/company [query]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Complex criteria?** Decompose:
|
|
128
|
+
1. Simple indexed query → get IDs
|
|
129
|
+
2. Enrich with additional data
|
|
130
|
+
3. Filter/qualify results
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Key Tables
|
|
135
|
+
|
|
136
|
+
### linkedin_company
|
|
137
|
+
```sql
|
|
138
|
+
SELECT id, company_name, universal_name, domain,
|
|
139
|
+
employee_count, industry, locality, country,
|
|
140
|
+
description, specialties, founded_year
|
|
141
|
+
FROM linkedin_company
|
|
142
|
+
WHERE domain = 'example.com';
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### linkedin_profile
|
|
146
|
+
```sql
|
|
147
|
+
SELECT id, first_name, last_name, headline,
|
|
148
|
+
locality, country, summary, linkedin_user_id
|
|
149
|
+
FROM linkedin_profile
|
|
150
|
+
WHERE id = 12345;
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### linkedin_profile_position3
|
|
154
|
+
```sql
|
|
155
|
+
SELECT linkedin_profile_id, linkedin_company_id,
|
|
156
|
+
title, start_date, end_date, description
|
|
157
|
+
FROM linkedin_profile_position3
|
|
158
|
+
WHERE linkedin_company_id = 2135371
|
|
159
|
+
AND end_date IS NULL; -- Current employees only
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### linkedin_job
|
|
163
|
+
```sql
|
|
164
|
+
SELECT id, linkedin_company_id, title,
|
|
165
|
+
locality, posted_date, closed_since
|
|
166
|
+
FROM linkedin_job
|
|
167
|
+
WHERE linkedin_company_id = 2135371
|
|
168
|
+
AND closed_since IS NULL;
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### linkedin_crunchbase_funding
|
|
172
|
+
```sql
|
|
173
|
+
SELECT linkedin_company_id, funding_type,
|
|
174
|
+
money_raised, announced_date, investor_names
|
|
175
|
+
FROM linkedin_crunchbase_funding
|
|
176
|
+
WHERE linkedin_company_id = 2135371
|
|
177
|
+
ORDER BY announced_date DESC;
|
|
178
|
+
```
|
package/docs/browser.md
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Browser Automation
|
|
2
|
+
|
|
3
|
+
Execute Playwright code with a `page` object in scope. Use this for dynamic or JS-heavy pages.
|
|
4
|
+
|
|
5
|
+
```typescript
|
|
6
|
+
import { orangeslice } from 'orangeslice';
|
|
7
|
+
|
|
8
|
+
const response = await orangeslice.browser.execute(`
|
|
9
|
+
await page.goto("https://example.com", { waitUntil: 'domcontentloaded' });
|
|
10
|
+
return await page.evaluate(() => document.title);
|
|
11
|
+
`);
|
|
12
|
+
// { success: true, result: "Example Domain" }
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
| Use `firecrawl` | Use `browser` |
|
|
20
|
+
|-----------------|---------------|
|
|
21
|
+
| Static HTML | JavaScript-rendered content |
|
|
22
|
+
| Simple content extraction | Complex interactions |
|
|
23
|
+
| Fast, one-shot scraping | Login flows, clicking |
|
|
24
|
+
| Getting social URLs | Bot-protected sites |
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Basic Patterns
|
|
29
|
+
|
|
30
|
+
### Extract List Items
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
const response = await orangeslice.browser.execute(`
|
|
34
|
+
await page.goto("https://example.com/products", { waitUntil: 'domcontentloaded' });
|
|
35
|
+
return await page.evaluate(() => {
|
|
36
|
+
return [...document.querySelectorAll('.product')].map(el => ({
|
|
37
|
+
name: el.querySelector('h2')?.textContent?.trim(),
|
|
38
|
+
price: el.querySelector('.price')?.textContent?.trim(),
|
|
39
|
+
url: el.querySelector('a')?.href
|
|
40
|
+
}));
|
|
41
|
+
});
|
|
42
|
+
`);
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Click and Wait
|
|
46
|
+
|
|
47
|
+
```typescript
|
|
48
|
+
const response = await orangeslice.browser.execute(`
|
|
49
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
50
|
+
await page.click('button.load-more');
|
|
51
|
+
await page.waitForSelector('.new-content');
|
|
52
|
+
return await page.evaluate(() => document.body.innerText);
|
|
53
|
+
`);
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Fill Form
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
const response = await orangeslice.browser.execute(`
|
|
60
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
61
|
+
await page.fill('input[name="search"]', 'query');
|
|
62
|
+
await page.click('button[type="submit"]');
|
|
63
|
+
await page.waitForSelector('.results');
|
|
64
|
+
return await page.evaluate(() =>
|
|
65
|
+
[...document.querySelectorAll('.result')].map(e => e.textContent)
|
|
66
|
+
);
|
|
67
|
+
`);
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Workflow: Discover → Extract
|
|
73
|
+
|
|
74
|
+
### Step 1: Analyze Page Structure
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
const response = await orangeslice.browser.execute(`
|
|
78
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
79
|
+
return await page._snapshotForAI();
|
|
80
|
+
`);
|
|
81
|
+
// Returns page structure for selector discovery
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Step 2: Extract with Discovered Selectors
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
const response = await orangeslice.browser.execute(`
|
|
88
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
89
|
+
return await page.evaluate(() => {
|
|
90
|
+
return [...document.querySelectorAll('.discovered-selector')].map(e => ({
|
|
91
|
+
title: e.querySelector('h2')?.textContent?.trim(),
|
|
92
|
+
description: e.querySelector('p')?.textContent?.trim()
|
|
93
|
+
}));
|
|
94
|
+
});
|
|
95
|
+
`);
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Bot Protection: Single Session
|
|
101
|
+
|
|
102
|
+
For bot-protected sites, do all navigation in ONE session:
|
|
103
|
+
|
|
104
|
+
```typescript
|
|
105
|
+
const response = await orangeslice.browser.execute(`
|
|
106
|
+
// 1. Navigate to entry page (passes bot check once)
|
|
107
|
+
await page.goto(entryUrl, { waitUntil: 'domcontentloaded' });
|
|
108
|
+
|
|
109
|
+
// 2. Get all URLs to visit
|
|
110
|
+
const urls = await page.evaluate(() =>
|
|
111
|
+
[...document.querySelectorAll('a.link')].map(a => a.href)
|
|
112
|
+
);
|
|
113
|
+
|
|
114
|
+
// 3. Visit each IN THE SAME SESSION
|
|
115
|
+
const results = [];
|
|
116
|
+
for (const url of urls.slice(0, 10)) {
|
|
117
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
118
|
+
const data = await page.evaluate(() => ({
|
|
119
|
+
title: document.querySelector('h1')?.textContent?.trim(),
|
|
120
|
+
content: document.querySelector('.main')?.textContent?.trim()
|
|
121
|
+
}));
|
|
122
|
+
results.push(data);
|
|
123
|
+
}
|
|
124
|
+
return results;
|
|
125
|
+
`);
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Options
|
|
131
|
+
|
|
132
|
+
```typescript
|
|
133
|
+
await orangeslice.browser.execute(code, {
|
|
134
|
+
timeout_sec: 60, // Execution timeout (default: 30)
|
|
135
|
+
acquire_timeout_seconds: 30 // Browser acquisition timeout
|
|
136
|
+
});
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Rules
|
|
142
|
+
|
|
143
|
+
1. **Always use `{ waitUntil: 'domcontentloaded' }`** — Prevents hanging on slow resources
|
|
144
|
+
2. **Check `response.success`** — Don't just destructure `result`
|
|
145
|
+
3. **Analyze before extracting** — Use `_snapshotForAI()` to find selectors
|
|
146
|
+
4. **Return objects, not HTML** — Use `page.evaluate()` for structured data
|
|
147
|
+
5. **3-minute hard limit** — Plan multi-page scrapes accordingly
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Response Format
|
|
152
|
+
|
|
153
|
+
```typescript
|
|
154
|
+
interface BrowserResponse {
|
|
155
|
+
success: boolean;
|
|
156
|
+
result?: any; // Your return value
|
|
157
|
+
error?: string; // Error message if failed
|
|
158
|
+
console_logs?: string[];
|
|
159
|
+
browser_live_view_url?: string; // Debug URL
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Convenience Methods
|
|
166
|
+
|
|
167
|
+
```typescript
|
|
168
|
+
// Get page snapshot for analysis
|
|
169
|
+
const snapshot = await orangeslice.browser.snapshot(url);
|
|
170
|
+
|
|
171
|
+
// Get just the text content
|
|
172
|
+
const text = await orangeslice.browser.text(url);
|
|
173
|
+
```
|
package/docs/serp.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Google SERP
|
|
2
|
+
|
|
3
|
+
Google search with advanced dorking. Fast but **requires verification**.
|
|
4
|
+
|
|
5
|
+
```typescript
|
|
6
|
+
import { orangeslice } from 'orangeslice';
|
|
7
|
+
|
|
8
|
+
// Basic search
|
|
9
|
+
const { results } = await orangeslice.serp.search("AI CRM software");
|
|
10
|
+
|
|
11
|
+
// With options
|
|
12
|
+
const { results } = await orangeslice.serp.search("Stripe hiring", {
|
|
13
|
+
tbs: "qdr:m", // Past month
|
|
14
|
+
page: 0, // First page
|
|
15
|
+
advance_search: true // Include snippets
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// Convenience: just organic results
|
|
19
|
+
const results = await orangeslice.serp.organic("query");
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Dorking Cheatsheet
|
|
25
|
+
|
|
26
|
+
### Core Operators
|
|
27
|
+
|
|
28
|
+
| Operator | Example | Effect |
|
|
29
|
+
|----------|---------|--------|
|
|
30
|
+
| `"..."` | `"exact phrase"` | Match exact text |
|
|
31
|
+
| `OR` | `CEO OR Founder` | Match either term |
|
|
32
|
+
| `-` | `startup -jobs` | Exclude term |
|
|
33
|
+
| `site:` | `site:linkedin.com` | Restrict to domain |
|
|
34
|
+
| `inurl:` | `inurl:status` | URL must contain |
|
|
35
|
+
| `intitle:` | `intitle:"series A"` | Title must contain |
|
|
36
|
+
|
|
37
|
+
### Platform Dorks
|
|
38
|
+
|
|
39
|
+
| Goal | Dork |
|
|
40
|
+
|------|------|
|
|
41
|
+
| LinkedIn profiles | `site:linkedin.com/in "query"` |
|
|
42
|
+
| LinkedIn companies | `site:linkedin.com/company "query"` |
|
|
43
|
+
| LinkedIn posts | `site:linkedin.com/posts "query"` |
|
|
44
|
+
| Twitter/X posts | `site:x.com inurl:status "query"` |
|
|
45
|
+
| Twitter/X profiles | `site:x.com -inurl:status "query"` |
|
|
46
|
+
| Reddit threads | `site:reddit.com "query"` |
|
|
47
|
+
| Crunchbase | `site:crunchbase.com/organization "query"` |
|
|
48
|
+
| GitHub | `site:github.com "query"` |
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## B2B Prospecting Dorks
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
# Find employees at company
|
|
56
|
+
"Stripe" site:linkedin.com/in
|
|
57
|
+
|
|
58
|
+
# Find leadership
|
|
59
|
+
"Acme Corp" CEO OR Founder OR "Co-founder" site:linkedin.com/in
|
|
60
|
+
|
|
61
|
+
# Find by title
|
|
62
|
+
"VP Sales" "Series A" site:linkedin.com/in
|
|
63
|
+
|
|
64
|
+
# Find company pages by criteria
|
|
65
|
+
"YC W24" site:linkedin.com/company
|
|
66
|
+
"Series B" fintech site:linkedin.com/company
|
|
67
|
+
|
|
68
|
+
# Find companies by product category
|
|
69
|
+
"AI CRM" OR "AI-powered CRM" site:linkedin.com/company
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Time Filters
|
|
75
|
+
|
|
76
|
+
| Value | Period |
|
|
77
|
+
|-------|--------|
|
|
78
|
+
| `qdr:h` | Past hour |
|
|
79
|
+
| `qdr:d` | Past 24h |
|
|
80
|
+
| `qdr:w` | Past week |
|
|
81
|
+
| `qdr:m` | Past month |
|
|
82
|
+
| `qdr:y` | Past year |
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
// Recent news only
|
|
86
|
+
orangeslice.serp.search("Stripe funding", { tbs: "qdr:m" });
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Query Permutation Strategy
|
|
92
|
+
|
|
93
|
+
SERP is cheap. Run 10-30 variations in parallel:
|
|
94
|
+
|
|
95
|
+
| Dimension | Variations |
|
|
96
|
+
|-----------|------------|
|
|
97
|
+
| Name | Full name, initials, nicknames |
|
|
98
|
+
| Company | Full name, abbreviation, domain |
|
|
99
|
+
| Title | CEO/Founder/Chief, VP/Director, formal/informal |
|
|
100
|
+
| Location | City, metro area, state |
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
const queries = [
|
|
104
|
+
`"John Smith" "Acme" site:linkedin.com/in`,
|
|
105
|
+
`"J. Smith" Acme site:linkedin.com/in`,
|
|
106
|
+
`"John Smith" CEO site:linkedin.com/in`,
|
|
107
|
+
];
|
|
108
|
+
const results = await Promise.all(queries.map(q => orangeslice.serp.search(q)));
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Verification Required
|
|
114
|
+
|
|
115
|
+
**Dorking returns false positives.** Always verify:
|
|
116
|
+
|
|
117
|
+
1. **Enrich via B2B database** — Get actual company/person data
|
|
118
|
+
2. **Scrape website** — Check product page, about page
|
|
119
|
+
3. **AI classification** — "Based on [data], does this match [criteria]?"
|
|
120
|
+
|
|
121
|
+
### Example: Find AI Companies
|
|
122
|
+
|
|
123
|
+
```typescript
|
|
124
|
+
// 1. Dork for candidates
|
|
125
|
+
const { results } = await orangeslice.serp.search(
|
|
126
|
+
`"AI-powered" site:linkedin.com/company`
|
|
127
|
+
);
|
|
128
|
+
|
|
129
|
+
// 2. Extract LinkedIn URLs
|
|
130
|
+
const linkedinUrls = results
|
|
131
|
+
.map(r => r.link)
|
|
132
|
+
.filter(url => url.includes('linkedin.com/company'));
|
|
133
|
+
|
|
134
|
+
// 3. Enrich via B2B database
|
|
135
|
+
for (const url of linkedinUrls) {
|
|
136
|
+
const universalName = url.split('/company/')[1]?.split('/')[0];
|
|
137
|
+
if (universalName) {
|
|
138
|
+
const company = await orangeslice.b2b.sql(`
|
|
139
|
+
SELECT * FROM linkedin_company
|
|
140
|
+
WHERE universal_name = '${universalName}'
|
|
141
|
+
`);
|
|
142
|
+
// 4. Verify with actual description
|
|
143
|
+
if (company[0]?.description?.toLowerCase().includes('artificial intelligence')) {
|
|
144
|
+
// Confirmed AI company
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Response Format
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
interface SerpResult {
|
|
156
|
+
title: string;
|
|
157
|
+
link: string;
|
|
158
|
+
snippet?: string;
|
|
159
|
+
displayed_link?: string;
|
|
160
|
+
position?: number;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
interface SerpResponse {
|
|
164
|
+
results: SerpResult[];
|
|
165
|
+
related_questions?: Array<{ question: string; snippet: string }>;
|
|
166
|
+
}
|
|
167
|
+
```
|