orangeslice 1.7.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/apify.js +3 -2
- package/dist/generateObject.js +3 -2
- package/dist/geo.js +3 -2
- package/docs/b2b-docs/B2B_CROSS_TABLE_TEST_FINDINGS.md +255 -0
- package/docs/b2b-docs/B2B_DATABASE.md +314 -0
- package/docs/b2b-docs/B2B_DATABASE_TEST_FINDINGS.md +476 -0
- package/docs/b2b-docs/B2B_EMPLOYEE_SEARCH.md +697 -0
- package/docs/b2b-docs/B2B_GENERALIZATION_RULES.md +220 -0
- package/docs/b2b-docs/B2B_NLP_QUERY_MAPPINGS.md +240 -0
- package/docs/b2b-docs/B2B_NORMALIZED_VS_DENORMALIZED.md +952 -0
- package/docs/b2b-docs/B2B_SCHEMA.md +1042 -0
- package/docs/b2b-docs/B2B_SQL_COMPREHENSIVE_TEST_FINDINGS.md +301 -0
- package/docs/b2b-docs/B2B_TABLE_INDICES.ts +496 -0
- package/package.json +1 -1
package/dist/apify.js
CHANGED
|
@@ -5,8 +5,9 @@ exports.run = run;
|
|
|
5
5
|
exports.search = search;
|
|
6
6
|
exports.getInputSchema = getInputSchema;
|
|
7
7
|
const queue_1 = require("./queue");
|
|
8
|
-
const API_URL = process.env.ORANGESLICE_API_URL
|
|
9
|
-
"
|
|
8
|
+
const API_URL = process.env.ORANGESLICE_API_URL
|
|
9
|
+
? process.env.ORANGESLICE_API_URL.replace(/\?.*/, "") + "?functionId=apify"
|
|
10
|
+
: "https://orangeslice.ai/api/function?functionId=apify";
|
|
10
11
|
// Rate limit: 2 concurrent, 500ms between requests (Apify runs can be expensive)
|
|
11
12
|
const queue = (0, queue_1.createQueue)(2);
|
|
12
13
|
const rateLimiter = (0, queue_1.createRateLimiter)(500);
|
package/dist/generateObject.js
CHANGED
|
@@ -4,8 +4,9 @@ exports.generateObject = void 0;
|
|
|
4
4
|
exports.generate = generate;
|
|
5
5
|
exports.extract = extract;
|
|
6
6
|
const queue_1 = require("./queue");
|
|
7
|
-
const API_URL = process.env.ORANGESLICE_API_URL
|
|
8
|
-
|
|
7
|
+
const API_URL = process.env.ORANGESLICE_API_URL
|
|
8
|
+
? process.env.ORANGESLICE_API_URL.replace(/\?.*/, "") + "?functionId=generateObject"
|
|
9
|
+
: "https://orangeslice.ai/api/function?functionId=generateObject";
|
|
9
10
|
// Rate limit: 2 concurrent, 200ms between requests
|
|
10
11
|
const queue = (0, queue_1.createQueue)(2);
|
|
11
12
|
const rateLimiter = (0, queue_1.createRateLimiter)(200);
|
package/dist/geo.js
CHANGED
|
@@ -5,8 +5,9 @@ exports.parseAddress = parseAddress;
|
|
|
5
5
|
exports.geocode = geocode;
|
|
6
6
|
exports.getCityState = getCityState;
|
|
7
7
|
const queue_1 = require("./queue");
|
|
8
|
-
const API_URL = process.env.ORANGESLICE_API_URL
|
|
9
|
-
|
|
8
|
+
const API_URL = process.env.ORANGESLICE_API_URL
|
|
9
|
+
? process.env.ORANGESLICE_API_URL.replace(/\?.*/, "") + "?functionId=geo"
|
|
10
|
+
: "https://orangeslice.ai/api/function?functionId=geo";
|
|
10
11
|
// Rate limit: 2 concurrent, 100ms between requests
|
|
11
12
|
const queue = (0, queue_1.createQueue)(2);
|
|
12
13
|
const rateLimiter = (0, queue_1.createRateLimiter)(100);
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
# B2B Cross-Table Query Test Findings
|
|
2
|
+
|
|
3
|
+
Comprehensive performance comparison between normalized tables (`linkedin_profile`, `linkedin_company`) and denormalized views (`lkd_profile`, `lkd_company`) for cross-table queries.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Executive Summary
|
|
8
|
+
|
|
9
|
+
| Pattern | Normalized | Denormalized | Winner | Speedup |
|
|
10
|
+
| ---------------------------------- | ---------- | ------------ | ------------ | ------- |
|
|
11
|
+
| **Company ID lookup → employees** | 48ms | 279ms | Normalized | 5.8x |
|
|
12
|
+
| **Company name (org) search** | 274ms | 8,600ms | Normalized | 31x |
|
|
13
|
+
| **GIN-indexed org ILIKE** | 430ms | 29,409ms | Normalized | 68x |
|
|
14
|
+
| **Title ILIKE (common term)** | 64ms | 313ms | Normalized | 4.9x |
|
|
15
|
+
| **updated_at filter** | 4ms | 14ms | Normalized | 3.5x |
|
|
16
|
+
| **Company ID direct lookup** | 4ms | 31ms | Normalized | 7.8x |
|
|
17
|
+
| **Headline (rare term)** | 2,530ms | 1,258ms | Denormalized | 2x |
|
|
18
|
+
| **Skill array search** | 216ms | 169ms | Denormalized | 1.3x |
|
|
19
|
+
| **Industry + employee_count** | 742ms | 202ms | Denormalized | 3.7x |
|
|
20
|
+
| **Headline + company size (JOIN)** | 20,205ms | 217ms | Denormalized | 93x |
|
|
21
|
+
| **Multi-skill + company size** | 28,173ms | 1,281ms | Denormalized | 22x |
|
|
22
|
+
| **Skill + company industry** | TIMEOUT | 3,553ms | Denormalized | ∞ |
|
|
23
|
+
| **Complex multi-filter + company** | TIMEOUT | 4,947ms | Denormalized | ∞ |
|
|
24
|
+
| **AI company + SF location** | TIMEOUT | 11,061ms | Denormalized | ∞ |
|
|
25
|
+
|
|
26
|
+
**Key Finding**: When combining profile text filters (headline, skills) with company constraints (employee_count, industry), **denormalized JOINs are 20-93x faster** and are often the only option that completes within the timeout.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Critical Pattern: Profile + Company Combined Filters
|
|
31
|
+
|
|
32
|
+
The most important discovery: **cross-table queries with text filters perform dramatically better with denormalized tables**.
|
|
33
|
+
|
|
34
|
+
### Normalized Multi-JOIN (Often Fails)
|
|
35
|
+
|
|
36
|
+
```sql
|
|
37
|
+
-- ❌ TIMEOUT or 20+ seconds
|
|
38
|
+
SELECT lp.id, lp.first_name, lp.headline, lc.company_name
|
|
39
|
+
FROM linkedin_profile lp
|
|
40
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
41
|
+
JOIN linkedin_company lc ON lc.id = pos.linkedin_company_id
|
|
42
|
+
WHERE pos.end_date IS NULL
|
|
43
|
+
AND lp.headline ILIKE '%engineer%'
|
|
44
|
+
AND lc.employee_count > 1000
|
|
45
|
+
LIMIT 50
|
|
46
|
+
-- Result: 20,205ms
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Denormalized JOIN (Fast)
|
|
50
|
+
|
|
51
|
+
```sql
|
|
52
|
+
-- ✅ 217ms - 93x faster
|
|
53
|
+
SELECT lkd.profile_id, lkd.first_name, lkd.headline, lkdc.name
|
|
54
|
+
FROM lkd_profile lkd
|
|
55
|
+
JOIN lkd_company lkdc ON lkdc.linkedin_company_id = lkd.linkedin_company_id
|
|
56
|
+
WHERE lkd.headline ILIKE '%engineer%'
|
|
57
|
+
AND lkdc.employee_count > 1000
|
|
58
|
+
LIMIT 50
|
|
59
|
+
-- Result: 217ms
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Test Results by Category
|
|
65
|
+
|
|
66
|
+
### A. Company-First Queries
|
|
67
|
+
|
|
68
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
69
|
+
| ---- | ------------------------------- | ---------- | ------------ | ----------------- |
|
|
70
|
+
| A1 | Employees at company ID | **48ms** | 279ms | Normalized (5.8x) |
|
|
71
|
+
| A2 | Employees by company name (org) | **274ms** | 8,600ms | Normalized (31x) |
|
|
72
|
+
| A3 | Engineers at large companies | **96ms** | 234ms | Normalized (2.4x) |
|
|
73
|
+
|
|
74
|
+
**Conclusion**: For company-first queries, normalized tables win due to indexed lookups.
|
|
75
|
+
|
|
76
|
+
### B. Profile-First Queries
|
|
77
|
+
|
|
78
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
79
|
+
| ---- | -------------------------- | ---------- | ------------ | ------------------- |
|
|
80
|
+
| B1 | Python developers | 216ms | **169ms** | Denormalized (1.3x) |
|
|
81
|
+
| B2 | US Data Scientists | 644ms | **557ms** | Denormalized (1.2x) |
|
|
82
|
+
| B3 | Senior engineers + company | 4,535ms | **196ms** | Denormalized (23x) |
|
|
83
|
+
|
|
84
|
+
**Conclusion**: Simple profile queries are similar; profile + company queries favor denormalized.
|
|
85
|
+
|
|
86
|
+
### C. Complex Prospecting Queries
|
|
87
|
+
|
|
88
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
89
|
+
| ---- | ----------------------------------------- | ----------- | ------------ | ----------------- |
|
|
90
|
+
| C1 | Decision makers at funded startups | **1,198ms** | 3,124ms | Normalized (2.6x) |
|
|
91
|
+
| C2 | AI company employees in SF | TIMEOUT | **11,061ms** | Denormalized (∞) |
|
|
92
|
+
| C3 | Hybrid (normalized profile + lkd_company) | 9,631ms | - | - |
|
|
93
|
+
|
|
94
|
+
**Conclusion**: When the funding table is used (an indexed JOIN), normalized wins. When text filters span both the profile and company tables, denormalized wins.
|
|
95
|
+
|
|
96
|
+
### D. Company Lookups
|
|
97
|
+
|
|
98
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
99
|
+
| ---- | -------------------------- | ---------- | ------------ | ------------------- |
|
|
100
|
+
| D1 | Company by ID | **4ms** | 31ms | Normalized (7.8x) |
|
|
101
|
+
| D2 | Industry + employee filter | 742ms | **202ms** | Denormalized (3.7x) |
|
|
102
|
+
|
|
103
|
+
### E. Edge Cases
|
|
104
|
+
|
|
105
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
106
|
+
| ---- | --------------------- | ---------- | ------------ | ------------------- |
|
|
107
|
+
| E1 | Headline (blockchain) | 713ms | **384ms** | Denormalized (1.9x) |
|
|
108
|
+
| E2 | Company description | 144ms | 152ms | Tie |
|
|
109
|
+
|
|
110
|
+
### F. Verification Tests
|
|
111
|
+
|
|
112
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
113
|
+
| ---- | ---------------------------- | ---------- | ------------ | ------------------- |
|
|
114
|
+
| F1 | Multi-skill + company size | 28,173ms | **1,281ms** | Denormalized (22x) |
|
|
115
|
+
| F2 | Country + org (GIN) | **990ms** | 4,594ms | Normalized (4.6x) |
|
|
116
|
+
| F3 | Title regex + company filter | 434ms | **227ms** | Denormalized (1.9x) |
|
|
117
|
+
|
|
118
|
+
### G. Index Pattern Tests
|
|
119
|
+
|
|
120
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
121
|
+
| ---- | ------------------------- | ---------- | ------------ | ----------------- |
|
|
122
|
+
| G1 | org ILIKE (GIN indexed) | **430ms** | 29,409ms | Normalized (68x) |
|
|
123
|
+
| G2 | headline ILIKE (no index) | 2,530ms | **1,258ms** | Denormalized (2x) |
|
|
124
|
+
| G3 | title ILIKE | **64ms** | 313ms | Normalized (4.9x) |
|
|
125
|
+
| G4 | updated_at filter | **4ms** | 14ms | Normalized (3.5x) |
|
|
126
|
+
|
|
127
|
+
### H. Cross-Table JOIN Patterns
|
|
128
|
+
|
|
129
|
+
| Test | Query | Normalized | Denormalized | Winner |
|
|
130
|
+
| ---- | ------------------------- | ---------- | ------------ | ------------------ |
|
|
131
|
+
| H1 | Headline + employee_count | 20,205ms | **217ms** | Denormalized (93x) |
|
|
132
|
+
| H2 | Skill + company industry | TIMEOUT | **3,553ms** | Denormalized (∞) |
|
|
133
|
+
| H3 | Multi-filter + company | TIMEOUT | **4,947ms** | Denormalized (∞) |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Decision Rules for Cross-Table Queries
|
|
138
|
+
|
|
139
|
+
### Use Normalized (`linkedin_profile` + `linkedin_company` JOINs) When:
|
|
140
|
+
|
|
141
|
+
1. **Company-first lookup** - Start with company ID, get employees
|
|
142
|
+
2. **GIN-indexed field** - Searching `linkedin_profile.org` (company name)
|
|
143
|
+
3. **Indexed lookups** - `updated_at`, company ID, profile ID
|
|
144
|
+
4. **Title field search** - A plain `linkedin_profile.title` ILIKE search is 4.9x faster than the denormalized equivalent (64ms vs 313ms)
|
|
145
|
+
5. **Indexed JOIN tables** - `linkedin_crunchbase_funding`, `linkedin_profile_position3` by company
|
|
146
|
+
|
|
147
|
+
### Use Denormalized (`lkd_profile` JOIN `lkd_company`) When:
|
|
148
|
+
|
|
149
|
+
1. **Headline + company filter** - 93x faster
|
|
150
|
+
2. **Skill + company constraint** - Normalized times out
|
|
151
|
+
3. **Multi-filter combinations** - 22x faster
|
|
152
|
+
4. **Industry + employee_count** - 3.7x faster
|
|
153
|
+
5. **Text filter spanning profile + company** - Often only option
|
|
154
|
+
|
|
155
|
+
### Never Use:
|
|
156
|
+
|
|
157
|
+
1. `lkd_profile.company_name` ILIKE - Use `linkedin_profile.org` (68x faster)
|
|
158
|
+
2. Normalized multi-JOIN with headline filter - Will timeout or be 20s+
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Recommended Query Patterns
|
|
163
|
+
|
|
164
|
+
### Pattern 1: Find Employees at Company by Name
|
|
165
|
+
|
|
166
|
+
```sql
|
|
167
|
+
-- ✅ BEST: Use GIN-indexed org field
|
|
168
|
+
SELECT id, first_name, title, headline, org
|
|
169
|
+
FROM linkedin_profile
|
|
170
|
+
WHERE org ILIKE '%Google%'
|
|
171
|
+
LIMIT 50
|
|
172
|
+
-- Result: 274ms
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Pattern 2: Find Engineers at Large Companies
|
|
176
|
+
|
|
177
|
+
```sql
|
|
178
|
+
-- ✅ BEST: Denormalized JOIN (93x faster)
|
|
179
|
+
SELECT lkd.profile_id, lkd.first_name, lkd.headline, lkdc.name, lkdc.employee_count
|
|
180
|
+
FROM lkd_profile lkd
|
|
181
|
+
JOIN lkd_company lkdc ON lkdc.linkedin_company_id = lkd.linkedin_company_id
|
|
182
|
+
WHERE lkd.headline ILIKE '%engineer%'
|
|
183
|
+
AND lkdc.employee_count > 1000
|
|
184
|
+
LIMIT 50
|
|
185
|
+
-- Result: 217ms
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Pattern 3: Find People with Skills at Specific Company Types
|
|
189
|
+
|
|
190
|
+
```sql
|
|
191
|
+
-- ✅ BEST: Denormalized (normalized times out)
|
|
192
|
+
SELECT lkd.profile_id, lkd.first_name, lkd.headline, lkdc.name
|
|
193
|
+
FROM lkd_profile lkd
|
|
194
|
+
JOIN lkd_company lkdc ON lkdc.linkedin_company_id = lkd.linkedin_company_id
|
|
195
|
+
WHERE 'Python' = ANY(lkd.skills)
|
|
196
|
+
AND 'SQL' = ANY(lkd.skills)
|
|
197
|
+
AND lkdc.employee_count BETWEEN 100 AND 5000
|
|
198
|
+
LIMIT 50
|
|
199
|
+
-- Result: 1,281ms (normalized: 28,173ms)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Pattern 4: Prospecting Query (Profile Criteria + Company Criteria)
|
|
203
|
+
|
|
204
|
+
```sql
|
|
205
|
+
-- ✅ BEST: Denormalized for multi-filter
|
|
206
|
+
SELECT lkd.profile_id, lkd.first_name, lkd.title, lkdc.name, lkdc.employee_count
|
|
207
|
+
FROM lkd_profile lkd
|
|
208
|
+
JOIN lkd_company lkdc ON lkdc.linkedin_company_id = lkd.linkedin_company_id
|
|
209
|
+
WHERE lkd.title ~* '(manager|director|lead)'
|
|
210
|
+
AND lkdc.employee_count BETWEEN 100 AND 1000
|
|
211
|
+
LIMIT 50
|
|
212
|
+
-- Result: 227ms (normalized: 434ms)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Pattern 5: Decision Makers at Funded Startups
|
|
216
|
+
|
|
217
|
+
```sql
|
|
218
|
+
-- ✅ BEST: Normalized when using indexed funding table
|
|
219
|
+
SELECT DISTINCT lp.id, lp.first_name, lp.title, lc.company_name
|
|
220
|
+
FROM linkedin_profile lp
|
|
221
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
222
|
+
JOIN linkedin_company lc ON lc.id = pos.linkedin_company_id
|
|
223
|
+
JOIN linkedin_crunchbase_funding cf ON cf.linkedin_company_id = lc.id
|
|
224
|
+
WHERE pos.end_date IS NULL
|
|
225
|
+
AND lp.title ~* '(CEO|CTO|VP|Director|Head)'
|
|
226
|
+
AND lc.employee_count BETWEEN 10 AND 500
|
|
227
|
+
LIMIT 50
|
|
228
|
+
-- Result: 1,198ms
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## Summary: The Cross-Table Golden Rules
|
|
234
|
+
|
|
235
|
+
1. **Company name search** → Always use `linkedin_profile.org` (GIN indexed, 68x faster)
|
|
236
|
+
2. **Headline/skill + company constraint** → Always use denormalized JOIN (20-93x faster, normalized often times out)
|
|
237
|
+
3. **Company-first lookups** → Use normalized (5-8x faster)
|
|
238
|
+
4. **Indexed table JOINs (funding, positions)** → Normalized is fine
|
|
239
|
+
5. **Multi-filter profile + company** → Denormalized is often the only option that completes (normalized frequently times out)
|
|
240
|
+
|
|
241
|
+
### Quick Decision:
|
|
242
|
+
|
|
243
|
+
```
|
|
244
|
+
Need to search by company name?
|
|
245
|
+
└─ YES → Use linkedin_profile.org
|
|
246
|
+
|
|
247
|
+
Need profile text filter (headline/skills) + company constraint?
|
|
248
|
+
└─ YES → Use lkd_profile JOIN lkd_company
|
|
249
|
+
|
|
250
|
+
Need company ID lookup or indexed JOIN?
|
|
251
|
+
└─ YES → Use normalized tables
|
|
252
|
+
|
|
253
|
+
Default for prospecting queries:
|
|
254
|
+
└─ Use lkd_profile JOIN lkd_company
|
|
255
|
+
```
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# B2B Database Guide
|
|
2
|
+
|
|
3
|
+
A comprehensive B2B data enrichment database with LinkedIn profiles, companies, job postings, and funding data.
|
|
4
|
+
|
|
5
|
+
## Database Scale
|
|
6
|
+
|
|
7
|
+
| Table | Estimated Rows |
|
|
8
|
+
| ----------------------------- | ---------------- |
|
|
9
|
+
| `linkedin_profile` | **1.15 billion** |
|
|
10
|
+
| `linkedin_profile_position3` | **2.6 billion** |
|
|
11
|
+
| `linkedin_job` | **1.48 billion** |
|
|
12
|
+
| `linkedin_profile_education2` | **965 million** |
|
|
13
|
+
| `linkedin_profile_slug` | **1.14 billion** |
|
|
14
|
+
| `person` | **1.32 billion** |
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Access Permissions
|
|
19
|
+
|
|
20
|
+
### ✅ Tables WITH Access (48 tables)
|
|
21
|
+
|
|
22
|
+
| Category | Tables |
|
|
23
|
+
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
24
|
+
| **Core LinkedIn** | `linkedin_profile`, `linkedin_company`, `linkedin_job`, `linkedin_profile_position3`, `linkedin_profile_education2` |
|
|
25
|
+
| **Slugs** | `linkedin_profile_slug`, `linkedin_company_slug` |
|
|
26
|
+
| **Funding** | `linkedin_crunchbase_funding` |
|
|
27
|
+
| **Reference** | `linkedin_industry`, `linkedin_school`, `linkedin_language`, `linkedin_specialty`, `linkedin_work_modality` |
|
|
28
|
+
| **Profile Details** | `linkedin_profile_certification`, `linkedin_profile_award`, `linkedin_profile_project`, `linkedin_profile_volunteer_experience`, `linkedin_profile_recommendation2`, `linkedin_profile_test_scores`, `linkedin_profile_language_proficiency` |
|
|
29
|
+
| **Company Details** | `linkedin_company_address2`, `linkedin_company_post` |
|
|
30
|
+
| **Content** | `linkedin_article`, `linkedin_patent`, `linkedin_project`, `linkedin_publication2` |
|
|
31
|
+
| **Jobs** | `job_title`, `job_function`, `job_seniority`, `job_employment_type`, `job_academic_qualification` |
|
|
32
|
+
| **Geography** | `country`, `locality`, `naics_code` |
|
|
33
|
+
| **Other** | `company`, `company_type`, `person`, `language_proficiency` |
|
|
34
|
+
|
|
35
|
+
### ❌ Tables WITHOUT Access (84 tables)
|
|
36
|
+
|
|
37
|
+
| Category | Tables (Permission Denied) |
|
|
38
|
+
| --------------------- | ----------------------------------------------------------------- |
|
|
39
|
+
| **Contact Data** | `email_address`, `email_address_linkedin_profile`, `phone_number` |
|
|
40
|
+
| **Company Reference** | `company_size`, `company_location`, `company_country` |
|
|
41
|
+
| **Geography** | `linkedin_geo` |
|
|
42
|
+
| **Indeed** | All `indeed_*` tables |
|
|
43
|
+
| **Domain/Web** | `domain`, `host`, `domain_traffic_estimate`, `web_tag` |
|
|
44
|
+
| **App Store** | `ios_sdk`, `play_store_sdk`, `itunes_*`, `play_store_*` |
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Top Industries
|
|
49
|
+
|
|
50
|
+
| ID | Industry | Companies |
|
|
51
|
+
| --- | ------------------------------------ | --------- |
|
|
52
|
+
| 48 | Construction | 1.96M |
|
|
53
|
+
| 44 | Real Estate | 1.80M |
|
|
54
|
+
| 96 | IT Services and IT Consulting | 1.79M |
|
|
55
|
+
| 27 | Retail | 1.51M |
|
|
56
|
+
| 80 | Advertising Services | 1.33M |
|
|
57
|
+
| 11 | Business Consulting and Services | 1.32M |
|
|
58
|
+
| 4 | **Software Development** | 1.19M |
|
|
59
|
+
| 6 | Technology, Information and Internet | 617K |
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## GTM Query Examples
|
|
64
|
+
|
|
65
|
+
### 🏢 Company Enrichment
|
|
66
|
+
|
|
67
|
+
**Find company by universal_name (FAST ~300ms)**
|
|
68
|
+
|
|
69
|
+
```sql
|
|
70
|
+
SELECT id, company_name, domain, website, employee_count,
|
|
71
|
+
locality, country_code, description
|
|
72
|
+
FROM linkedin_company
|
|
73
|
+
WHERE universal_name = 'stripe';
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**Find company by domain (FAST ~500ms)**
|
|
77
|
+
|
|
78
|
+
```sql
|
|
79
|
+
SELECT id, company_name, universal_name, employee_count, locality
|
|
80
|
+
FROM linkedin_company
|
|
81
|
+
WHERE domain = 'openai.com';
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Full company enrichment with industry**
|
|
85
|
+
|
|
86
|
+
```sql
|
|
87
|
+
SELECT lc.company_name, lc.domain, lc.employee_count, lc.locality,
|
|
88
|
+
lc.description, li.name as industry_name
|
|
89
|
+
FROM linkedin_company lc
|
|
90
|
+
LEFT JOIN linkedin_industry li ON li.id = lc.industry_code
|
|
91
|
+
WHERE lc.universal_name = 'openai';
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### 👤 Profile Lookup
|
|
95
|
+
|
|
96
|
+
**Find profile by LinkedIn slug (FAST ~400ms - use key64)**
|
|
97
|
+
|
|
98
|
+
```sql
|
|
99
|
+
SELECT lp.first_name, lp.last_name, lp.headline, lp.location_name
|
|
100
|
+
FROM linkedin_profile lp
|
|
101
|
+
JOIN linkedin_profile_slug lps ON lps.linkedin_profile_id = lp.id
|
|
102
|
+
WHERE lps.slug_key64 = key64('satyanadella');
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Find company by LinkedIn slug (FAST ~400ms - use key64)**
|
|
106
|
+
|
|
107
|
+
```sql
|
|
108
|
+
SELECT lc.id, lc.company_name, lc.domain, lc.employee_count
|
|
109
|
+
FROM linkedin_company lc
|
|
110
|
+
JOIN linkedin_company_slug lcs ON lcs.linkedin_company_id = lc.id
|
|
111
|
+
WHERE lcs.slug_key64 = key64('meta');
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### 👥 Find Employees at a Company
|
|
115
|
+
|
|
116
|
+
**Current employees at a company**
|
|
117
|
+
|
|
118
|
+
```sql
|
|
119
|
+
SELECT lp.first_name, lp.last_name, lp.headline, pos.title
|
|
120
|
+
FROM linkedin_profile lp
|
|
121
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
122
|
+
WHERE pos.linkedin_company_id = 2135371 -- Stripe's ID
|
|
123
|
+
AND pos.end_date IS NULL
|
|
124
|
+
LIMIT 100;
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Headcount by role at a company**
|
|
128
|
+
|
|
129
|
+
```sql
|
|
130
|
+
SELECT pos.title, COUNT(*) as count
|
|
131
|
+
FROM linkedin_profile_position3 pos
|
|
132
|
+
WHERE pos.linkedin_company_id = 11130470 -- OpenAI
|
|
133
|
+
AND pos.end_date IS NULL
|
|
134
|
+
GROUP BY pos.title
|
|
135
|
+
ORDER BY count DESC
|
|
136
|
+
LIMIT 20;
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### 🎯 Find Decision Makers
|
|
140
|
+
|
|
141
|
+
**VPs of Sales at mid-to-large companies**
|
|
142
|
+
|
|
143
|
+
```sql
|
|
144
|
+
SELECT lp.first_name, lp.last_name, lp.headline,
|
|
145
|
+
pos.title, lc.company_name, lc.employee_count
|
|
146
|
+
FROM linkedin_profile lp
|
|
147
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
148
|
+
JOIN linkedin_company lc ON lc.id = pos.linkedin_company_id
|
|
149
|
+
WHERE pos.title ILIKE '%vp%sales%'
|
|
150
|
+
AND pos.end_date IS NULL
|
|
151
|
+
AND lc.employee_count > 50
|
|
152
|
+
LIMIT 20;
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**C-suite at mid-size software companies (~10 seconds)**
|
|
156
|
+
|
|
157
|
+
```sql
|
|
158
|
+
SELECT lp.first_name, lp.last_name, pos.title,
|
|
159
|
+
lc.company_name, lc.employee_count
|
|
160
|
+
FROM linkedin_profile lp
|
|
161
|
+
JOIN linkedin_profile_position3 pos ON pos.linkedin_profile_id = lp.id
|
|
162
|
+
JOIN linkedin_company lc ON lc.id = pos.linkedin_company_id
|
|
163
|
+
WHERE pos.end_date IS NULL
|
|
164
|
+
AND lc.employee_count BETWEEN 100 AND 500
|
|
165
|
+
AND lc.industry_code = 4 -- Software Development
|
|
166
|
+
AND (pos.title ILIKE 'ceo%' OR pos.title ILIKE 'cto%'
|
|
167
|
+
OR pos.title ILIKE 'cfo%' OR pos.title ILIKE 'chief%')
|
|
168
|
+
LIMIT 20;
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 🔍 Find People by Skills
|
|
172
|
+
|
|
173
|
+
```sql
|
|
174
|
+
SELECT lp.first_name, lp.last_name, lp.headline,
|
|
175
|
+
lp.skills[1:5] as top_skills, lp.location_name
|
|
176
|
+
FROM linkedin_profile lp
|
|
177
|
+
WHERE 'Python' = ANY(lp.skills)
|
|
178
|
+
AND 'Machine Learning' = ANY(lp.skills)
|
|
179
|
+
LIMIT 20;
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### 🎓 Find Alumni
|
|
183
|
+
|
|
184
|
+
```sql
|
|
185
|
+
SELECT lp.first_name, lp.last_name, lp.headline,
|
|
186
|
+
edu.school_name, edu.degree
|
|
187
|
+
FROM linkedin_profile lp
|
|
188
|
+
JOIN linkedin_profile_education2 edu ON edu.linkedin_profile_id = lp.id
|
|
189
|
+
WHERE edu.school_name ILIKE '%stanford%'
|
|
190
|
+
LIMIT 20;
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### 📍 Find People by Location
|
|
194
|
+
|
|
195
|
+
```sql
|
|
196
|
+
SELECT lp.first_name, lp.last_name, lp.headline, lp.location_name
|
|
197
|
+
FROM linkedin_profile lp
|
|
198
|
+
WHERE lp.location_name ILIKE '%san francisco%'
|
|
199
|
+
LIMIT 20;
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### 💼 Job Postings
|
|
203
|
+
|
|
204
|
+
**Recent jobs at a company with salary info**
|
|
205
|
+
|
|
206
|
+
```sql
|
|
207
|
+
SELECT lj.title, lj.location, lj.salary_range,
|
|
208
|
+
lj.salary_min, lj.salary_max, lj.posted_date, lj.applicants
|
|
209
|
+
FROM linkedin_job lj
|
|
210
|
+
WHERE lj.linkedin_company_id = 2135371 -- Stripe
|
|
211
|
+
ORDER BY lj.posted_date DESC NULLS LAST
|
|
212
|
+
LIMIT 20;
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### 💰 Companies with Funding
|
|
216
|
+
|
|
217
|
+
```sql
|
|
218
|
+
SELECT lc.company_name, lc.domain, lc.employee_count,
|
|
219
|
+
cf.round_name, cf.round_date, cf.round_amount,
|
|
220
|
+
cf.investor_names[1:3] as top_investors
|
|
221
|
+
FROM linkedin_company lc
|
|
222
|
+
JOIN linkedin_crunchbase_funding cf ON cf.linkedin_company_id = lc.id
|
|
223
|
+
WHERE cf.round_date >= '2024-01-01'
|
|
224
|
+
ORDER BY cf.round_date DESC
|
|
225
|
+
LIMIT 20;
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Performance Guide
|
|
231
|
+
|
|
232
|
+
### ✅ Fast Queries (indexed lookups)
|
|
233
|
+
|
|
234
|
+
| Lookup Method | Speed | Example |
|
|
235
|
+
| ---------------------------------- | ------ | ---------------------------------- |
|
|
236
|
+
| Company by `universal_name` | ~300ms | `WHERE universal_name = 'stripe'` |
|
|
237
|
+
| Company by `domain` | ~500ms | `WHERE domain = 'stripe.com'` |
|
|
238
|
+
| Company by `id` | ~300ms | `WHERE id = 2135371` |
|
|
239
|
+
| Profile/Company by `slug_key64` | ~400ms | `WHERE slug_key64 = key64('meta')` |
|
|
240
|
+
| Positions by `linkedin_company_id` | ~500ms | `WHERE linkedin_company_id = X` |
|
|
241
|
+
| Positions by `linkedin_profile_id` | ~500ms | `WHERE linkedin_profile_id = X` |
|
|
242
|
+
| Jobs by `linkedin_company_id` | ~1s | `WHERE linkedin_company_id = X` |
|
|
243
|
+
| Education by `linkedin_profile_id` | ~500ms | `WHERE linkedin_profile_id = X` |
|
|
244
|
+
|
|
245
|
+
### ⚠️ Slow Queries (full table scans)
|
|
246
|
+
|
|
247
|
+
| Query Type | Speed | Reason |
|
|
248
|
+
| ----------------------------------- | --------- | ------------------------ |
|
|
249
|
+
| Company by `company_name ILIKE` | Timeout | No index on company_name |
|
|
250
|
+
| Profile by `slug` (without key64) | Timeout | No index on raw slug |
|
|
251
|
+
| Profile by `headline ILIKE`         | Very slow | Full table scan          |
|
|
252
|
+
| Aggregate queries across industries | 60s+ | Large table joins |
|
|
253
|
+
| Profile by `location_name ILIKE` | Slow | No index |
|
|
254
|
+
|
|
255
|
+
### 🔑 Key Functions
|
|
256
|
+
|
|
257
|
+
```sql
|
|
258
|
+
-- Convert slug to indexed key64 for fast lookups
|
|
259
|
+
key64('stripe') -- Returns bigint for index lookup
|
|
260
|
+
|
|
261
|
+
-- Example: Fast profile lookup
|
|
262
|
+
WHERE slug_key64 = key64('satyanadella')
|
|
263
|
+
|
|
264
|
+
-- Example: Fast company lookup
|
|
265
|
+
WHERE slug_key64 = key64('meta')
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## Common Gotchas
|
|
271
|
+
|
|
272
|
+
1. **Use `company_name` not `name`** - The column is `company_name` on linkedin_company
|
|
273
|
+
2. **Use `slug_key64` for slug lookups** - Raw `slug` column is not indexed
|
|
274
|
+
3. **Use `linkedin_profile_position3`** - position2 is legacy
|
|
275
|
+
4. **Use `industry_code`** - Not `linkedin_industry_id` on linkedin_company
|
|
276
|
+
5. **NULL `end_date` = current position** - Filter on `end_date IS NULL` to return only current employees
|
|
277
|
+
6. **Email/phone tables are restricted** - Cannot access contact data directly
|
|
278
|
+
7. **Always use LIMIT** - Tables are massive
|
|
279
|
+
8. **Use `ILIKE` for case-insensitive matching** - Names vary in casing
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## Schema Exploration
|
|
284
|
+
|
|
285
|
+
```sql
|
|
286
|
+
-- List accessible tables
|
|
287
|
+
SELECT table_name
|
|
288
|
+
FROM information_schema.table_privileges
|
|
289
|
+
WHERE grantee = 'jzt2be9botwq' AND privilege_type = 'SELECT'
|
|
290
|
+
ORDER BY table_name;
|
|
291
|
+
|
|
292
|
+
-- Describe a table
|
|
293
|
+
SELECT column_name, data_type, is_nullable
|
|
294
|
+
FROM information_schema.columns
|
|
295
|
+
WHERE table_name = 'linkedin_profile';
|
|
296
|
+
|
|
297
|
+
-- Check table indexes
|
|
298
|
+
SELECT indexname, indexdef
|
|
299
|
+
FROM pg_indexes
|
|
300
|
+
WHERE tablename = 'linkedin_company';
|
|
301
|
+
|
|
302
|
+
-- Check table row counts
|
|
303
|
+
SELECT relname AS table_name, reltuples::bigint AS estimated_rows
|
|
304
|
+
FROM pg_class
|
|
305
|
+
WHERE relkind = 'r' AND relnamespace = 'public'::regnamespace
|
|
306
|
+
ORDER BY reltuples DESC
|
|
307
|
+
LIMIT 25;
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Related Documentation
|
|
313
|
+
|
|
314
|
+
- **[B2B_SCHEMA.md](./B2B_SCHEMA.md)** - Complete schema reference with all columns and indexes
|