@hasna/experts 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -13
- package/dist/cli/index.js +1842 -74
- package/dist/connectors.d.ts +63 -4
- package/dist/connectors.d.ts.map +1 -1
- package/dist/contacts.d.ts +96 -0
- package/dist/contacts.d.ts.map +1 -0
- package/dist/crawl.d.ts +1 -0
- package/dist/crawl.d.ts.map +1 -1
- package/dist/db.d.ts +97 -2
- package/dist/db.d.ts.map +1 -1
- package/dist/embed.d.ts +57 -0
- package/dist/embed.d.ts.map +1 -0
- package/dist/enrich.d.ts +81 -1
- package/dist/enrich.d.ts.map +1 -1
- package/dist/format.d.ts +4 -1
- package/dist/format.d.ts.map +1 -1
- package/dist/identity.d.ts +23 -0
- package/dist/identity.d.ts.map +1 -0
- package/dist/index.d.ts +7 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1544 -21
- package/dist/score.d.ts +25 -0
- package/dist/score.d.ts.map +1 -0
- package/dist/sdk.d.ts +26 -1
- package/dist/sdk.d.ts.map +1 -1
- package/dist/sdk.js +12 -1
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +960 -14
- package/dist/sources/adplist.d.ts +43 -0
- package/dist/sources/adplist.d.ts.map +1 -0
- package/dist/sources/clarity.d.ts +37 -0
- package/dist/sources/clarity.d.ts.map +1 -0
- package/dist/sources/common.d.ts +14 -0
- package/dist/sources/common.d.ts.map +1 -0
- package/dist/sources/glg.d.ts +36 -0
- package/dist/sources/glg.d.ts.map +1 -0
- package/dist/sources/index.d.ts +5 -1
- package/dist/sources/index.d.ts.map +1 -1
- package/dist/sources/mentorcruise.d.ts +47 -0
- package/dist/sources/mentorcruise.d.ts.map +1 -0
- package/dist/sync.d.ts +71 -0
- package/dist/sync.d.ts.map +1 -0
- package/dist/types.d.ts +34 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
package/dist/cli/index.js
CHANGED
|
@@ -32,6 +32,206 @@ function expertText(e) {
|
|
|
32
32
|
return [e.title, e.headline, e.bio].filter(Boolean).join(". ");
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
// src/score.ts
|
|
36
|
+
/** Default contribution of each signal to the authority score. */
var DEFAULT_WEIGHTS = {
  rating: 0.3,
  reviews: 0.2,
  followers: 0.25,
  featured: 0.1,
  verified: 0.05,
  recency: 0.1
};
/** Clamp a number into the closed interval [0, 1]. */
var clamp01 = (n) => Math.max(0, Math.min(1, n));
/** Log-scale `x` against `cap`: 0 maps to 0 and anything at or above `cap` maps to 1. */
var logNorm = (x, cap) => clamp01(Math.log10(1 + Math.max(0, x)) / Math.log10(1 + cap));
/**
 * Combine an expert's rating, review count, follower count, featured/verified
 * flags and tweet recency into a single score rounded to one decimal place.
 *
 * @param {object} e - Expert record; reads `rating`, `ratingCount`, `featured`, `verified`.
 * @param {object} [inputs] - Optional `followers` and `daysSinceLastTweet`.
 * @param {object} [weights] - Per-signal weights; any key left out falls back to DEFAULT_WEIGHTS.
 * @returns {number} Weighted sum scaled by 100 (0-100 with the default weights).
 */
function authorityScore(e, inputs = {}, weights = DEFAULT_WEIGHTS) {
  // Merge with the defaults so a partial override cannot leave a weight
  // undefined and turn the whole sum into NaN.
  const w = { ...DEFAULT_WEIGHTS, ...weights };
  const extras = inputs ?? {};
  const rating = clamp01((e.rating || 0) / 5);
  const reviews = logNorm(e.ratingCount || 0, 1000);
  const followers = logNorm(extras.followers ?? 0, 1e6);
  const featured = e.featured ? 1 : 0;
  const verified = e.verified ? 1 : 0;
  // Recency decays linearly to zero over 30 days; a missing or non-finite
  // age counts as "no recent activity" instead of poisoning the score.
  const days = extras.daysSinceLastTweet;
  const recency = days == null || !Number.isFinite(days) ? 0 : clamp01(1 - days / 30);
  const raw = w.rating * rating + w.reviews * reviews + w.followers * followers + w.featured * featured + w.verified * verified + w.recency * recency;
  return Math.round(raw * 1000) / 10;
}
|
|
56
|
+
|
|
57
|
+
// src/embed.ts
|
|
58
|
+
/** Common English function words that are dropped from token streams. */
var STOPWORDS = new Set([
  "the", "a", "an", "and", "or", "of", "to", "in", "for", "on", "at", "is",
  "are", "with", "by", "as", "be", "this", "that", "it", "from", "i", "you",
  "we", "they"
]);
/**
 * Split free text into lowercase word tokens.
 *
 * URLs are removed, tokens shorter than two characters and stopwords are
 * dropped. Diacritics are folded ("José" -> "jose") and letters/digits of any
 * script are kept, so accented or non-Latin text no longer fragments or
 * vanishes. Pure-ASCII input tokenizes exactly as before; vectors previously
 * built from non-ASCII text will differ and should be re-embedded.
 *
 * @param {string|null|undefined} text
 * @returns {string[]}
 */
function tokenize(text) {
  return (text || "")
    .normalize("NFKD")
    // Strip the combining marks NFKD split off so "é" becomes plain "e".
    .replace(/\p{M}+/gu, "")
    .toLowerCase()
    .replace(/https?:\/\/\S+/g, " ")
    .split(/[^\p{L}\p{N}]+/u)
    .filter((t) => t.length >= 2 && !STOPWORDS.has(t));
}
|
|
88
|
+
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of `s`.
 *
 * @param {string} s
 * @returns {number} Unsigned 32-bit integer.
 */
function fnv1a(s) {
  const FNV_OFFSET_BASIS = 2166136261;
  const FNV_PRIME = 16777619;
  let h = FNV_OFFSET_BASIS;
  for (let i = 0; i < s.length; i++) {
    h ^= s.charCodeAt(i);
    // Math.imul performs the 32-bit wrapping multiply that the previous
    // shift-and-add sequence (1 + 2 + 16 + 128 + 256 + 2^24) spelled out by hand.
    h = Math.imul(h, FNV_PRIME) >>> 0;
  }
  return h >>> 0;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Embedder that hashes unigrams and bigrams of the tokenized text into a
 * fixed number of buckets and L2-normalizes the result. Needs no network.
 */
class HashingEmbedder {
  id = "hash-v1";
  dim;
  /**
   * @param {number} [dim=512] Number of buckets (vector length).
   * @throws {RangeError} When `dim` is not a positive integer; such values
   *   previously produced NaN or fractional bucket indexes and silently
   *   dropped every token.
   */
  constructor(dim = 512) {
    if (!Number.isInteger(dim) || dim <= 0) {
      throw new RangeError(`HashingEmbedder dim must be a positive integer, got ${dim}`);
    }
    this.dim = dim;
  }
  /**
   * Embed a single text.
   * @param {string} text
   * @returns {number[]} Unit-length vector, or all zeros when no token survives.
   */
  one(text) {
    const v = new Array(this.dim).fill(0);
    const toks = tokenize(text);
    for (let i = 0; i < toks.length; i++) {
      const uni = toks[i];
      v[fnv1a(uni) % this.dim] += 1;
      if (i + 1 < toks.length) {
        // Adjacent-pair features count half as much as single words.
        const bi = `${uni}_${toks[i + 1]}`;
        v[fnv1a(bi) % this.dim] += 0.5;
      }
    }
    // "|| 1" keeps an empty text as a zero vector instead of dividing by zero.
    const norm = Math.sqrt(v.reduce((s, x) => s + x * x, 0)) || 1;
    return v.map((x) => x / norm);
  }
  /**
   * Embed several texts.
   * @param {string[]} texts
   * @returns {Promise<number[][]>} One vector per input, in input order.
   */
  async embed(texts) {
    return texts.map((t) => this.one(t));
  }
}
|
|
121
|
+
|
|
122
|
+
/**
 * Embedder backed by the OpenAI embeddings HTTP endpoint.
 */
class OpenAIEmbedder {
  id;
  // Starting value only; refreshed from the first response so it matches the
  // model actually in use.
  dim = 1536;
  apiKey;
  model;
  fetchFn;
  /**
   * @param {object} [opts]
   * @param {string} [opts.apiKey] Defaults to the OPENAI_API_KEY environment variable.
   * @param {string} [opts.model] Defaults to "text-embedding-3-small".
   * @param {Function} [opts.fetchFn] fetch-compatible function; defaults to the global fetch.
   */
  constructor(opts = {}) {
    this.apiKey = opts.apiKey ?? process.env.OPENAI_API_KEY ?? "";
    this.model = opts.model ?? "text-embedding-3-small";
    this.fetchFn = opts.fetchFn ?? fetch;
    this.id = `openai:${this.model}`;
  }
  /**
   * Embed a batch of texts with one request.
   *
   * @param {string[]} texts
   * @returns {Promise<number[][]>} One vector per input, in input order.
   * @throws {Error} On a non-2xx response or a body without a `data` array.
   */
  async embed(texts) {
    // Nothing to embed: skip the request entirely.
    if (texts.length === 0) {
      return [];
    }
    const res = await this.fetchFn("https://api.openai.com/v1/embeddings", {
      method: "POST",
      headers: { "content-type": "application/json", authorization: `Bearer ${this.apiKey}` },
      body: JSON.stringify({ model: this.model, input: texts })
    });
    if (!res.ok) {
      throw new Error(`OpenAI embeddings ${res.status}: ${(await res.text()).slice(0, 200)}`);
    }
    const data = await res.json();
    if (!Array.isArray(data?.data)) {
      throw new Error("OpenAI embeddings: response has no data array");
    }
    // Each item carries the index of the input it belongs to; order by it
    // rather than trusting array position. Items without an index keep their
    // relative order because Array.prototype.sort is stable.
    const vectors = [...data.data].sort((a, b) => (a.index ?? 0) - (b.index ?? 0)).map((d) => d.embedding);
    if (vectors[0]?.length) {
      this.dim = vectors[0].length;
    }
    return vectors;
  }
}
|
|
146
|
+
/**
 * Pick the embedder from the environment: the OpenAI one only when
 * EXPERTS_EMBEDDER is "openai" and OPENAI_API_KEY is set, otherwise the
 * local hashing embedder.
 */
function getEmbedder() {
  const { EXPERTS_EMBEDDER: choice, OPENAI_API_KEY: apiKey } = process.env;
  const useOpenAI = choice === "openai" && Boolean(apiKey);
  return useOpenAI ? new OpenAIEmbedder() : new HashingEmbedder();
}
|
|
152
|
+
/**
 * Cosine similarity of two numeric vectors, computed over their common
 * prefix when the lengths differ. Returns 0 when either side has zero norm.
 */
function cosine(a, b) {
  const shared = Math.min(a.length, b.length);
  let dotProduct = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  for (let k = 0; k < shared; k += 1) {
    const x = a[k];
    const y = b[k];
    dotProduct += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
  }
  const denominator = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
|
|
165
|
+
/** Serialize a numeric vector as the raw bytes of a Float32Array. */
function packVector(v) {
  const { buffer } = new Float32Array(v);
  return new Uint8Array(buffer);
}
|
|
169
|
+
/**
 * Decode bytes written by a Float32Array back into a plain number array.
 * A Uint8Array view is first copied out of its backing buffer, honouring the
 * view's offset and length; any other input is handed to Float32Array as is.
 */
function unpackVector(buf) {
  let raw = buf;
  if (buf instanceof Uint8Array) {
    const start = buf.byteOffset;
    raw = buf.buffer.slice(start, start + buf.byteLength);
  }
  return [...new Float32Array(raw)];
}
|
|
173
|
+
/**
 * Build the text that represents an expert for embedding: name, title,
 * headline, bio, topics and tags, with empty parts skipped and the rest
 * joined by ". ".
 *
 * @param {object} e - Expert record; `topics` and `tags` may be absent.
 * @returns {string}
 */
function expertEmbedText(e) {
  // Tolerate records without topics/tags instead of throwing on .join.
  const topics = (e.topics ?? []).join(" ");
  const tags = (e.tags ?? []).join(" ");
  return [e.fullName, e.title, e.headline, e.bio, topics, tags].filter(Boolean).join(". ");
}
|
|
176
|
+
|
|
177
|
+
// src/identity.ts
|
|
178
|
+
/**
 * Derive normalized "platform:handle" identifiers from an expert's social links.
 *
 * For a URL the handle is the first path segment after the host, with a
 * leading "@" removed. Routing segments such as "in" (linkedin.com/in/<name>)
 * or "channel" are skipped; without that, every LinkedIn profile collapsed to
 * the same "linkedin:in" handle. Values that do not look like a URL with a
 * path are used whole. Handles are lowercased.
 *
 * @param {object} e - Expert record; reads `e.socials` (platform -> URL or handle).
 * @returns {Set<string>}
 */
function socialHandles(e) {
  // Path segments that name a route rather than an account.
  const routePrefixes = new Set(["in", "pub", "company", "channel", "c", "user", "u", "profile"]);
  const out = new Set();
  for (const [platform, url] of Object.entries(e.socials || {})) {
    if (!url) {
      continue;
    }
    const raw = String(url);
    let handle = "";
    const m = raw.match(/[a-z]+\.[a-z]+\/([^?#\s]*)/i);
    if (m) {
      const segments = m[1].split("/").filter(Boolean);
      // Only skip a route prefix when something follows it, so an account
      // literally named "c" or "in" still resolves.
      while (segments.length > 1 && routePrefixes.has(segments[0].toLowerCase())) {
        segments.shift();
      }
      const first = (segments[0] ?? "").replace(/^@/, "");
      handle = first.match(/^[A-Za-z0-9_.-]+/)?.[0] ?? "";
    }
    if (!handle) {
      handle = raw;
    }
    handle = handle.toLowerCase().replace(/\/+$/, "");
    if (handle) {
      out.add(`${platform}:${handle}`);
    }
  }
  return out;
}
|
|
190
|
+
/** Composite identifier of an expert record: "<source>:<sourceId>". */
var key = (e) => `${e.source}:${e.sourceId}`;
/**
 * Group expert records that share at least one social handle.
 *
 * Uses a union-find over record keys; the representative of each group is the
 * member key that compares smallest among the roots merged.
 *
 * @param {Iterable<object>} experts
 * @returns {Map<string, string>} Record key -> representative key of its group.
 */
function clusterPersons(experts) {
  const parent = new Map();
  const rootOf = (node) => {
    let root = node;
    while (parent.get(root) !== root) {
      root = parent.get(root);
    }
    // Second pass: point every node on the walked path straight at the root.
    let cursor = node;
    while (parent.get(cursor) !== root) {
      const next = parent.get(cursor);
      parent.set(cursor, root);
      cursor = next;
    }
    return root;
  };
  const merge = (a, b) => {
    const rootA = rootOf(a);
    const rootB = rootOf(b);
    if (rootA === rootB) {
      return;
    }
    // The smaller root survives so the representative is deterministic.
    const [keep, absorb] = rootA < rootB ? [rootA, rootB] : [rootB, rootA];
    parent.set(absorb, keep);
  };
  for (const e of experts) {
    const k = key(e);
    parent.set(k, k);
  }
  const byHandle = new Map();
  for (const e of experts) {
    const k = key(e);
    for (const h of socialHandles(e)) {
      const bucket = byHandle.get(h);
      if (bucket) {
        bucket.push(k);
      } else {
        byHandle.set(h, [k]);
      }
    }
  }
  for (const [first, ...others] of byHandle.values()) {
    for (const other of others) {
      merge(first, other);
    }
  }
  const out = new Map();
  for (const e of experts) {
    const k = key(e);
    out.set(k, rootOf(k));
  }
  return out;
}
|
|
234
|
+
|
|
35
235
|
// src/db.ts
|
|
36
236
|
function defaultDbPath() {
|
|
37
237
|
return process.env.OPEN_EXPERTS_DB || join(homedir(), ".hasna", "experts", "experts.db");
|
|
@@ -115,8 +315,59 @@ class ExpertsDB {
|
|
|
115
315
|
PRIMARY KEY (source, tweet_id)
|
|
116
316
|
);
|
|
117
317
|
CREATE INDEX IF NOT EXISTS idx_tweets_expert ON tweets(source, source_id, created_at DESC);
|
|
318
|
+
|
|
319
|
+
-- Enrichment: recent YouTube videos per expert.
|
|
320
|
+
CREATE TABLE IF NOT EXISTS videos (
|
|
321
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL,
|
|
322
|
+
video_id TEXT NOT NULL, title TEXT, description TEXT,
|
|
323
|
+
published_at TEXT, url TEXT, thumbnail TEXT, view_count INTEGER,
|
|
324
|
+
PRIMARY KEY (source, video_id)
|
|
325
|
+
);
|
|
326
|
+
CREATE INDEX IF NOT EXISTS idx_videos_expert ON videos(source, source_id, published_at DESC);
|
|
327
|
+
|
|
328
|
+
-- Generic external enrichment (linkedin, site/newsletter, \u2026) as JSON.
|
|
329
|
+
CREATE TABLE IF NOT EXISTS ext_profiles (
|
|
330
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL, kind TEXT NOT NULL,
|
|
331
|
+
data TEXT, enriched_at TEXT,
|
|
332
|
+
PRIMARY KEY (source, source_id, kind)
|
|
333
|
+
);
|
|
334
|
+
|
|
335
|
+
-- Enrichment: discovered contact methods (multiple email/phone per expert).
|
|
336
|
+
CREATE TABLE IF NOT EXISTS contacts (
|
|
337
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL,
|
|
338
|
+
type TEXT NOT NULL, value TEXT NOT NULL,
|
|
339
|
+
label TEXT, provider TEXT, confidence REAL,
|
|
340
|
+
status TEXT DEFAULT 'unverified', verified_at TEXT, created_at TEXT,
|
|
341
|
+
PRIMARY KEY (source, source_id, type, value)
|
|
342
|
+
);
|
|
343
|
+
CREATE INDEX IF NOT EXISTS idx_contacts_expert ON contacts(source, source_id);
|
|
344
|
+
CREATE INDEX IF NOT EXISTS idx_contacts_status ON contacts(status);
|
|
345
|
+
|
|
346
|
+
-- Semantic search: one embedding vector per expert.
|
|
347
|
+
CREATE TABLE IF NOT EXISTS vectors (
|
|
348
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL,
|
|
349
|
+
embedder TEXT NOT NULL, dim INTEGER, vec BLOB,
|
|
350
|
+
PRIMARY KEY (source, source_id)
|
|
351
|
+
);
|
|
352
|
+
|
|
353
|
+
-- Identity resolution: maps each expert record to a canonical person.
|
|
354
|
+
CREATE TABLE IF NOT EXISTS persons (
|
|
355
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL, person_id TEXT NOT NULL,
|
|
356
|
+
PRIMARY KEY (source, source_id)
|
|
357
|
+
);
|
|
358
|
+
CREATE INDEX IF NOT EXISTS idx_persons_person ON persons(person_id);
|
|
359
|
+
|
|
360
|
+
-- Change detection: a log of what changed between crawls.
|
|
361
|
+
CREATE TABLE IF NOT EXISTS changes (
|
|
362
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
363
|
+
source TEXT NOT NULL, source_id TEXT NOT NULL,
|
|
364
|
+
kind TEXT NOT NULL, field TEXT, old_value TEXT, new_value TEXT,
|
|
365
|
+
detected_at TEXT
|
|
366
|
+
);
|
|
367
|
+
CREATE INDEX IF NOT EXISTS idx_changes_time ON changes(detected_at DESC);
|
|
118
368
|
`);
|
|
119
369
|
this.addColumnIfMissing("experts", "avatar_local", "TEXT");
|
|
370
|
+
this.addColumnIfMissing("experts", "authority", "REAL DEFAULT 0");
|
|
120
371
|
}
|
|
121
372
|
addColumnIfMissing(table, column, type) {
|
|
122
373
|
const cols = this.db.query(`PRAGMA table_info(${table})`).all();
|
|
@@ -202,6 +453,7 @@ class ExpertsDB {
|
|
|
202
453
|
socials: JSON.parse(r.socials || "{}"),
|
|
203
454
|
extra: JSON.parse(r.extra || "{}"),
|
|
204
455
|
avatarLocal: r.avatar_local || undefined,
|
|
456
|
+
authority: r.authority ?? 0,
|
|
205
457
|
crawledAt: r.crawled_at
|
|
206
458
|
};
|
|
207
459
|
}
|
|
@@ -249,7 +501,7 @@ class ExpertsDB {
|
|
|
249
501
|
where.push("rating >= ?");
|
|
250
502
|
params.push(filters.minRating);
|
|
251
503
|
}
|
|
252
|
-
const sortCol = filters.sort === "price" ? "price" : filters.sort === "name" ? "full_name" : filters.sort === "reviews" ? "rating_count" : "rating";
|
|
504
|
+
const sortCol = filters.sort === "price" ? "price" : filters.sort === "name" ? "full_name" : filters.sort === "reviews" ? "rating_count" : filters.sort === "authority" ? "authority" : "rating";
|
|
253
505
|
const defaultAsc = filters.sort === "name";
|
|
254
506
|
const dir = filters.desc ?? !defaultAsc ? "DESC" : "ASC";
|
|
255
507
|
let sql = "SELECT * FROM experts";
|
|
@@ -325,11 +577,11 @@ class ExpertsDB {
|
|
|
325
577
|
sql += " ORDER BY name";
|
|
326
578
|
return this.db.query(sql).all(...params);
|
|
327
579
|
}
|
|
328
|
-
setMeta(
|
|
329
|
-
this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(
|
|
580
|
+
setMeta(key2, value) {
|
|
581
|
+
this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(key2, value);
|
|
330
582
|
}
|
|
331
|
-
getMeta(
|
|
332
|
-
const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(
|
|
583
|
+
getMeta(key2) {
|
|
584
|
+
const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(key2);
|
|
333
585
|
return row ? row.value : null;
|
|
334
586
|
}
|
|
335
587
|
stats(source) {
|
|
@@ -353,12 +605,12 @@ class ExpertsDB {
|
|
|
353
605
|
const nodeIds = new Map;
|
|
354
606
|
const insertNode = this.db.query("INSERT INTO kg_nodes (type, key, label) VALUES (?, ?, ?) ON CONFLICT(type, key) DO UPDATE SET label=excluded.label RETURNING id");
|
|
355
607
|
const insertEdge = this.db.query("INSERT OR REPLACE INTO kg_edges (src, dst, rel, weight) VALUES (?, ?, ?, ?)");
|
|
356
|
-
const node = (type,
|
|
357
|
-
const ck = `${type}\x00${
|
|
608
|
+
const node = (type, key2, label) => {
|
|
609
|
+
const ck = `${type}\x00${key2.toLowerCase()}`;
|
|
358
610
|
const cached = nodeIds.get(ck);
|
|
359
611
|
if (cached != null)
|
|
360
612
|
return cached;
|
|
361
|
-
const id = insertNode.get(type,
|
|
613
|
+
const id = insertNode.get(type, key2.toLowerCase(), label).id;
|
|
362
614
|
nodeIds.set(ck, id);
|
|
363
615
|
return id;
|
|
364
616
|
};
|
|
@@ -368,7 +620,8 @@ class ExpertsDB {
|
|
|
368
620
|
for (const topic of e.topics) {
|
|
369
621
|
insertEdge.run(eId, node("topic", topic, topic), "IN_TOPIC", 1);
|
|
370
622
|
}
|
|
371
|
-
const
|
|
623
|
+
const tweetText = this.recentTweets(e.source, e.sourceId, 30).map((t) => t.text).join(". ");
|
|
624
|
+
const tags = inferTags(expertText(e) + ". " + tweetText, vocabulary);
|
|
372
625
|
for (const tag of tags) {
|
|
373
626
|
insertEdge.run(eId, node("tag", tag, tag), "HAS_TAG", 1);
|
|
374
627
|
}
|
|
@@ -381,11 +634,51 @@ class ExpertsDB {
|
|
|
381
634
|
this.setMeta("graph_built", new Date().toISOString());
|
|
382
635
|
return { nodes, edges };
|
|
383
636
|
}
|
|
384
|
-
|
|
385
|
-
const
|
|
637
|
+
rescore(source) {
|
|
638
|
+
const experts = this.list({ source });
|
|
639
|
+
const upd = this.db.query("UPDATE experts SET authority = ? WHERE source = ? AND source_id = ?");
|
|
640
|
+
const followerStmt = this.db.query("SELECT followers FROM x_profiles WHERE source = ? AND source_id = ?");
|
|
641
|
+
const lastTweetStmt = this.db.query("SELECT MAX(created_at) AS t FROM tweets WHERE source = ? AND source_id = ?");
|
|
642
|
+
const tx = this.db.transaction((rows) => {
|
|
643
|
+
for (const e of rows) {
|
|
644
|
+
const fr = followerStmt.get(e.source, e.sourceId);
|
|
645
|
+
const lt = lastTweetStmt.get(e.source, e.sourceId);
|
|
646
|
+
let daysSince;
|
|
647
|
+
if (lt?.t) {
|
|
648
|
+
const ms = Date.now() - Date.parse(lt.t);
|
|
649
|
+
if (!Number.isNaN(ms))
|
|
650
|
+
daysSince = ms / 86400000;
|
|
651
|
+
}
|
|
652
|
+
const score = authorityScore(e, { followers: fr?.followers ?? 0, daysSinceLastTweet: daysSince });
|
|
653
|
+
upd.run(score, e.source, e.sourceId);
|
|
654
|
+
}
|
|
655
|
+
});
|
|
656
|
+
tx(experts);
|
|
657
|
+
this.setMeta("rescored_at", new Date().toISOString());
|
|
658
|
+
return experts.length;
|
|
659
|
+
}
|
|
660
|
+
stalest(opts = {}) {
|
|
661
|
+
const where = opts.source ? "WHERE e.source = ?" : "";
|
|
662
|
+
const params = opts.source ? [opts.source] : [];
|
|
663
|
+
const sql = `
|
|
664
|
+
SELECT e.*, COALESCE(
|
|
665
|
+
(SELECT MAX(enriched_at) FROM x_profiles xp WHERE xp.source=e.source AND xp.source_id=e.source_id),
|
|
666
|
+
e.crawled_at
|
|
667
|
+
) AS last_seen
|
|
668
|
+
FROM experts e ${where}
|
|
669
|
+
ORDER BY last_seen ASC
|
|
670
|
+
LIMIT ?`;
|
|
671
|
+
params.push(opts.limit ?? 25);
|
|
672
|
+
return this.db.query(sql).all(...params).map((r) => ({
|
|
673
|
+
expert: this.rowToExpert(r),
|
|
674
|
+
lastSeen: r.last_seen || ""
|
|
675
|
+
}));
|
|
676
|
+
}
|
|
677
|
+
expertFromNodeKey(key2) {
|
|
678
|
+
const idx = key2.indexOf(":");
|
|
386
679
|
if (idx < 0)
|
|
387
680
|
return null;
|
|
388
|
-
return this.get(
|
|
681
|
+
return this.get(key2.slice(idx + 1), key2.slice(0, idx));
|
|
389
682
|
}
|
|
390
683
|
findByNeeds(needs, opts = {}) {
|
|
391
684
|
const cleaned = needs.map((n) => n.trim().toLowerCase()).filter(Boolean);
|
|
@@ -574,6 +867,270 @@ class ExpertsDB {
|
|
|
574
867
|
const avatars = this.db.query(`SELECT COUNT(*) AS n FROM experts WHERE avatar_local IS NOT NULL${source ? " AND source = ?" : ""}`).get(...args).n;
|
|
575
868
|
return { withHandle, enriched, tweets, avatars };
|
|
576
869
|
}
|
|
870
|
+
recordChanges(source, incoming) {
|
|
871
|
+
const existing = new Map(this.list({ source }).map((e) => [e.sourceId, e]));
|
|
872
|
+
const now = new Date().toISOString();
|
|
873
|
+
const watched = ["price", "title", "headline", "bio", "slug"];
|
|
874
|
+
const stmt = this.db.query("INSERT INTO changes (source, source_id, kind, field, old_value, new_value, detected_at) VALUES (?, ?, ?, ?, ?, ?, ?)");
|
|
875
|
+
let count = 0;
|
|
876
|
+
const tx = this.db.transaction((rows) => {
|
|
877
|
+
for (const e of rows) {
|
|
878
|
+
const prev = existing.get(e.sourceId);
|
|
879
|
+
if (!prev) {
|
|
880
|
+
stmt.run(source, e.sourceId, "added", null, null, e.fullName || e.slug, now);
|
|
881
|
+
count++;
|
|
882
|
+
continue;
|
|
883
|
+
}
|
|
884
|
+
for (const f of watched) {
|
|
885
|
+
const a = String(prev[f] ?? "");
|
|
886
|
+
const b = String(e[f] ?? "");
|
|
887
|
+
if (a !== b) {
|
|
888
|
+
stmt.run(source, e.sourceId, "updated", f, a, b, now);
|
|
889
|
+
count++;
|
|
890
|
+
}
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
});
|
|
894
|
+
tx(incoming);
|
|
895
|
+
return count;
|
|
896
|
+
}
|
|
897
|
+
changes(opts = {}) {
|
|
898
|
+
const where = opts.source ? "WHERE source = ?" : "";
|
|
899
|
+
const params = opts.source ? [opts.source] : [];
|
|
900
|
+
params.push(opts.limit ?? 50);
|
|
901
|
+
return this.db.query(`SELECT * FROM changes ${where} ORDER BY detected_at DESC, id DESC LIMIT ?`).all(...params);
|
|
902
|
+
}
|
|
903
|
+
rebuildPersons() {
|
|
904
|
+
const experts = this.list();
|
|
905
|
+
const mapping = clusterPersons(experts);
|
|
906
|
+
const tx = this.db.transaction(() => {
|
|
907
|
+
this.db.exec("DELETE FROM persons");
|
|
908
|
+
const stmt = this.db.query("INSERT OR REPLACE INTO persons (source, source_id, person_id) VALUES (?, ?, ?)");
|
|
909
|
+
for (const [k, pid] of mapping) {
|
|
910
|
+
const idx = k.indexOf(":");
|
|
911
|
+
stmt.run(k.slice(0, idx), k.slice(idx + 1), pid);
|
|
912
|
+
}
|
|
913
|
+
});
|
|
914
|
+
tx();
|
|
915
|
+
const persons = new Set(mapping.values()).size;
|
|
916
|
+
this.setMeta("persons_built", new Date().toISOString());
|
|
917
|
+
return { experts: experts.length, persons };
|
|
918
|
+
}
|
|
919
|
+
personIdOf(source, sourceId) {
|
|
920
|
+
const r = this.db.query("SELECT person_id FROM persons WHERE source = ? AND source_id = ?").get(source, sourceId);
|
|
921
|
+
return r ? r.person_id : `${source}:${sourceId}`;
|
|
922
|
+
}
|
|
923
|
+
expertsForPerson(personId) {
|
|
924
|
+
const rows = this.db.query("SELECT e.* FROM persons p JOIN experts e ON e.source=p.source AND e.source_id=p.source_id WHERE p.person_id = ?").all(personId);
|
|
925
|
+
return rows.map((r) => this.rowToExpert(r));
|
|
926
|
+
}
|
|
927
|
+
personStats() {
|
|
928
|
+
const experts = this.count();
|
|
929
|
+
const row = this.db.query("SELECT COUNT(DISTINCT person_id) n FROM persons").get();
|
|
930
|
+
const persons = row?.n ?? 0;
|
|
931
|
+
const dupes = this.db.query("SELECT person_id, COUNT(*) c FROM persons GROUP BY person_id HAVING c > 1 ORDER BY c DESC").all();
|
|
932
|
+
return { experts, persons: persons || experts, duplicated: dupes.length };
|
|
933
|
+
}
|
|
934
|
+
async buildEmbeddings(embedder, opts = {}) {
|
|
935
|
+
const log = opts.onLog ?? (() => {});
|
|
936
|
+
const experts = this.list({ source: opts.source });
|
|
937
|
+
const batch = opts.batch ?? 64;
|
|
938
|
+
const stmt = this.db.query("INSERT OR REPLACE INTO vectors (source, source_id, embedder, dim, vec) VALUES (?, ?, ?, ?, ?)");
|
|
939
|
+
let done = 0;
|
|
940
|
+
for (let i = 0;i < experts.length; i += batch) {
|
|
941
|
+
const slice = experts.slice(i, i + batch);
|
|
942
|
+
const vecs = await embedder.embed(slice.map((e) => expertEmbedText(e)));
|
|
943
|
+
const tx = this.db.transaction(() => {
|
|
944
|
+
slice.forEach((e, j) => stmt.run(e.source, e.sourceId, embedder.id, embedder.dim, packVector(vecs[j])));
|
|
945
|
+
});
|
|
946
|
+
tx();
|
|
947
|
+
done += slice.length;
|
|
948
|
+
log(` embedded ${done}/${experts.length}`);
|
|
949
|
+
}
|
|
950
|
+
this.setMeta("embedder", embedder.id);
|
|
951
|
+
this.setMeta("embedded_at", new Date().toISOString());
|
|
952
|
+
return done;
|
|
953
|
+
}
|
|
954
|
+
vectorCount() {
|
|
955
|
+
return this.db.query("SELECT COUNT(*) n FROM vectors").get().n;
|
|
956
|
+
}
|
|
957
|
+
semanticSearch(queryVec, opts = {}) {
|
|
958
|
+
const where = opts.source ? "WHERE v.source = ?" : "";
|
|
959
|
+
const params = opts.source ? [opts.source] : [];
|
|
960
|
+
const rows = this.db.query(`SELECT e.*, v.vec AS _vec FROM vectors v JOIN experts e ON e.source=v.source AND e.source_id=v.source_id ${where}`).all(...params);
|
|
961
|
+
const scored = rows.map((r) => ({ expert: this.rowToExpert(r), score: cosine(queryVec, unpackVector(r._vec)) }));
|
|
962
|
+
scored.sort((a, b) => b.score - a.score);
|
|
963
|
+
return scored.slice(0, opts.limit ?? 25);
|
|
964
|
+
}
|
|
965
|
+
upsertExt(source, sourceId, kind, data) {
|
|
966
|
+
this.db.query(`
|
|
967
|
+
INSERT INTO ext_profiles (source, source_id, kind, data, enriched_at)
|
|
968
|
+
VALUES (?, ?, ?, ?, ?)
|
|
969
|
+
ON CONFLICT(source, source_id, kind) DO UPDATE SET data=excluded.data, enriched_at=excluded.enriched_at
|
|
970
|
+
`).run(source, sourceId, kind, JSON.stringify(data), new Date().toISOString());
|
|
971
|
+
}
|
|
972
|
+
getExt(source, sourceId, kind) {
|
|
973
|
+
const r = this.db.query("SELECT data FROM ext_profiles WHERE source=? AND source_id=? AND kind=?").get(source, sourceId, kind);
|
|
974
|
+
return r ? JSON.parse(r.data || "{}") : null;
|
|
975
|
+
}
|
|
976
|
+
allExt(source, sourceId) {
|
|
977
|
+
const rows = this.db.query("SELECT kind, data FROM ext_profiles WHERE source=? AND source_id=?").all(source, sourceId);
|
|
978
|
+
return Object.fromEntries(rows.map((r) => [r.kind, JSON.parse(r.data || "{}")]));
|
|
979
|
+
}
|
|
980
|
+
expertsNeedingExt(platform, kind, opts = {}) {
|
|
981
|
+
const where = [`json_extract(socials, '$.${platform}') IS NOT NULL`];
|
|
982
|
+
const bind = [];
|
|
983
|
+
if (opts.source) {
|
|
984
|
+
where.push("source = ?");
|
|
985
|
+
bind.push(opts.source);
|
|
986
|
+
}
|
|
987
|
+
if (!opts.refresh) {
|
|
988
|
+
where.push("NOT EXISTS (SELECT 1 FROM ext_profiles x WHERE x.source=experts.source AND x.source_id=experts.source_id AND x.kind=?)");
|
|
989
|
+
bind.push(kind);
|
|
990
|
+
}
|
|
991
|
+
let sql = "SELECT * FROM experts WHERE " + where.join(" AND ") + " ORDER BY rating_count DESC";
|
|
992
|
+
if (opts.limit) {
|
|
993
|
+
sql += " LIMIT ?";
|
|
994
|
+
bind.push(opts.limit);
|
|
995
|
+
}
|
|
996
|
+
return this.db.query(sql).all(...bind).map((r) => this.rowToExpert(r));
|
|
997
|
+
}
|
|
998
|
+
replaceVideos(source, sourceId, videos) {
|
|
999
|
+
const tx = this.db.transaction((rows) => {
|
|
1000
|
+
this.db.query("DELETE FROM videos WHERE source = ? AND source_id = ?").run(source, sourceId);
|
|
1001
|
+
const stmt = this.db.query("INSERT OR REPLACE INTO videos (source, source_id, video_id, title, description, published_at, url, thumbnail, view_count) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)");
|
|
1002
|
+
for (const v of rows) {
|
|
1003
|
+
stmt.run(v.source, v.sourceId, v.videoId, v.title, v.description, v.publishedAt, v.url, v.thumbnail, v.viewCount);
|
|
1004
|
+
}
|
|
1005
|
+
});
|
|
1006
|
+
tx(videos);
|
|
1007
|
+
}
|
|
1008
|
+
recentVideos(source, sourceId, limit = 10) {
|
|
1009
|
+
const rows = this.db.query("SELECT * FROM videos WHERE source = ? AND source_id = ? ORDER BY published_at DESC LIMIT ?").all(source, sourceId, limit);
|
|
1010
|
+
return rows.map((r) => ({
|
|
1011
|
+
source: r.source,
|
|
1012
|
+
sourceId: r.source_id,
|
|
1013
|
+
videoId: r.video_id,
|
|
1014
|
+
title: r.title || "",
|
|
1015
|
+
description: r.description || "",
|
|
1016
|
+
publishedAt: r.published_at || "",
|
|
1017
|
+
url: r.url || "",
|
|
1018
|
+
thumbnail: r.thumbnail || "",
|
|
1019
|
+
viewCount: r.view_count ?? 0
|
|
1020
|
+
}));
|
|
1021
|
+
}
|
|
1022
|
+
expertsNeedingVideos(opts = {}) {
|
|
1023
|
+
const where = ["json_extract(socials, '$.youtube') IS NOT NULL"];
|
|
1024
|
+
const params = [];
|
|
1025
|
+
if (opts.source) {
|
|
1026
|
+
where.push("source = ?");
|
|
1027
|
+
params.push(opts.source);
|
|
1028
|
+
}
|
|
1029
|
+
if (!opts.refresh) {
|
|
1030
|
+
where.push("NOT EXISTS (SELECT 1 FROM videos v WHERE v.source=experts.source AND v.source_id=experts.source_id)");
|
|
1031
|
+
}
|
|
1032
|
+
let sql = "SELECT * FROM experts WHERE " + where.join(" AND ") + " ORDER BY rating_count DESC";
|
|
1033
|
+
if (opts.limit) {
|
|
1034
|
+
sql += " LIMIT ?";
|
|
1035
|
+
params.push(opts.limit);
|
|
1036
|
+
}
|
|
1037
|
+
return this.db.query(sql).all(...params).map((r) => this.rowToExpert(r));
|
|
1038
|
+
}
|
|
1039
|
+
upsertContact(c) {
|
|
1040
|
+
this.db.query(`
|
|
1041
|
+
INSERT INTO contacts (source, source_id, type, value, label, provider, confidence, status, verified_at, created_at)
|
|
1042
|
+
VALUES ($source, $source_id, $type, $value, $label, $provider, $confidence, $status, $verified_at, $created_at)
|
|
1043
|
+
ON CONFLICT(source, source_id, type, value) DO UPDATE SET
|
|
1044
|
+
label=excluded.label, provider=excluded.provider, confidence=excluded.confidence,
|
|
1045
|
+
status=CASE WHEN excluded.status != 'unverified' THEN excluded.status ELSE contacts.status END,
|
|
1046
|
+
verified_at=COALESCE(excluded.verified_at, contacts.verified_at)
|
|
1047
|
+
`).run({
|
|
1048
|
+
$source: c.source,
|
|
1049
|
+
$source_id: c.sourceId,
|
|
1050
|
+
$type: c.type,
|
|
1051
|
+
$value: c.value,
|
|
1052
|
+
$label: c.label,
|
|
1053
|
+
$provider: c.provider,
|
|
1054
|
+
$confidence: c.confidence,
|
|
1055
|
+
$status: c.status,
|
|
1056
|
+
$verified_at: c.verifiedAt || null,
|
|
1057
|
+
$created_at: c.createdAt || new Date().toISOString()
|
|
1058
|
+
});
|
|
1059
|
+
}
|
|
1060
|
+
setContactStatus(source, sourceId, type, value, status) {
|
|
1061
|
+
this.db.query("UPDATE contacts SET status = ?, verified_at = ? WHERE source = ? AND source_id = ? AND type = ? AND value = ?").run(status, new Date().toISOString(), source, sourceId, type, value);
|
|
1062
|
+
}
|
|
1063
|
+
contacts(source, sourceId) {
|
|
1064
|
+
const rows = this.db.query("SELECT * FROM contacts WHERE source = ? AND source_id = ? ORDER BY type, confidence DESC").all(source, sourceId);
|
|
1065
|
+
return rows.map((r) => ({
|
|
1066
|
+
source: r.source,
|
|
1067
|
+
sourceId: r.source_id,
|
|
1068
|
+
type: r.type,
|
|
1069
|
+
value: r.value,
|
|
1070
|
+
label: r.label || "",
|
|
1071
|
+
provider: r.provider || "",
|
|
1072
|
+
confidence: r.confidence ?? 0,
|
|
1073
|
+
status: r.status || "unverified",
|
|
1074
|
+
verifiedAt: r.verified_at || "",
|
|
1075
|
+
createdAt: r.created_at || ""
|
|
1076
|
+
}));
|
|
1077
|
+
}
|
|
1078
|
+
contactsToVerify(opts = {}) {
|
|
1079
|
+
const where = ["status = 'unverified'"];
|
|
1080
|
+
const params = [];
|
|
1081
|
+
if (opts.source) {
|
|
1082
|
+
where.push("source = ?");
|
|
1083
|
+
params.push(opts.source);
|
|
1084
|
+
}
|
|
1085
|
+
let sql = "SELECT * FROM contacts WHERE " + where.join(" AND ") + " ORDER BY confidence DESC";
|
|
1086
|
+
if (opts.limit) {
|
|
1087
|
+
sql += " LIMIT ?";
|
|
1088
|
+
params.push(opts.limit);
|
|
1089
|
+
}
|
|
1090
|
+
return this.db.query(sql).all(...params).map((r) => ({
|
|
1091
|
+
source: r.source,
|
|
1092
|
+
sourceId: r.source_id,
|
|
1093
|
+
type: r.type,
|
|
1094
|
+
value: r.value,
|
|
1095
|
+
label: r.label || "",
|
|
1096
|
+
provider: r.provider || "",
|
|
1097
|
+
confidence: r.confidence ?? 0,
|
|
1098
|
+
status: r.status || "unverified",
|
|
1099
|
+
verifiedAt: r.verified_at || "",
|
|
1100
|
+
createdAt: r.created_at || ""
|
|
1101
|
+
}));
|
|
1102
|
+
}
|
|
1103
|
+
expertsNeedingContacts(opts = {}) {
|
|
1104
|
+
const where = [];
|
|
1105
|
+
const params = [];
|
|
1106
|
+
if (opts.source) {
|
|
1107
|
+
where.push("source = ?");
|
|
1108
|
+
params.push(opts.source);
|
|
1109
|
+
}
|
|
1110
|
+
if (!opts.refresh) {
|
|
1111
|
+
where.push("NOT EXISTS (SELECT 1 FROM contacts c WHERE c.source = experts.source AND c.source_id = experts.source_id)");
|
|
1112
|
+
}
|
|
1113
|
+
let sql = "SELECT * FROM experts";
|
|
1114
|
+
if (where.length)
|
|
1115
|
+
sql += " WHERE " + where.join(" AND ");
|
|
1116
|
+
sql += " ORDER BY rating_count DESC";
|
|
1117
|
+
if (opts.limit) {
|
|
1118
|
+
sql += " LIMIT ?";
|
|
1119
|
+
params.push(opts.limit);
|
|
1120
|
+
}
|
|
1121
|
+
return this.db.query(sql).all(...params).map((r) => this.rowToExpert(r));
|
|
1122
|
+
}
|
|
1123
|
+
contactStats(source) {
|
|
1124
|
+
const filt = source ? " WHERE source = ?" : "";
|
|
1125
|
+
const args = source ? [source] : [];
|
|
1126
|
+
const total = this.db.query(`SELECT COUNT(*) n FROM contacts${filt}`).get(...args).n;
|
|
1127
|
+
const valid = this.db.query(`SELECT COUNT(*) n FROM contacts WHERE status='valid'${source ? " AND source = ?" : ""}`).get(...args).n;
|
|
1128
|
+
const invalid = this.db.query(`SELECT COUNT(*) n FROM contacts WHERE status='invalid'${source ? " AND source = ?" : ""}`).get(...args).n;
|
|
1129
|
+
const emails = this.db.query(`SELECT COUNT(*) n FROM contacts WHERE type='email'${source ? " AND source = ?" : ""}`).get(...args).n;
|
|
1130
|
+
const phones = this.db.query(`SELECT COUNT(*) n FROM contacts WHERE type='phone'${source ? " AND source = ?" : ""}`).get(...args).n;
|
|
1131
|
+
const expertsWith = this.db.query(`SELECT COUNT(DISTINCT source||source_id) n FROM contacts${filt}`).get(...args).n;
|
|
1132
|
+
return { total, valid, invalid, emails, phones, expertsWith };
|
|
1133
|
+
}
|
|
577
1134
|
close() {
|
|
578
1135
|
this.db.close();
|
|
579
1136
|
}
|
|
@@ -792,62 +1349,393 @@ class IntroSource {
|
|
|
792
1349
|
}
|
|
793
1350
|
}
|
|
794
1351
|
|
|
795
|
-
// src/sources/
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
1352
|
+
// src/sources/common.ts
|
|
1353
|
+
/**
 * Build a fully-populated expert record from a partial one, filling every
 * missing field with a neutral default.
 *
 * @param {object} p - Partial expert; `source` and `sourceId` are copied as-is.
 * @returns {object} Expert record with all fields present. `topics` is
 *   de-duplicated and sorted; `crawledAt` defaults to the current time.
 */
function makeExpert(p) {
  const first = p.firstName ?? "";
  const last = p.lastName ?? "";
  // Fall back to "first last" when no usable full name was supplied. An empty
  // string counts as "not supplied", otherwise the record ends up nameless
  // even though first/last name are known.
  const derivedName = [first, last].filter(Boolean).join(" ");
  return {
    source: p.source,
    sourceId: p.sourceId,
    slug: p.slug ?? "",
    url: p.url ?? "",
    fullName: p.fullName || derivedName,
    firstName: first,
    lastName: last,
    title: p.title ?? "",
    headline: p.headline ?? "",
    bio: p.bio ?? "",
    avatar: p.avatar ?? "",
    price: p.price ?? 0,
    priceCurrency: p.priceCurrency ?? "USD",
    priceUnit: p.priceUnit ?? "",
    rating: p.rating ?? 0,
    ratingCount: p.ratingCount ?? 0,
    verified: p.verified ?? false,
    featured: p.featured ?? false,
    topics: p.topics ? [...new Set(p.topics)].sort() : [],
    tags: p.tags ?? [],
    socials: p.socials ?? {},
    extra: p.extra ?? {},
    crawledAt: p.crawledAt ?? new Date().toISOString()
  };
}
|
|
803
|
-
function
|
|
804
|
-
return [
|
|
1382
|
+
/**
 * Turn free text into a lowercase, dash-separated ASCII slug.
 *
 * @param {string} [s] - Input text; falsy input yields "".
 * @returns {string} Slug containing only `a-z`, `0-9` and single dashes,
 *   with no leading or trailing dash.
 */
function slugify(s) {
  return (s || "")
    .toLowerCase()
    .normalize("NFKD")
    // NFKD splits accented letters into base letter + combining mark. Drop the
    // marks so "müller" becomes "muller" rather than "mu-ller".
    .replace(/\p{M}+/gu, "")
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "");
}
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
1385
|
+
/**
 * Best-effort JSON GET helper: never throws, returns `null` on any failure.
 *
 * @param {string} url - Address to request.
 * @param {Function} fetchFn - fetch-compatible function `(url, init)`.
 * @param {object} [init] - Extra request options; `init.headers` entries
 *   override the default `User-Agent` / `Accept` headers.
 * @returns {Promise<any|null>} Parsed body, or `null` when the response is
 *   not ok or anything throws.
 */
async function fetchJson(url, fetchFn, init = {}) {
  try {
    const headers = {
      "User-Agent": "open-experts (+https://github.com/hasna/experts)",
      Accept: "application/json",
      ...init.headers || {}
    };
    const response = await fetchFn(url, { ...init, headers });
    return response.ok ? await response.json() : null;
  } catch {
    // Deliberate: callers treat an unreachable endpoint as "no data".
    return null;
  }
}
|
|
832
1402
|
|
|
833
|
-
// src/
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
const
|
|
837
|
-
|
|
1403
|
+
// src/sources/mentorcruise.ts
|
|
1404
|
+
/**
 * Map one raw MentorCruise mentor object onto the shared expert shape.
 *
 * @param {object} m - Raw mentor payload.
 * @param {string} crawledAt - Timestamp stamped onto the record.
 * @returns {object} Result of `makeExpert` for this mentor.
 */
function normalizeMentor(m, crawledAt) {
  // Prefer the provided slug; otherwise derive one from the name or the id.
  const slug = m.slug ? m.slug : slugify(m.name ? m.name : String(m.id ?? ""));

  const socials = {};
  if (m.twitter) {
    const alreadyUrl = m.twitter.startsWith("http");
    socials.twitter = alreadyUrl ? m.twitter : `https://x.com/${m.twitter}`;
  }
  if (m.linkedin) {
    socials.linkedin = m.linkedin;
  }

  const profile = {
    source: "mentorcruise",
    sourceId: String(m.id ?? slug),
    slug,
    url: `https://mentorcruise.com/mentor/${slug}/`,
    fullName: m.name ?? [m.first_name, m.last_name].filter(Boolean).join(" "),
    firstName: m.first_name ?? "",
    lastName: m.last_name ?? "",
    title: m.job_title ?? "",
    bio: m.bio ?? "",
    avatar: m.avatar ?? m.photo ?? "",
    price: m.price ?? 0,
    priceCurrency: m.currency ?? "USD",
    // The unit is only set when a non-zero price is present.
    priceUnit: m.price ? "per month" : "",
    rating: m.rating ?? 0,
    ratingCount: m.reviews_count ?? 0,
    verified: Boolean(m.verified),
    featured: Boolean(m.is_top_mentor),
    topics: m.categories ?? [],
    tags: m.skills ?? [],
    socials,
    crawledAt
  };
  return makeExpert(profile);
}
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
1435
|
+
|
|
1436
|
+
/**
 * Source adapter that pages through a MentorCruise-style `/mentors/` listing
 * using `limit` / `offset` query parameters.
 */
class MentorCruiseSource {
  name = "mentorcruise";
  description = "MentorCruise \u2014 long-term mentorship from vetted mentors";
  website = "https://mentorcruise.com";
  fetchFn;
  apiBase;
  pageSize;

  /**
   * @param {object} [opts]
   * @param {Function} [opts.fetchFn] - fetch replacement (defaults to global fetch).
   * @param {string} [opts.apiBase] - API root; falls back to
   *   `MENTORCRUISE_API_BASE`, then the public default.
   * @param {number} [opts.pageSize] - Items requested per page (default 50).
   */
  constructor(opts = {}) {
    const { fetchFn, apiBase, pageSize } = opts;
    this.fetchFn = fetchFn ?? fetch;
    this.apiBase = apiBase ?? process.env.MENTORCRUISE_API_BASE ?? "https://mentorcruise.com/api";
    this.pageSize = pageSize ?? 50;
  }

  /**
   * Crawl the listing until a page is empty, shorter than `pageSize`, or
   * `opts.max` experts have been collected.
   *
   * @param {object} [opts]
   * @param {Function} [opts.onLog] - Progress callback.
   * @param {number} [opts.max] - Cap on returned experts.
   * @returns {Promise<{experts:Array, topics:Array, tags:Array, total:number}>}
   *   `total` is the number collected before the `max` cap is applied.
   */
  async crawl(opts = {}) {
    const log = opts.onLog ?? (() => {});
    const crawledAt = new Date().toISOString();
    const experts = [];
    const tagNames = new Set;
    let offset = 0;
    let finished = false;
    while (!finished) {
      const pageUrl = `${this.apiBase}/mentors/?limit=${this.pageSize}&offset=${offset}`;
      const data = await fetchJson(pageUrl, this.fetchFn);
      // Accept `{results}`, `{data}` or a bare array as the page payload.
      const items = data?.results ?? data?.data ?? (Array.isArray(data) ? data : []);
      if (!items.length) {
        break;
      }
      for (const raw of items) {
        const expert = normalizeMentor(raw, crawledAt);
        experts.push(expert);
        for (const tag of expert.tags) {
          tagNames.add(tag);
        }
      }
      offset += items.length;
      log(` mentorcruise: ${experts.length}`);
      const capReached = Boolean(opts.max) && experts.length >= opts.max;
      const lastPage = items.length < this.pageSize;
      finished = capReached || lastPage;
    }
    if (experts.length === 0) {
      log("mentorcruise: no public listing reachable (set MENTORCRUISE_API_BASE or provide a fetchFn).");
    }
    const topics = [];
    const tags = [...tagNames].map((name) => ({ name, topic: "" }));
    return {
      experts: opts.max ? experts.slice(0, opts.max) : experts,
      topics,
      tags,
      total: experts.length
    };
  }
}
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
1484
|
+
|
|
1485
|
+
// src/sources/adplist.ts
|
|
1486
|
+
/**
 * Map one raw ADPList mentor object onto the shared expert shape.
 *
 * @param {object} m - Raw mentor payload.
 * @param {string} crawledAt - Timestamp stamped onto the record.
 * @returns {object} Result of `makeExpert`; price is always 0 with the unit
 *   "free session".
 */
function normalizeAdpMentor(m, crawledAt) {
  // Prefer the username; otherwise derive a slug from a name or the id.
  const slug = m.username ? m.username : slugify(m.name || m.full_name || String(m.id ?? ""));

  const socials = {};
  if (m.twitter) {
    const alreadyUrl = m.twitter.startsWith("http");
    socials.twitter = alreadyUrl ? m.twitter : `https://x.com/${m.twitter}`;
  }
  if (m.linkedin) {
    socials.linkedin = m.linkedin;
  }

  const profile = {
    source: "adplist",
    sourceId: String(m.id ?? slug),
    slug,
    url: `https://adplist.org/mentors/${slug}`,
    fullName: m.name ?? m.full_name ?? "",
    title: m.headline ?? m.tagline ?? "",
    headline: m.tagline ?? "",
    bio: m.bio ?? m.about ?? "",
    avatar: m.profile_photo ?? m.avatar ?? "",
    price: 0,
    priceUnit: "free session",
    rating: m.rating ?? 0,
    ratingCount: m.total_reviews ?? 0,
    verified: Boolean(m.verified),
    featured: Boolean(m.is_featured),
    tags: m.expertise ?? m.skills ?? [],
    socials,
    crawledAt
  };
  return makeExpert(profile);
}
|
|
849
|
-
|
|
850
|
-
|
|
1514
|
+
|
|
1515
|
+
/**
 * Source adapter that pages through an ADPList-style `/mentors/` listing
 * using `page` / `page_size` query parameters.
 */
class ADPListSource {
  name = "adplist";
  description = "ADPList \u2014 free mentorship across design, product & engineering";
  website = "https://adplist.org";
  fetchFn;
  apiBase;
  pageSize;

  /**
   * @param {object} [opts]
   * @param {Function} [opts.fetchFn] - fetch replacement (defaults to global fetch).
   * @param {string} [opts.apiBase] - API root; falls back to
   *   `ADPLIST_API_BASE`, then the public default.
   * @param {number} [opts.pageSize] - Items requested per page (default 50).
   */
  constructor(opts = {}) {
    const { fetchFn, apiBase, pageSize } = opts;
    this.fetchFn = fetchFn ?? fetch;
    this.apiBase = apiBase ?? process.env.ADPLIST_API_BASE ?? "https://api.adplist.org/api";
    this.pageSize = pageSize ?? 50;
  }

  /**
   * Crawl page by page until a page is empty, shorter than `pageSize`, or
   * `opts.max` experts have been collected.
   *
   * @param {object} [opts]
   * @param {Function} [opts.onLog] - Progress callback.
   * @param {number} [opts.max] - Cap on returned experts.
   * @returns {Promise<{experts:Array, topics:Array, tags:Array, total:number}>}
   */
  async crawl(opts = {}) {
    const log = opts.onLog ?? (() => {});
    const crawledAt = new Date().toISOString();
    const experts = [];
    const tagNames = new Set;
    let page = 1;
    let finished = false;
    while (!finished) {
      const pageUrl = `${this.apiBase}/mentors/?page=${page}&page_size=${this.pageSize}`;
      const data = await fetchJson(pageUrl, this.fetchFn);
      // Accept `{results}`, `{data}` or a bare array as the page payload.
      const items = data?.results ?? data?.data ?? (Array.isArray(data) ? data : []);
      if (!items.length) {
        break;
      }
      for (const raw of items) {
        const expert = normalizeAdpMentor(raw, crawledAt);
        experts.push(expert);
        for (const tag of expert.tags) {
          tagNames.add(tag);
        }
      }
      log(` adplist: ${experts.length}`);
      page += 1;
      const capReached = Boolean(opts.max) && experts.length >= opts.max;
      const lastPage = items.length < this.pageSize;
      finished = capReached || lastPage;
    }
    if (experts.length === 0) {
      log("adplist: no public listing reachable (set ADPLIST_API_BASE or provide a fetchFn).");
    }
    const tags = [...tagNames].map((name) => ({ name, topic: "" }));
    return {
      experts: opts.max ? experts.slice(0, opts.max) : experts,
      topics: [],
      tags,
      total: experts.length
    };
  }
}
|
|
1561
|
+
|
|
1562
|
+
// src/sources/clarity.ts
|
|
1563
|
+
/**
 * Map one raw Clarity expert object onto the shared expert shape.
 *
 * @param {object} c - Raw expert payload.
 * @param {string} crawledAt - Timestamp stamped onto the record.
 * @returns {object} Result of `makeExpert`; the price is the per-minute rate.
 */
function normalizeClarityExpert(c, crawledAt) {
  const slug = c.username ? c.username : slugify(c.name || String(c.id ?? ""));
  const perMinute = c.rate_per_minute;
  const profile = {
    source: "clarity",
    sourceId: String(c.id ?? slug),
    slug,
    url: `https://clarity.fm/${slug}`,
    fullName: c.name ?? "",
    title: c.title ?? "",
    bio: c.bio ?? "",
    avatar: c.image ?? "",
    price: perMinute ?? 0,
    priceCurrency: "USD",
    // The unit is only set when a non-zero rate is present.
    priceUnit: perMinute ? "per minute" : "",
    rating: c.rating ?? 0,
    ratingCount: c.reviews ?? 0,
    topics: c.categories ?? [],
    tags: c.expertise ?? [],
    crawledAt
  };
  return makeExpert(profile);
}
|
|
1584
|
+
|
|
1585
|
+
/**
 * Source adapter for Clarity.fm. It only crawls when an API base is supplied
 * via options or `CLARITY_API_BASE`; otherwise it returns an empty result.
 */
class ClaritySource {
  name = "clarity";
  description = "Clarity.fm \u2014 on-demand expert calls billed per minute";
  website = "https://clarity.fm";
  fetchFn;
  apiBase;

  /**
   * @param {object} [opts]
   * @param {Function} [opts.fetchFn] - fetch replacement (defaults to global fetch).
   * @param {string} [opts.apiBase] - API root; falls back to `CLARITY_API_BASE`.
   */
  constructor(opts = {}) {
    const { fetchFn, apiBase } = opts;
    this.fetchFn = fetchFn ?? fetch;
    this.apiBase = apiBase ?? process.env.CLARITY_API_BASE;
  }

  /**
   * Fetch `<apiBase>/experts` once and normalize every item.
   *
   * @param {object} [opts]
   * @param {Function} [opts.onLog] - Progress callback.
   * @param {number} [opts.max] - Cap on returned experts.
   * @returns {Promise<{experts:Array, topics:Array, tags:Array, total:number}>}
   */
  async crawl(opts = {}) {
    const log = opts.onLog ?? (() => {});
    if (!this.apiBase) {
      log("clarity: no public listing API; set CLARITY_API_BASE or inject a fetchFn to crawl.");
      return { experts: [], topics: [], tags: [], total: 0 };
    }
    const crawledAt = new Date().toISOString();
    const data = await fetchJson(`${this.apiBase}/experts`, this.fetchFn);
    // Accept `{results}`, `{data}` or a bare array as the payload.
    const items = data?.results ?? data?.data ?? (Array.isArray(data) ? data : []);
    const experts = items.map((raw) => normalizeClarityExpert(raw, crawledAt));
    // Unique tag names in first-seen order.
    const tagNames = new Set(experts.flatMap((expert) => [...expert.tags]));
    return {
      experts: opts.max ? experts.slice(0, opts.max) : experts,
      topics: [],
      tags: [...tagNames].map((name) => ({ name, topic: "" })),
      total: experts.length
    };
  }
}
|
|
1617
|
+
|
|
1618
|
+
// src/sources/glg.ts
|
|
1619
|
+
/**
 * Map one raw GLG expert object onto the shared expert shape.
 *
 * @param {object} g - Raw expert payload.
 * @param {string} crawledAt - Timestamp stamped onto the record.
 * @returns {object} Result of `makeExpert`; the price is the hourly rate and
 *   the URL is always the GLG site root.
 */
function normalizeGlgExpert(g, crawledAt) {
  const slug = slugify(g.name || String(g.id ?? ""));
  const hourly = g.hourly_rate;
  const profile = {
    source: "glg",
    sourceId: String(g.id ?? slug),
    slug,
    url: "https://glginsights.com",
    fullName: g.name ?? "",
    title: g.title ?? "",
    bio: g.biography ?? "",
    price: hourly ?? 0,
    priceCurrency: g.currency ?? "USD",
    // The unit is only set when a non-zero rate is present.
    priceUnit: hourly ? "per hour" : "",
    topics: g.industries ?? [],
    tags: g.expertise_areas ?? [],
    crawledAt
  };
  return makeExpert(profile);
}
|
|
1637
|
+
|
|
1638
|
+
/**
 * Source adapter for GLG. It only crawls when an API base is supplied via
 * options or `GLG_API_BASE`; an API key, when present, is sent as a Bearer
 * token.
 */
class GLGSource {
  name = "glg";
  description = "GLG \u2014 enterprise expert network (requires partner API access)";
  website = "https://glginsights.com";
  fetchFn;
  apiBase;
  apiKey;

  /**
   * @param {object} [opts]
   * @param {Function} [opts.fetchFn] - fetch replacement (defaults to global fetch).
   * @param {string} [opts.apiBase] - API root; falls back to `GLG_API_BASE`.
   * @param {string} [opts.apiKey] - API key; falls back to `GLG_API_KEY`.
   */
  constructor(opts = {}) {
    const { fetchFn, apiBase, apiKey } = opts;
    this.fetchFn = fetchFn ?? fetch;
    this.apiBase = apiBase ?? process.env.GLG_API_BASE;
    this.apiKey = apiKey ?? process.env.GLG_API_KEY;
  }

  /**
   * Fetch `<apiBase>/experts` once and normalize every item.
   *
   * @param {object} [opts]
   * @param {Function} [opts.onLog] - Progress callback.
   * @param {number} [opts.max] - Cap on returned experts.
   * @returns {Promise<{experts:Array, topics:Array, tags:Array, total:number}>}
   */
  async crawl(opts = {}) {
    const log = opts.onLog ?? (() => {});
    if (!this.apiBase) {
      log("glg: enterprise-gated; no public directory. Set GLG_API_BASE + GLG_API_KEY (partner access) to crawl.");
      return { experts: [], topics: [], tags: [], total: 0 };
    }
    const crawledAt = new Date().toISOString();
    const headers = this.apiKey ? { Authorization: `Bearer ${this.apiKey}` } : {};
    const data = await fetchJson(`${this.apiBase}/experts`, this.fetchFn, { headers });
    // Accept `{results}`, `{data}` or a bare array as the payload.
    const items = data?.results ?? data?.data ?? (Array.isArray(data) ? data : []);
    const experts = items.map((raw) => normalizeGlgExpert(raw, crawledAt));
    // Unique tag names in first-seen order.
    const tagNames = new Set(experts.flatMap((expert) => [...expert.tags]));
    return {
      experts: opts.max ? experts.slice(0, opts.max) : experts,
      topics: [],
      tags: [...tagNames].map((name) => ({ name, topic: "" })),
      total: experts.length
    };
  }
}
|
|
1674
|
+
|
|
1675
|
+
// src/sources/index.ts
|
|
1676
|
+
// Module-level registry of source adapters, keyed by `source.name`.
var registry = new Map();

/**
 * Register a source adapter; a later registration with the same name
 * replaces the earlier one.
 * @param {{name: string}} source
 */
function registerSource(source) {
  const { name } = source;
  registry.set(name, source);
}

/**
 * Look up a registered source.
 * @param {string} name
 * @returns {object|undefined} The source, or `undefined` when unknown.
 */
function getSource(name) {
  const found = registry.get(name);
  return found;
}

/**
 * @returns {Array} All registered sources in registration order.
 */
function listSources() {
  return Array.from(registry.values());
}
|
|
1686
|
+
registerSource(new IntroSource);
|
|
1687
|
+
registerSource(new MentorCruiseSource);
|
|
1688
|
+
registerSource(new ADPListSource);
|
|
1689
|
+
registerSource(new ClaritySource);
|
|
1690
|
+
registerSource(new GLGSource);
|
|
1691
|
+
|
|
1692
|
+
// src/crawl.ts
|
|
1693
|
+
/**
 * Crawl one registered source and persist the result.
 *
 * Steps, in order: crawl, record changes, upsert experts, store topics (only
 * when non-empty) and tags, write crawl metadata, rebuild the graph, rescore
 * the source, rebuild persons.
 *
 * @param {object} db - Database facade providing the calls used below.
 * @param {string} sourceName - Name of a registered source.
 * @param {object} [opts] - Passed through to `source.crawl`; `opts.onLog`
 *   receives a progress message.
 * @returns {Promise<object>} Summary of the crawl.
 * @throws {Error} When `sourceName` is not registered.
 */
async function crawlSource(db, sourceName, opts = {}) {
  const source = getSource(sourceName);
  if (!source) {
    throw new Error(`Unknown source "${sourceName}". Run \`experts sources\` to list options.`);
  }
  const data = await source.crawl(opts);
  const { experts, topics, tags, total } = data;

  // Changes are recorded before the upsert below.
  const changes = db.recordChanges(source.name, experts);
  db.upsertExperts(experts);
  if (topics.length) {
    db.setTopics(source.name, topics);
  }
  db.setTags(source.name, tags);

  const finishedAt = new Date().toISOString();
  db.setMeta(`last_crawl:${source.name}`, finishedAt);
  db.setMeta(`catalog_total:${source.name}`, String(total));

  opts.onLog?.("building knowledge graph\u2026");
  const graph = db.rebuildGraph();
  db.rescore(source.name);
  db.rebuildPersons();

  const uniqueTagNames = new Set(tags.map((t) => t.name));
  return {
    source: source.name,
    experts: experts.length,
    topics: topics.map((t) => ({ name: t.name, count: t.expertCount })),
    tags: uniqueTagNames.size,
    total,
    graph,
    changes
  };
}
|
|
1720
|
+
|
|
1721
|
+
// src/format.ts
|
|
1722
|
+
import chalk from "chalk";
|
|
1723
|
+
/**
 * Format an amount with a currency prefix: "$" for USD, otherwise the
 * currency code followed by a space.
 *
 * @param {number} amount
 * @param {string} [currency="USD"]
 * @returns {string} Amount grouped per the en-US locale.
 */
function money(amount, currency = "USD") {
  const formatted = amount.toLocaleString("en-US");
  if (currency === "USD") {
    return `$${formatted}`;
  }
  return `${currency} ${formatted}`;
}
|
|
1727
|
+
/**
 * Render a five-slot star bar followed by the numeric rating and the count.
 *
 * @param {number} rating - Rating value; falsy values show an em dash label.
 * @param {number} count - Number of ratings, shown in parentheses.
 * @returns {string} Chalk-colored one-line summary.
 */
function stars(rating, count) {
  // Clamp to the five available slots: String.prototype.repeat throws a
  // RangeError for negative counts, and values above 5 would overflow the bar.
  const full = Math.min(5, Math.max(0, Math.round(rating) || 0));
  const bar = "\u2605".repeat(full) + "\u2606".repeat(5 - full);
  const label = rating ? rating.toFixed(2) : "\u2014";
  return `${chalk.yellow(bar)} ${label} ${chalk.dim(`(${count})`)}`;
}
|
|
1733
|
+
/**
 * Collapse whitespace and shorten text to at most `n` characters, ending
 * with an ellipsis when it was cut.
 *
 * @param {string} [s] - Input text; falsy input yields "".
 * @param {number} n - Maximum length including the ellipsis.
 * @returns {string}
 */
function truncate(s, n) {
  const collapsed = (s || "").replace(/\s+/g, " ").trim();
  if (collapsed.length > n) {
    return collapsed.slice(0, n - 1) + "\u2026";
  }
  return collapsed;
}
|
|
1737
|
+
function badges(e) {
|
|
1738
|
+
const parts = [];
|
|
851
1739
|
if (e.featured)
|
|
852
1740
|
parts.push(chalk.bgYellow.black(" TOP "));
|
|
853
1741
|
if (e.verified)
|
|
@@ -907,6 +1795,36 @@ function formatDetail(e, enrichment = {}) {
|
|
|
907
1795
|
L.push(` ${chalk.dim((t.createdAt || "").slice(0, 10))} ${truncate(t.text, 80)}`);
|
|
908
1796
|
}
|
|
909
1797
|
}
|
|
1798
|
+
const ext = enrichment.ext ?? {};
|
|
1799
|
+
const li = ext.linkedin;
|
|
1800
|
+
if (li && (li.headline || li.company)) {
|
|
1801
|
+
L.push("");
|
|
1802
|
+
L.push(chalk.bold.blue("LinkedIn") + ` ${[li.headline, li.company].filter(Boolean).join(" \xB7 ")}`);
|
|
1803
|
+
if (li.about)
|
|
1804
|
+
L.push(chalk.dim(wrap(String(li.about), 80)));
|
|
1805
|
+
}
|
|
1806
|
+
const site = ext.site;
|
|
1807
|
+
if (site && site.summary) {
|
|
1808
|
+
L.push("");
|
|
1809
|
+
L.push(chalk.dim("Site ") + chalk.dim(String(site.url || "")));
|
|
1810
|
+
L.push(wrap(truncate(String(site.summary), 280), 88));
|
|
1811
|
+
}
|
|
1812
|
+
const videos = enrichment.videos ?? [];
|
|
1813
|
+
if (videos.length) {
|
|
1814
|
+
L.push("");
|
|
1815
|
+
L.push(chalk.dim(`Recent videos (${videos.length}):`));
|
|
1816
|
+
for (const v of videos.slice(0, 5))
|
|
1817
|
+
L.push(` ${chalk.dim((v.publishedAt || "").slice(0, 10))} ${truncate(v.title, 70)}`);
|
|
1818
|
+
}
|
|
1819
|
+
const contacts = enrichment.contacts ?? [];
|
|
1820
|
+
if (contacts.length) {
|
|
1821
|
+
L.push("");
|
|
1822
|
+
L.push(chalk.dim("Contacts:"));
|
|
1823
|
+
for (const c of contacts) {
|
|
1824
|
+
const mark = c.status === "valid" ? chalk.green("\u2713") : c.status === "invalid" ? chalk.red("\u2717") : chalk.dim("\xB7");
|
|
1825
|
+
L.push(` ${mark} ${c.type}: ${c.value}`);
|
|
1826
|
+
}
|
|
1827
|
+
}
|
|
910
1828
|
const eq = e.extra?.exampleQuestions || [];
|
|
911
1829
|
if (eq.length) {
|
|
912
1830
|
L.push("");
|
|
@@ -970,11 +1888,48 @@ import { join as join2 } from "path";
|
|
|
970
1888
|
import { mkdirSync as mkdirSync2 } from "fs";
|
|
971
1889
|
|
|
972
1890
|
// src/connectors.ts
|
|
1891
|
+
/**
 * Ordered pool of credential sets with per-credential cooldowns.
 * `next()` always returns the first credential that is not cooling down.
 */
class CredentialPool {
  creds;
  cooldownMs;
  cooldownUntil = new Map();

  /**
   * @param {Array<{name: string, env: object}>} creds - Credential sets in
   *   priority order.
   * @param {number} [cooldownMs] - Cooldown duration in ms (default 15 min).
   */
  constructor(creds, cooldownMs = 15 * 60 * 1000) {
    this.creds = creds;
    this.cooldownMs = cooldownMs;
  }

  /**
   * Build a pool from environment variables: the comma-separated
   * `<variable>S`, then `<variable>`, then `<variable>_2` … `<variable>_10`.
   *
   * A token that appears more than once is only registered the first time;
   * otherwise an exhausted token would be retried under another name.
   *
   * @param {string} variable - Base environment variable name.
   * @returns {CredentialPool}
   */
  static fromEnv(variable) {
    const sets = [];
    const seen = new Set();
    const add = (name, token) => {
      if (!token || seen.has(token)) {
        return;
      }
      seen.add(token);
      sets.push({ name, env: { [variable]: token } });
    };
    const csv = process.env[`${variable}S`];
    if (csv) {
      csv
        .split(",")
        .map((t) => t.trim())
        .filter(Boolean)
        .forEach((t, i) => add(`${variable}#csv${i + 1}`, t));
    }
    add(`${variable}#1`, process.env[variable]);
    for (let i = 2; i <= 10; i++) {
      add(`${variable}#${i}`, process.env[`${variable}_${i}`]);
    }
    return new CredentialPool(sets);
  }

  /** @returns {number} Total number of credentials, cooling down or not. */
  size() {
    return this.creds.length;
  }

  /**
   * @param {number} [now] - Current time in ms.
   * @returns {Array} Credentials whose cooldown has expired at `now`.
   */
  available(now = Date.now()) {
    return this.creds.filter((c) => (this.cooldownUntil.get(c.name) ?? 0) <= now);
  }

  /**
   * @param {number} [now] - Current time in ms.
   * @returns {object|null} First available credential, or `null`.
   */
  next(now = Date.now()) {
    return this.available(now)[0] ?? null;
  }

  /**
   * Put a credential into cooldown for `cooldownMs` starting at `now`.
   * @param {string} name - Credential name.
   * @param {number} [now] - Current time in ms.
   */
  cooldown(name, now = Date.now()) {
    this.cooldownUntil.set(name, now + this.cooldownMs);
  }
}
|
|
973
1927
|
function defaultRunner(bin = "connectors") {
|
|
974
|
-
return async (connector, args) => {
|
|
1928
|
+
return async (connector, args, env) => {
|
|
975
1929
|
const proc = Bun.spawn([bin, "run", connector, ...args, "--format", "json"], {
|
|
976
1930
|
stdout: "pipe",
|
|
977
|
-
stderr: "pipe"
|
|
1931
|
+
stderr: "pipe",
|
|
1932
|
+
env: env ? { ...process.env, ...env } : process.env
|
|
978
1933
|
});
|
|
979
1934
|
const [out, err, code] = await Promise.all([
|
|
980
1935
|
new Response(proc.stdout).text(),
|
|
@@ -1018,9 +1973,30 @@ function extractJson(text) {
|
|
|
1018
1973
|
}
|
|
1019
1974
|
|
|
1020
1975
|
class ConnectorsClient {
|
|
1021
|
-
|
|
1976
|
+
runner;
|
|
1977
|
+
pool;
|
|
1022
1978
|
constructor(opts = {}) {
|
|
1023
|
-
this.
|
|
1979
|
+
this.runner = opts.runner ?? defaultRunner(opts.bin);
|
|
1980
|
+
this.pool = opts.pool;
|
|
1981
|
+
}
|
|
1982
|
+
async run(connector, args) {
|
|
1983
|
+
if (!this.pool || this.pool.size() <= 1) {
|
|
1984
|
+
return this.runner(connector, args, this.pool?.next()?.env);
|
|
1985
|
+
}
|
|
1986
|
+
const tries = this.pool.size();
|
|
1987
|
+
for (let i = 0;i < tries; i++) {
|
|
1988
|
+
const cred = this.pool.next();
|
|
1989
|
+
if (!cred) {
|
|
1990
|
+
return { success: false, quotaExhausted: true, error: "all credentials are in cooldown" };
|
|
1991
|
+
}
|
|
1992
|
+
const res = await this.runner(connector, args, cred.env);
|
|
1993
|
+
if (res.quotaExhausted || res.rateLimited) {
|
|
1994
|
+
this.pool.cooldown(cred.name);
|
|
1995
|
+
continue;
|
|
1996
|
+
}
|
|
1997
|
+
return res;
|
|
1998
|
+
}
|
|
1999
|
+
return { success: false, quotaExhausted: true, error: "all credentials exhausted" };
|
|
1024
2000
|
}
|
|
1025
2001
|
async xUser(username) {
|
|
1026
2002
|
const result = await this.run("x", ["users", "get", username]);
|
|
@@ -1073,10 +2049,65 @@ class ConnectorsClient {
|
|
|
1073
2049
|
});
|
|
1074
2050
|
return { tweets, result };
|
|
1075
2051
|
}
|
|
2052
|
+
async ytVideos(query, opts = {}) {
|
|
2053
|
+
const result = await this.run("youtube", [
|
|
2054
|
+
"search",
|
|
2055
|
+
"--query",
|
|
2056
|
+
query,
|
|
2057
|
+
"--type",
|
|
2058
|
+
"video",
|
|
2059
|
+
"--order",
|
|
2060
|
+
"date",
|
|
2061
|
+
"--max",
|
|
2062
|
+
String(opts.max ?? 10)
|
|
2063
|
+
]);
|
|
2064
|
+
if (!result.success)
|
|
2065
|
+
return { videos: [], result };
|
|
2066
|
+
const items = result.data?.items ?? result.data?.data ?? (Array.isArray(result.data) ? result.data : []);
|
|
2067
|
+
const videos = items.map((it) => parseYouTubeItem(it)).filter((v) => v !== null);
|
|
2068
|
+
return { videos, result };
|
|
2069
|
+
}
|
|
2070
|
+
async linkedInProfile(handle) {
|
|
2071
|
+
const result = await this.run("linkedin", ["profile", "get", handle]);
|
|
2072
|
+
if (!result.success)
|
|
2073
|
+
return { profile: null, result };
|
|
2074
|
+
const d = result.data?.data ?? result.data ?? {};
|
|
2075
|
+
if (!d || !d.headline && !d.about && !d.summary)
|
|
2076
|
+
return { profile: null, result };
|
|
2077
|
+
return {
|
|
2078
|
+
profile: {
|
|
2079
|
+
headline: d.headline ?? d.occupation ?? "",
|
|
2080
|
+
company: d.company ?? d.companyName ?? "",
|
|
2081
|
+
about: d.about ?? d.summary ?? "",
|
|
2082
|
+
location: d.location ?? d.locationName ?? ""
|
|
2083
|
+
},
|
|
2084
|
+
result
|
|
2085
|
+
};
|
|
2086
|
+
}
|
|
2087
|
+
}
|
|
2088
|
+
/**
 * Convert one YouTube search/listing item into a flat video record.
 *
 * The id is taken from a string `id`, else `id.videoId`, else `videoId`.
 * Text fields come from `snippet` when present, otherwise from the item.
 *
 * @param {object} it - Raw item.
 * @returns {object|null} Video record, or `null` when no id can be found.
 */
function parseYouTubeItem(it) {
  let id;
  if (typeof it?.id === "string") {
    id = it.id;
  } else {
    id = it?.id?.videoId ?? it?.videoId;
  }
  if (!id) {
    return null;
  }
  const snippet = it?.snippet ?? it ?? {};
  const statistics = it?.statistics ?? {};
  const thumbs = snippet?.thumbnails;
  const thumbnail = thumbs?.high?.url ?? thumbs?.default?.url ?? "";
  return {
    id: String(id),
    title: snippet.title ?? "",
    description: snippet.description ?? "",
    publishedAt: snippet.publishedAt ?? snippet.publishTime ?? "",
    url: `https://www.youtube.com/watch?v=${id}`,
    thumbnail,
    viewCount: Number(statistics.viewCount ?? 0)
  };
}
|
|
1077
2105
|
|
|
1078
2106
|
// src/enrich.ts
|
|
1079
2107
|
var sleep2 = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
2108
|
+
/**
 * Create the default connectors client for X, with a credential pool built
 * from the `X_BEARER_TOKEN` family of environment variables.
 *
 * @returns {ConnectorsClient}
 */
function defaultXClient() {
  const pool = CredentialPool.fromEnv("X_BEARER_TOKEN");
  return new ConnectorsClient({ pool });
}
|
|
1080
2111
|
/**
 * Resolve the directory used for downloaded avatars.
 *
 * @returns {string} `OPEN_EXPERTS_AVATARS` when set to a non-empty value,
 *   otherwise `<home>/.hasna/experts/avatars`.
 */
function avatarDir() {
  const override = process.env.OPEN_EXPERTS_AVATARS;
  if (override) {
    return override;
  }
  return join2(homedir2(), ".hasna", "experts", "avatars");
}
|
|
@@ -1116,7 +2147,7 @@ async function downloadAvatar(url, e, fetchFn = fetch) {
|
|
|
1116
2147
|
return path;
|
|
1117
2148
|
}
|
|
1118
2149
|
async function enrichExpert(db, e, opts) {
|
|
1119
|
-
const client = opts.client ??
|
|
2150
|
+
const client = opts.client ?? defaultXClient();
|
|
1120
2151
|
const handle = handleFromSocial(e.socials.twitter || "");
|
|
1121
2152
|
const now = new Date().toISOString();
|
|
1122
2153
|
if (!handle)
|
|
@@ -1208,9 +2239,37 @@ async function enrichExpert(db, e, opts) {
|
|
|
1208
2239
|
}
|
|
1209
2240
|
return { ok: true, notFound: false, tweets: tweetCount, avatar, rateLimited: false, quotaExhausted: false };
|
|
1210
2241
|
}
|
|
2242
|
+
/**
 * Download avatars for experts that have a remote avatar URL but no local
 * copy yet, pausing `delayMs` after every download attempt.
 *
 * @param {object} db - Database facade (`list`, `setAvatarLocal`).
 * @param {object} [opts]
 * @param {string} [opts.source] - Restrict to one source.
 * @param {Function} [opts.fetchFn] - fetch replacement.
 * @param {number} [opts.delayMs=150] - Pause between attempts.
 * @param {Function} [opts.onLog] - Progress callback (every 100 downloads).
 * @returns {Promise<{downloaded:number, skipped:number, failed:number}>}
 */
async function backfillAvatars(db, opts = {}) {
  const log = opts.onLog ?? (() => {});
  const fetchFn = opts.fetchFn ?? fetch;
  const delayMs = opts.delayMs ?? 150;
  const candidates = db.list({ source: opts.source });
  const tally = { downloaded: 0, skipped: 0, failed: 0 };
  for (const expert of candidates) {
    const needsDownload = !expert.avatarLocal && Boolean(expert.avatar);
    if (!needsDownload) {
      tally.skipped += 1;
      continue;
    }
    try {
      const localPath = await downloadAvatar(expert.avatar, expert, fetchFn);
      if (!localPath) {
        tally.failed += 1;
      } else {
        db.setAvatarLocal(expert.source, expert.sourceId, localPath);
        tally.downloaded += 1;
        if (tally.downloaded % 100 === 0) {
          log(` avatars: ${tally.downloaded} downloaded`);
        }
      }
    } catch {
      // Any error during download or bookkeeping counts as one failure.
      tally.failed += 1;
    }
    await sleep2(delayMs);
  }
  return tally;
}
|
|
1211
2270
|
async function enrichX(db, opts = {}) {
|
|
1212
2271
|
const log = opts.onLog ?? (() => {});
|
|
1213
|
-
const client = opts.client ??
|
|
2272
|
+
const client = opts.client ?? defaultXClient();
|
|
1214
2273
|
const delayMs = opts.delayMs ?? 1200;
|
|
1215
2274
|
const staleBefore = opts.refresh ? undefined : opts.sinceDays ? new Date(Date.now() - opts.sinceDays * 86400000).toISOString() : undefined;
|
|
1216
2275
|
const targets = db.expertsToEnrich({
|
|
@@ -1267,13 +2326,440 @@ async function enrichX(db, opts = {}) {
|
|
|
1267
2326
|
db.setMeta(`last_enrich:x:${opts.source ?? "all"}`, new Date().toISOString());
|
|
1268
2327
|
return res;
|
|
1269
2328
|
}
|
|
2329
|
+
/**
 * Look up recent YouTube videos for each expert that needs them and store
 * the results, stopping early on quota exhaustion or rate limiting.
 *
 * @param {object} db - Database facade (`expertsNeedingVideos`,
 *   `replaceVideos`, `setMeta`).
 * @param {object} [opts]
 * @param {object} [opts.client] - Connectors client (default: a new one).
 * @param {number} [opts.delayMs=500] - Pause between experts.
 * @param {string} [opts.source] - Restrict to one source.
 * @param {boolean} [opts.refresh] - Forwarded to `expertsNeedingVideos`.
 * @param {number} [opts.max] - Limit on experts processed.
 * @param {number} [opts.videoMax=10] - Videos requested per expert.
 * @param {Function} [opts.onLog] - Progress callback.
 * @returns {Promise<object>} Counters plus `stoppedEarly` and, when stopped,
 *   `reason`. An expert that triggered the stop is not counted as attempted.
 */
async function enrichYouTube(db, opts = {}) {
  const log = opts.onLog ?? (() => {});
  const client = opts.client ?? new ConnectorsClient;
  const delayMs = opts.delayMs ?? 500;
  const targets = db.expertsNeedingVideos({ source: opts.source, refresh: opts.refresh, limit: opts.max });
  const res = { attempted: 0, withVideos: 0, videos: 0, stoppedEarly: false };

  // Record an early stop and un-count the expert that triggered it.
  const halt = (reason) => {
    res.stoppedEarly = true;
    res.reason = reason;
    res.attempted--;
  };

  for (const e of targets) {
    res.attempted++;
    let found;
    try {
      found = await client.ytVideos(e.fullName || e.slug, { max: opts.videoMax ?? 10 });
    } catch (err) {
      log(` ${e.slug}: error ${err?.message || err}`);
      await sleep2(delayMs);
      continue;
    }
    if (found.result.quotaExhausted) {
      halt("YouTube API quota exhausted \u2014 top up or wait, then re-run");
      break;
    }
    if (found.result.rateLimited && found.videos.length === 0) {
      halt("YouTube rate limited \u2014 resume later");
      break;
    }
    const rows = found.videos.map((v) => ({
      source: e.source,
      sourceId: e.sourceId,
      videoId: v.id,
      title: v.title,
      description: v.description,
      publishedAt: v.publishedAt,
      url: v.url,
      thumbnail: v.thumbnail,
      viewCount: v.viewCount
    }));
    db.replaceVideos(e.source, e.sourceId, rows);
    if (rows.length) {
      res.withVideos++;
    }
    res.videos += rows.length;
    log(` [${res.attempted}/${targets.length}] ${e.fullName || e.slug}: ${rows.length} videos`);
    await sleep2(delayMs);
  }
  db.setMeta(`last_youtube:${opts.source ?? "all"}`, new Date().toISOString());
  return res;
}
|
|
2379
|
+
/**
 * Fetch an expert's recent tweets live, without writing to the database.
 *
 * The X user id is taken from the stored X profile when available; otherwise
 * it is resolved from the expert's twitter social link.
 *
 * @param {object} db - Database facade (`getXProfile`).
 * @param {object} e - Expert record.
 * @param {object} [opts]
 * @param {object} [opts.client] - Connectors client (default: `defaultXClient()`).
 * @param {number} [opts.max=10] - Maximum tweets requested.
 * @returns {Promise<{tweets: Array, rateLimited: boolean, quotaExhausted: boolean}>}
 */
async function liveTweets(db, e, opts = {}) {
  const client = opts.client ?? defaultXClient();
  let xId = db.getXProfile(e.source, e.sourceId)?.xId || "";
  if (!xId) {
    const handle = handleFromSocial(e.socials.twitter || "");
    if (!handle) {
      return { tweets: [], rateLimited: false, quotaExhausted: false };
    }
    const lookup = await client.xUser(handle);
    if (!lookup.user) {
      return {
        tweets: [],
        rateLimited: Boolean(lookup.result.rateLimited),
        quotaExhausted: Boolean(lookup.result.quotaExhausted)
      };
    }
    xId = lookup.user.id;
  }
  const timeline = await client.xTimeline(xId, { max: opts.max ?? 10, replies: false });
  // Keep only the stored tweet columns and tag each row with the expert key.
  const rows = timeline.tweets.map((t) => ({
    source: e.source,
    sourceId: e.sourceId,
    tweetId: t.id,
    text: t.text,
    createdAt: t.createdAt,
    retweetCount: t.retweetCount,
    replyCount: t.replyCount,
    likeCount: t.likeCount,
    quoteCount: t.quoteCount,
    impressionCount: t.impressionCount,
    isRetweet: t.isRetweet,
    isReply: t.isReply
  }));
  return {
    tweets: rows,
    rateLimited: Boolean(timeline.result.rateLimited),
    quotaExhausted: Boolean(timeline.result.quotaExhausted)
  };
}
|
|
2408
|
+
/**
 * Extract a LinkedIn handle from a profile/company URL or a bare handle.
 *
 * @param {string} value - URL, "@handle" or plain handle.
 * @returns {string} The handle, or "" for falsy input.
 */
function linkedinHandle(value) {
  if (!value) {
    return "";
  }
  // Canonical form: linkedin.com/in/<handle> or linkedin.com/company/<handle>.
  const canonical = value.match(/linkedin\.com\/(?:in|company)\/([^/?#]+)/i);
  if (canonical) {
    return canonical[1];
  }
  // Otherwise strip a leading origin or "@", then keep the first path segment.
  const withoutOrigin = value.replace(/^https?:\/\/[^/]+\//, "");
  const withoutAt = withoutOrigin.replace(/^@/, "");
  const [firstSegment] = withoutAt.split(/[/?#]/);
  return firstSegment ?? "";
}
|
|
2416
|
+
/**
 * Enrich experts with LinkedIn profile data and store it under the "linkedin" ext key.
 *
 * Stops at the first quota/rate-limit signal from the connector; the expert that
 * triggered the stop is not counted as attempted so a later run can pick it up.
 *
 * @param db   store exposing `expertsNeedingExt` and `upsertExt`
 * @param opts `onLog`, `client`, `delayMs` (default 500), `source`, `refresh`, `max`
 * @returns `{ attempted, enriched, stoppedEarly, reason? }`
 */
async function enrichLinkedIn(db, opts = {}) {
  const emit = opts.onLog ?? (() => {});
  const connector = opts.client ?? new ConnectorsClient();
  const pauseMs = opts.delayMs ?? 500;
  const queue = db.expertsNeedingExt("linkedin", "linkedin", {
    source: opts.source,
    refresh: opts.refresh,
    limit: opts.max
  });
  const res = { attempted: 0, enriched: 0, stoppedEarly: false };

  for (const expert of queue) {
    res.attempted += 1;
    const handle = linkedinHandle(expert.socials.linkedin || "");
    if (!handle) {
      continue;
    }

    let response;
    try {
      response = await connector.linkedInProfile(handle);
    } catch (err) {
      emit(` ${expert.slug}: error ${err?.message || err}`);
      await sleep2(pauseMs);
      continue;
    }

    const limited = response.result.quotaExhausted || response.result.rateLimited;
    if (limited) {
      res.stoppedEarly = true;
      res.reason = "linkedin connector quota/rate-limited or not authenticated \u2014 resume later";
      res.attempted -= 1;
      break;
    }

    if (response.profile) {
      db.upsertExt(expert.source, expert.sourceId, "linkedin", { ...response.profile });
      res.enriched += 1;
      emit(` [${res.attempted}/${queue.length}] ${expert.fullName || expert.slug}: linkedin \u2713`);
    }
    await sleep2(pauseMs);
  }
  return res;
}
|
|
2450
|
+
/**
 * Reduce an HTML document to a single line of plain text.
 *
 * Script and style elements are dropped, remaining tags are replaced by spaces,
 * common named entities and numeric character references are decoded (unknown
 * entities become a space), and runs of whitespace collapse to one space.
 *
 * @param html raw HTML; falsy values are treated as empty
 * @returns the trimmed plain text
 */
function htmlToText(html) {
  const named = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "]
  ]);
  const decodeEntity = (_match, body) => {
    if (body[0] !== "#")
      return named.get(body.toLowerCase()) ?? " ";
    const isHex = body[1] === "x" || body[1] === "X";
    const digits = body.slice(isHex ? 2 : 1);
    if (!(isHex ? /^[0-9a-f]+$/i : /^[0-9]+$/).test(digits))
      return " ";
    const code = Number.parseInt(digits, isHex ? 16 : 10);
    // Reject NUL, surrogates and out-of-range values rather than emitting broken text.
    const usable = code > 0 && code <= 0x10ffff && !(code >= 0xd800 && code <= 0xdfff);
    return usable ? String.fromCodePoint(code) : " ";
  };
  // Tags are removed before entities are decoded, so an escaped "&lt;b&gt;" stays text.
  return String(html || "")
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    .replace(/&([a-z#0-9]+);/gi, decodeEntity)
    .replace(/\s+/g, " ")
    .trim();
}
|
|
2453
|
+
/**
 * Fetch each expert's personal site (the URL on their stored X profile) and store
 * a plain-text summary under the "site" ext key.
 *
 * Experts with no URL, with a t.co short link, or with an existing "site" ext
 * (unless `opts.refresh`) are skipped and do not count towards `opts.max`.
 * Fetch failures and non-OK responses are reported through `onLog` instead of
 * being dropped silently; they never abort the run.
 *
 * @param db   store exposing `list`, `getXProfile`, `getExt`, `upsertExt`
 * @param opts `onLog`, `fetchFn` (default global fetch), `delayMs` (default 500),
 *             `maxChars` (default 1500), `source`, `refresh`, `max`
 * @returns `{ attempted, enriched, stoppedEarly }`
 */
async function enrichSite(db, opts = {}) {
  const log = opts.onLog ?? (() => {});
  const fetchFn = opts.fetchFn ?? fetch;
  const delayMs = opts.delayMs ?? 500;
  const maxChars = opts.maxChars ?? 1500;
  const all = db.list({ source: opts.source });
  const res = { attempted: 0, enriched: 0, stoppedEarly: false };
  for (const e of all) {
    // `attempted` only counts experts we actually fetch, so it doubles as the cap counter.
    if (opts.max && res.attempted >= opts.max)
      break;
    const url = db.getXProfile(e.source, e.sourceId)?.url || "";
    if (!url || /t\.co\//.test(url))
      continue;
    if (!opts.refresh && db.getExt(e.source, e.sourceId, "site"))
      continue;
    res.attempted++;
    try {
      const r = await fetchFn(url, { headers: { "User-Agent": "open-experts (+https://github.com/hasna/experts)" } });
      if (r.ok) {
        const text = htmlToText(await r.text()).slice(0, maxChars);
        db.upsertExt(e.source, e.sourceId, "site", { url, summary: text });
        res.enriched++;
        log(` [${res.attempted}] ${e.fullName || e.slug}: site \u2713`);
      } else {
        log(` [${res.attempted}] ${e.fullName || e.slug}: site HTTP ${r.status}`);
      }
    } catch (err) {
      // Best effort: one unreachable site must not stop the batch, but it should be visible.
      log(` ${e.slug}: error ${err?.message || err}`);
    }
    await sleep2(delayMs);
  }
  return res;
}
|
|
2485
|
+
|
|
2486
|
+
// src/contacts.ts
|
|
2487
|
+
import { resolveMx } from "dns/promises";
|
|
2488
|
+
var sleep3 = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
2489
|
+
var EMAIL_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
|
|
2490
|
+
/**
 * Canonicalise an email address: trim, lowercase, and unwrap a `mailto:` link.
 *
 * For `mailto:` input the query part (`?subject=...`) is dropped and the remainder
 * re-trimmed, so link parameters never end up stored as part of the address.
 *
 * @param v raw value; nullish values are treated as empty
 * @returns the normalised address (not validated)
 */
function normalizeEmail(v) {
  const s = String(v ?? "").trim().toLowerCase();
  if (!s.startsWith("mailto:"))
    return s;
  return s.slice("mailto:".length).split("?")[0].trim();
}
|
|
2493
|
+
/**
 * Canonicalise a phone number to digits, keeping a single leading "+" when the
 * input (after removing formatting characters) starts with one.
 *
 * A "+" anywhere else is discarded, so the result is always `+digits` or `digits`.
 *
 * @param v raw value; nullish values are treated as empty
 * @returns the normalised number (not validated)
 */
function normalizePhone(v) {
  const cleaned = String(v ?? "").trim().replace(/[^\d+]/g, "");
  const digits = cleaned.replace(/\D/g, "");
  return cleaned.startsWith("+") ? "+" + digits : digits;
}
|
|
2499
|
+
/**
 * Classify a phone number by digit count after normalisation.
 *
 * @param v raw phone value
 * @returns "valid" when it has between 7 and 15 digits inclusive, else "invalid"
 */
function validatePhone(v) {
  const digitCount = normalizePhone(v).replace(/\D/g, "").length;
  if (digitCount < 7 || digitCount > 15) {
    return "invalid";
  }
  return "valid";
}
|
|
2504
|
+
/**
 * Check whether an email address looks deliverable: syntax first, then an MX
 * lookup on its domain.
 *
 * A domain whose only MX records are "null MX" entries (empty or "." exchange,
 * RFC 7505) explicitly accepts no mail and is reported as "invalid".
 *
 * @param email    raw address (normalised before checking)
 * @param resolver async `(domain) => MX records`; defaults to `dns/promises` `resolveMx`
 * @returns "valid", "invalid", or "unknown" when the lookup failed for a reason
 *          other than the domain/records not existing
 */
async function validateEmail(email, resolver = resolveMx) {
  const e = normalizeEmail(email);
  if (!EMAIL_RE.test(e))
    return "invalid";
  const domain = e.split("@")[1];
  try {
    const mx = await resolver(domain);
    if (!mx || !(mx.length > 0))
      return "invalid";
    // NOTE(review): the resolver is assumed to report a null MX as exchange "" or "." — confirm against the runtime.
    const onlyNullMx = Array.from(mx).every((r) => r != null && (r.exchange === "" || r.exchange === "."));
    return onlyNullMx ? "invalid" : "valid";
  } catch (err) {
    const code = err?.code || "";
    // These codes mean the domain or its MX data does not exist; anything else is inconclusive.
    if (code === "ENOTFOUND" || code === "ENODATA" || code === "NXDOMAIN")
      return "invalid";
    return "unknown";
  }
}
|
|
2519
|
+
|
|
2520
|
+
/**
 * Contact-discovery provider that talks to the Exa websets HTTP API.
 *
 * `find(e)` creates a webset asking for the person's email and phone, polls it
 * until it reports "idle"/"completed" (or `maxPollMs` elapses), then reads its
 * items and extracts contacts from them.
 *
 * Result flags tell the caller when to stop a batch:
 *  - `quotaExhausted`: no API key, or the API answered 401/402/403
 *  - `rateLimited`:    the API answered 429
 */
class ExaWebsetsProvider {
  name = "exa";
  apiKey;
  fetchFn;
  pollMs;
  maxPollMs;
  baseUrl;
  /**
   * @param opts `apiKey` (default `process.env.EXA_API_KEY`), `fetchFn` (default global
   *             fetch), `pollMs` (default 3000), `maxPollMs` (default 120000),
   *             `baseUrl` (default "https://api.exa.ai"; trailing slashes removed)
   */
  constructor(opts = {}) {
    this.apiKey = opts.apiKey ?? process.env.EXA_API_KEY ?? "";
    this.fetchFn = opts.fetchFn ?? fetch;
    this.pollMs = opts.pollMs ?? 3000;
    this.maxPollMs = opts.maxPollMs ?? 120000;
    this.baseUrl = (opts.baseUrl ?? "https://api.exa.ai").replace(/\/+$/, "");
  }
  /** Request headers sent with every API call. */
  headers() {
    return { "content-type": "application/json", "x-api-key": this.apiKey };
  }
  /** Build the natural-language search query from the expert's name, title and company. */
  query(e) {
    const company = e.extra?.company || "";
    const bits = [e.fullName, e.title, company].filter(Boolean).join(", ");
    return `Contact information (email and phone) for ${bits || e.fullName || e.slug}`;
  }
  /**
   * Look up contacts for one expert.
   *
   * @param e expert record
   * @returns `{ contacts, rateLimited?, quotaExhausted? }`
   */
  async find(e) {
    if (!this.apiKey)
      return { contacts: [], quotaExhausted: true };
    // Statuses that should halt the whole batch rather than be retried per expert.
    // 401 is grouped with 402/403: with a rejected key every further call would fail the same way.
    const stopFor = (status) => {
      if (status === 429)
        return { contacts: [], rateLimited: true };
      if (status === 401 || status === 402 || status === 403)
        return { contacts: [], quotaExhausted: true };
      return null;
    };
    const createRes = await this.fetchFn(`${this.baseUrl}/websets/v0/websets`, {
      method: "POST",
      headers: this.headers(),
      body: JSON.stringify({
        search: { query: this.query(e), count: 3 },
        enrichments: [
          { description: "email address of this person", format: "email" },
          { description: "phone number of this person", format: "phone" }
        ]
      })
    });
    const createStop = stopFor(createRes.status);
    if (createStop)
      return createStop;
    if (!createRes.ok)
      return { contacts: [] };
    const created = await createRes.json();
    const id = created?.id;
    if (!id)
      return { contacts: [] };
    const deadline = Date.now() + this.maxPollMs;
    let status = created?.status;
    while (status !== "idle" && status !== "completed" && Date.now() < deadline) {
      await sleep3(this.pollMs);
      const r = await this.fetchFn(`${this.baseUrl}/websets/v0/websets/${id}`, { headers: this.headers() });
      // A failed poll is not fatal: fall through and read whatever items exist.
      if (!r.ok)
        break;
      status = (await r.json())?.status;
    }
    const itemsRes = await this.fetchFn(`${this.baseUrl}/websets/v0/websets/${id}/items`, { headers: this.headers() });
    if (!itemsRes.ok)
      return stopFor(itemsRes.status) ?? { contacts: [] };
    const items = await itemsRes.json();
    return { contacts: extractExaContacts(items) };
  }
}
|
|
2582
|
+
/**
 * Pull de-duplicated email and phone contacts out of an Exa websets items payload.
 *
 * Items are read from `payload.data`, `payload.items`, or the payload itself when
 * it is an array. Each item's `enrichments` may carry values under `result`,
 * `results` or `value`; every one of those may be a single value or a list, so a
 * scalar is wrapped rather than iterated character by character.
 *
 * @param payload parsed JSON response
 * @returns list of `{ type: "email" | "phone", value, confidence }`
 */
function extractExaContacts(payload) {
  const out = [];
  const seen = new Set();
  const asList = (v) => v == null ? [] : Array.isArray(v) ? v : [v];
  const items = asList(payload?.data ?? payload?.items ?? (Array.isArray(payload) ? payload : null));
  for (const item of items) {
    for (const en of asList(item?.enrichments)) {
      const fmt = en?.format;
      const confidence = en?.confidence ?? 0.5;
      for (const r of asList(en?.result ?? en?.results ?? en?.value)) {
        const raw = typeof r === "string" ? r : r?.value ?? r?.text;
        if (!raw)
          continue;
        const text = String(raw);
        if (fmt === "email" || text.includes("@")) {
          const v = normalizeEmail(text);
          if (EMAIL_RE.test(v) && !seen.has("e:" + v)) {
            seen.add("e:" + v);
            out.push({ type: "email", value: v, confidence });
          }
        } else if (fmt === "phone" || /\+?\d[\d\s().-]{6,}/.test(text)) {
          const v = normalizePhone(text);
          // Require at least 7 digits so short numeric fragments are not stored as phones.
          if (v.replace(/\D/g, "").length >= 7 && !seen.has("p:" + v)) {
            seen.add("p:" + v);
            out.push({ type: "phone", value: v, confidence });
          }
        }
      }
    }
  }
  return out;
}
|
|
2613
|
+
/**
 * Discover contacts for experts that still need them and store every hit as
 * an "unverified" contact row.
 *
 * The run halts on a provider quota signal, or on a rate-limit signal that came
 * back with no contacts; the expert that triggered the halt is not counted as
 * attempted. A provider exception is logged and the run moves on.
 *
 * @param db   store exposing `expertsNeedingContacts`, `upsertContact`, `setMeta`
 * @param opts `onLog`, `provider` (default `new ExaWebsetsProvider`), `delayMs`
 *             (default 500), `source`, `refresh`, `max`
 * @returns `{ attempted, withContacts, contacts, stoppedEarly, reason? }`
 */
async function discoverContacts(db, opts = {}) {
  const emit = opts.onLog ?? (() => {});
  const provider = opts.provider ?? new ExaWebsetsProvider();
  const pauseMs = opts.delayMs ?? 500;
  const queue = db.expertsNeedingContacts({ source: opts.source, refresh: opts.refresh, limit: opts.max });
  const res = { attempted: 0, withContacts: 0, contacts: 0, stoppedEarly: false };
  const now = new Date().toISOString();

  // Record why the run stopped and un-count the expert that hit the limit.
  const halt = (reason) => {
    res.stoppedEarly = true;
    res.reason = reason;
    res.attempted -= 1;
  };

  for (const [index, expert] of queue.entries()) {
    res.attempted += 1;

    let found;
    try {
      found = await provider.find(expert);
    } catch (err) {
      emit(` ${expert.slug}: error ${err?.message || err}`);
      await sleep3(pauseMs);
      continue;
    }

    if (found.quotaExhausted) {
      halt(`${provider.name} quota/credits exhausted \u2014 top up or wait, then re-run`);
      emit(`${provider.name} quota exhausted; stopping at ${index}/${queue.length}`);
      break;
    }
    if (found.rateLimited && found.contacts.length === 0) {
      halt(`${provider.name} rate limited \u2014 resume later`);
      break;
    }

    for (const contact of found.contacts) {
      db.upsertContact({
        source: expert.source,
        sourceId: expert.sourceId,
        type: contact.type,
        value: contact.value,
        label: contact.label ?? "",
        provider: provider.name,
        confidence: contact.confidence,
        status: "unverified",
        verifiedAt: "",
        createdAt: now
      });
    }

    const hits = found.contacts.length;
    if (hits) {
      res.withContacts += 1;
    }
    res.contacts += hits;
    emit(` [${res.attempted}/${queue.length}] ${expert.fullName || expert.slug}: ${hits} contacts`);
    await sleep3(pauseMs);
  }

  db.setMeta(`last_contacts:${opts.source ?? "all"}`, now);
  return res;
}
|
|
2667
|
+
/**
 * Validate stored contacts and persist the resulting status for each one.
 *
 * Emails go through `validateEmail` (with the optional `opts.resolver`); every
 * other contact type goes through `validatePhone`.
 *
 * @param db   store exposing `contactsToVerify` and `setContactStatus`
 * @param opts `onLog`, `delayMs` (default 50; 0 disables the pause), `source`, `max`, `resolver`
 * @returns `{ checked, valid, invalid, unknown }`
 */
async function verifyContacts(db, opts = {}) {
  const emit = opts.onLog ?? (() => {});
  const pauseMs = opts.delayMs ?? 50;
  const pending = db.contactsToVerify({ source: opts.source, limit: opts.max });
  const tally = { checked: 0, valid: 0, invalid: 0, unknown: 0 };

  for (const contact of pending) {
    const status = contact.type === "email"
      ? await validateEmail(contact.value, opts.resolver)
      : validatePhone(contact.value);
    db.setContactStatus(contact.source, contact.sourceId, contact.type, contact.value, status);
    tally.checked += 1;
    tally[status] += 1;
    // Progress line every 50 contacts.
    if (tally.checked % 50 === 0) {
      emit(` verified ${tally.checked}`);
    }
    if (pauseMs) {
      await sleep3(pauseMs);
    }
  }
  return tally;
}
|
|
2688
|
+
|
|
2689
|
+
// src/sync.ts
|
|
2690
|
+
/**
 * Convert an expert plus its stored contacts into the flat record pushed to a
 * contacts sink.
 *
 * `notes` is the expert's headline, or the first 200 characters of the bio when
 * there is no headline; a missing bio yields "" instead of throwing.
 *
 * @param e        expert record
 * @param contacts contact rows for that expert (`type`, `value`, `status`);
 *                 anything that is not an array is treated as no contacts
 * @returns the sink record
 */
function expertToContactRecord(e, contacts) {
  const rows = Array.isArray(contacts) ? contacts : [];
  const ofType = (type) => rows.filter((c) => c.type === type).map((c) => ({ value: c.value, status: c.status }));
  return {
    name: e.fullName || e.slug,
    title: e.title,
    company: e.extra?.company || "",
    emails: ofType("email"),
    phones: ofType("phone"),
    socials: e.socials,
    url: e.url,
    source: e.source,
    sourceId: e.sourceId,
    tags: e.tags,
    notes: e.headline || String(e.bio ?? "").slice(0, 200)
  };
}
|
|
2706
|
+
|
|
2707
|
+
/**
 * In-memory sink: keeps every pushed record and returns the pushed batch as
 * pretty-printed JSON.
 */
class JsonSink {
  name = "json";
  records = [];
  /**
   * @param records batch of contact records
   * @returns `{ ok, failed, output }` where `output` is the JSON of this batch only
   */
  async push(records) {
    for (const record of records) {
      this.records.push(record);
    }
    const output = JSON.stringify(records, null, 2);
    return { ok: records.length, failed: 0, output };
  }
}
|
|
2715
|
+
|
|
2716
|
+
/**
 * Sink that hands each record to an external contacts CLI, one process per
 * record: `<bin> upsert --json <record-json>`.
 *
 * A record counts as `ok` when the process exits with code 0, and as `failed`
 * on any other exit code or when the process cannot be spawned.
 */
class CliSink {
  bin;
  name = "contacts-cli";
  /** @param bin executable to run (default "contacts") */
  constructor(bin = "contacts") {
    this.bin = bin;
  }
  /**
   * @param records batch of contact records
   * @returns `{ ok, failed }` counts
   */
  async push(records) {
    let ok = 0;
    let failed = 0;
    for (const r of records) {
      try {
        // Only the exit code is used, so the child's output is discarded rather than
        // piped: a pipe that nobody reads can fill up and stall the child.
        const proc = Bun.spawn([this.bin, "upsert", "--json", JSON.stringify(r)], { stdout: "ignore", stderr: "ignore" });
        const code = await proc.exited;
        if (code === 0)
          ok++;
        else
          failed++;
      } catch {
        // Spawn failure (e.g. the binary is missing) is counted, not thrown.
        failed++;
      }
    }
    return { ok, failed };
  }
}
|
|
2740
|
+
/**
 * Build contact records for the given experts and push them to a sink.
 *
 * Identifiers that do not resolve to an expert are skipped.
 *
 * @param db         store exposing `get(idOrSlug, source)` and `contacts(source, sourceId)`
 * @param idsOrSlugs expert ids or slugs to sync
 * @param opts       `sink` (default `new JsonSink`), `source`
 * @returns `{ records, ...sinkResult }`
 */
async function syncContacts(db, idsOrSlugs, opts = {}) {
  const sink = opts.sink ?? new JsonSink();
  const records = [];
  for (const key of idsOrSlugs) {
    const expert = db.get(key, opts.source);
    if (expert) {
      const contacts = db.contacts(expert.source, expert.sourceId);
      records.push(expertToContactRecord(expert, contacts));
    }
  }
  const outcome = await sink.push(records);
  return { records, ...outcome };
}
|
|
1270
2752
|
|
|
1271
2753
|
// src/cli/index.ts
|
|
1272
|
-
var VERSION = "0.0.
|
|
2754
|
+
var VERSION = "0.0.7";
|
|
1273
2755
|
/**
 * Open the experts database at the path given by the global `db` CLI option,
 * falling back to the default path when that option is empty.
 */
function openDb() {
  const { db: dbPath } = program.opts();
  return new ExpertsDB(dbPath || defaultDbPath());
}
|
|
2759
|
+
/**
 * Collapse whitespace and shorten a value to at most `n` characters for table
 * output, ending with an ellipsis when it was cut.
 *
 * Despite the name, the cut is made at the end. Length is measured in code
 * points so astral characters (emoji) are never split in half, and a
 * non-positive `n` yields just the ellipsis.
 *
 * @param v value to display; nullish becomes ""
 * @param n maximum length including the ellipsis (default 32)
 * @returns the display string
 */
function truncateMid(v, n = 32) {
  const s = String(v ?? "").replace(/\s+/g, " ").trim();
  const chars = Array.from(s);
  if (!(chars.length > n))
    return s;
  return chars.slice(0, Math.max(0, n - 1)).join("") + "\u2026";
}
|
|
1277
2763
|
function requireData(db) {
|
|
1278
2764
|
if (db.count() === 0) {
|
|
1279
2765
|
console.error(chalk2.yellow("No experts stored yet. Run ") + chalk2.bold("experts crawl intro") + chalk2.yellow(" first."));
|
|
@@ -1305,6 +2791,8 @@ program.command("crawl [source]").description("Fetch experts from a source into
|
|
|
1305
2791
|
console.log(` ${chalk2.dim(t.name.padEnd(20))} ${t.count}`);
|
|
1306
2792
|
}
|
|
1307
2793
|
console.log(chalk2.dim(`graph: ${res.graph.nodes} nodes, ${res.graph.edges} edges`));
|
|
2794
|
+
if (res.changes)
|
|
2795
|
+
console.log(chalk2.dim(`changes since last crawl: ${res.changes}`));
|
|
1308
2796
|
db.close();
|
|
1309
2797
|
});
|
|
1310
2798
|
program.command("enrich [source]").description("Enrich experts via X/Twitter: profile, recent tweets, avatar").option("--max <n>", "limit experts processed (resumable)", (v) => parseInt(v, 10)).option("--refresh", "re-enrich experts already done").option("--since-days <n>", "tweet window in days", (v) => parseInt(v, 10), 30).option("--delay <ms>", "min delay between experts", (v) => parseInt(v, 10), 1200).option("--tweet-max <n>", "max tweets fetched per expert", (v) => parseInt(v, 10), 100).option("--no-avatars", "skip downloading profile pictures").option("--no-tweets", "skip fetching tweets").action(async (source, cmdOpts) => {
|
|
@@ -1326,6 +2814,7 @@ program.command("enrich [source]").description("Enrich experts via X/Twitter: pr
|
|
|
1326
2814
|
`))
|
|
1327
2815
|
});
|
|
1328
2816
|
db.rebuildGraph();
|
|
2817
|
+
db.rescore(source);
|
|
1329
2818
|
console.log(chalk2.green(`\u2713 enriched ${res.enriched}`) + chalk2.dim(` (${res.notFound} not found, ${res.tweets} tweets, ${res.avatars} avatars, ${res.attempted} attempted)`));
|
|
1330
2819
|
if (res.stoppedEarly)
|
|
1331
2820
|
console.log(chalk2.yellow(`\u26A0 stopped early: ${res.reason}`));
|
|
@@ -1333,14 +2822,93 @@ program.command("enrich [source]").description("Enrich experts via X/Twitter: pr
|
|
|
1333
2822
|
console.log(chalk2.dim(`progress: ${after.enriched}/${after.withHandle} enriched`));
|
|
1334
2823
|
db.close();
|
|
1335
2824
|
});
|
|
1336
|
-
program.command("
|
|
2825
|
+
// CLI: `avatars [source]` — runs the avatar backfill and prints a summary.
program
  .command("avatars [source]")
  .description("Download + properly name profile pictures for experts missing one")
  .option("--delay <ms>", "delay between downloads", (v) => parseInt(v, 10), 150)
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    console.error(chalk2.dim("Backfilling profile pictures from source media\u2026"));
    // Progress lines go to stderr so stdout only carries the summary.
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await backfillAvatars(db, { source, delayMs: cmdOpts.delay, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 ${res.downloaded} avatars downloaded`);
    const detail = chalk2.dim(` (${res.skipped} already had one or no URL, ${res.failed} failed)`);
    console.log(headline + detail);
    const { avatars: total } = db.enrichmentStats(source);
    console.log(chalk2.dim(`total experts with a named avatar: ${total}`));
    db.close();
  });
|
|
2840
|
+
// CLI: `enrich-linkedin [source]` — runs enrichLinkedIn and prints the counts.
program
  .command("enrich-linkedin [source]")
  .description("Enrich experts with LinkedIn headline/company/about (needs linkedin connector auth)")
  .option("--max <n>", "limit experts processed", (v) => parseInt(v, 10))
  .option("--refresh", "re-enrich")
  .option("--delay <ms>", "delay between experts", (v) => parseInt(v, 10), 500)
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    const { max, refresh, delay: delayMs } = cmdOpts;
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await enrichLinkedIn(db, { source, max, refresh, delayMs, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 enriched ${res.enriched}`);
    console.log(headline + chalk2.dim(` (${res.attempted} attempted)`));
    if (res.stoppedEarly) {
      console.log(chalk2.yellow(`\u26A0 ${res.reason}`));
    }
    db.close();
  });
|
|
2850
|
+
// CLI: `enrich-sites [source]` — runs enrichSite and prints the counts.
program
  .command("enrich-sites [source]")
  .description("Fetch experts' personal sites and store a text summary")
  .option("--max <n>", "limit experts processed", (v) => parseInt(v, 10))
  .option("--refresh", "re-enrich")
  .option("--delay <ms>", "delay between requests", (v) => parseInt(v, 10), 500)
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    const { max, refresh, delay: delayMs } = cmdOpts;
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await enrichSite(db, { source, max, refresh, delayMs, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 enriched ${res.enriched} sites`);
    console.log(headline + chalk2.dim(` (${res.attempted} attempted)`));
    db.close();
  });
|
|
2858
|
+
// CLI: `enrich-youtube [source]` — reports how many experts are pending, runs
// enrichYouTube, then prints the totals.
program
  .command("enrich-youtube [source]")
  .description("Fetch + store recent YouTube videos for experts with a YT handle")
  .option("--max <n>", "limit experts processed", (v) => parseInt(v, 10))
  .option("--refresh", "re-fetch for experts that already have videos")
  .option("--video-max <n>", "videos per expert", (v) => parseInt(v, 10), 10)
  .option("--delay <ms>", "delay between experts", (v) => parseInt(v, 10), 500)
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    const { max, refresh, videoMax, delay: delayMs } = cmdOpts;
    const pending = db.expertsNeedingVideos({ source, refresh });
    const remaining = pending.length;
    console.error(chalk2.dim(`Fetching YouTube videos \u2014 ${remaining} experts with handles to do`));
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await enrichYouTube(db, { source, max, refresh, videoMax, delayMs, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 ${res.videos} videos for ${res.withVideos} experts`);
    console.log(headline + chalk2.dim(` (${res.attempted} attempted)`));
    if (res.stoppedEarly) {
      console.log(chalk2.yellow(`\u26A0 stopped early: ${res.reason}`));
    }
    db.close();
  });
|
|
2877
|
+
// CLI: `videos <idOrSlug>` — prints an expert's stored videos (JSON when the
// global json option is set).
program
  .command("videos <idOrSlug>")
  .description("Show an expert's recent YouTube videos")
  .option("-s, --source <name>", "disambiguate by source")
  .option("-n, --limit <n>", "max videos", (v) => parseInt(v, 10), 10)
  .action((idOrSlug, cmdOpts) => {
    const db = openDb();
    const e = db.get(idOrSlug, cmdOpts.source);
    if (!e) {
      console.error(chalk2.red(`No expert found for "${idOrSlug}".`));
      process.exit(1);
    }
    const videos = db.recentVideos(e.source, e.sourceId, cmdOpts.limit);
    const asJson = program.opts().json;
    if (asJson) {
      console.log(JSON.stringify(videos, null, 2));
    } else if (videos.length === 0) {
      console.log(chalk2.yellow(`No videos for ${e.fullName || e.slug}. Run \`experts enrich-youtube\`.`));
    } else {
      console.log(chalk2.bold(`YouTube \u2014 ${e.fullName || e.slug}`));
      for (const video of videos) {
        const published = chalk2.dim((video.publishedAt || "").slice(0, 10));
        console.log(` ${published} ${video.title}\n${chalk2.dim(video.url)}`);
      }
    }
    db.close();
  });
|
|
2897
|
+
program.command("tweets <idOrSlug>").description("Show an expert's recent tweets (stored, or --live)").option("-s, --source <name>", "disambiguate by source").option("-n, --limit <n>", "max tweets", (v) => parseInt(v, 10), 10).option("--live", "fetch latest live via the connector (needs X credits)").action(async (idOrSlug, cmdOpts) => {
|
|
1337
2898
|
const db = openDb();
|
|
1338
2899
|
const e = db.get(idOrSlug, cmdOpts.source);
|
|
1339
2900
|
if (!e) {
|
|
1340
2901
|
console.error(chalk2.red(`No expert found for "${idOrSlug}".`));
|
|
1341
2902
|
process.exit(1);
|
|
1342
2903
|
}
|
|
1343
|
-
|
|
2904
|
+
let tweets = db.recentTweets(e.source, e.sourceId, cmdOpts.limit);
|
|
2905
|
+
if (cmdOpts.live) {
|
|
2906
|
+
const live = await liveTweets(db, e, { max: cmdOpts.limit });
|
|
2907
|
+
if (live.quotaExhausted)
|
|
2908
|
+
console.error(chalk2.yellow("X API quota exhausted \u2014 showing stored tweets."));
|
|
2909
|
+
else if (live.tweets.length)
|
|
2910
|
+
tweets = live.tweets;
|
|
2911
|
+
}
|
|
1344
2912
|
if (program.opts().json) {
|
|
1345
2913
|
console.log(JSON.stringify(tweets, null, 2));
|
|
1346
2914
|
} else if (tweets.length === 0) {
|
|
@@ -1357,14 +2925,121 @@ ${chalk2.dim(date)} ${chalk2.yellow(`\u2665${t.likeCount} \u21BA${t.retweetCount
|
|
|
1357
2925
|
}
|
|
1358
2926
|
db.close();
|
|
1359
2927
|
});
|
|
1360
|
-
program.command("
|
|
2928
|
+
// CLI: `find-contacts [source]` — reports how many experts are pending, runs
// discoverContacts, then prints the totals and the suggested next step.
program
  .command("find-contacts [source]")
  .description("Discover email + phone for experts via Exa.ai websets (resumable)")
  .option("--max <n>", "limit experts processed", (v) => parseInt(v, 10))
  .option("--refresh", "re-discover for experts that already have contacts")
  .option("--delay <ms>", "delay between experts", (v) => parseInt(v, 10), 500)
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    const { max, refresh, delay: delayMs } = cmdOpts;
    const pending = db.expertsNeedingContacts({ source, refresh });
    const remaining = pending.length;
    console.error(chalk2.dim(`Discovering contacts via Exa \u2014 ${remaining} experts to do${cmdOpts.max ? ` (capped ${cmdOpts.max})` : ""}`));
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await discoverContacts(db, { source, max, refresh, delayMs, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 ${res.contacts} contacts for ${res.withContacts} experts`);
    console.log(headline + chalk2.dim(` (${res.attempted} attempted)`));
    if (res.stoppedEarly) {
      console.log(chalk2.yellow(`\u26A0 stopped early: ${res.reason}`));
    }
    console.log(chalk2.dim("Next: `experts verify-contacts` to set working/not status."));
    db.close();
  });
|
|
2947
|
+
// CLI: `verify-contacts [source]` — runs verifyContacts and prints the tally.
program
  .command("verify-contacts [source]")
  .description("Validate discovered contacts and set working/not status")
  .option("--max <n>", "limit contacts checked", (v) => parseInt(v, 10))
  .action(async (source, cmdOpts) => {
    const db = openDb();
    requireData(db);
    const writeProgress = (m) => process.stderr.write(chalk2.dim(m + "\n"));
    const res = await verifyContacts(db, { source, max: cmdOpts.max, onLog: writeProgress });
    const headline = chalk2.green(`\u2713 verified ${res.checked}`);
    const detail = chalk2.dim(` (${res.valid} valid, ${res.invalid} invalid, ${res.unknown} unknown)`);
    console.log(headline + detail);
    db.close();
  });
|
|
2959
|
+
// CLI: `contacts <idOrSlug>` — prints an expert's stored contacts with a
// working / not working marker derived from each contact's status.
program
  .command("contacts <idOrSlug>")
  .description("Show an expert's discovered contacts with working/not status")
  .option("-s, --source <name>", "disambiguate by source")
  .action((idOrSlug, cmdOpts) => {
    const db = openDb();
    const e = db.get(idOrSlug, cmdOpts.source);
    if (!e) {
      console.error(chalk2.red(`No expert found for "${idOrSlug}".`));
      process.exit(1);
    }
    const list = db.contacts(e.source, e.sourceId);
    const asJson = program.opts().json;
    if (asJson) {
      console.log(JSON.stringify(list, null, 2));
    } else if (list.length === 0) {
      console.log(chalk2.yellow(`No contacts for ${e.fullName || e.slug}. Run \`experts find-contacts\`.`));
    } else {
      console.log(chalk2.bold(`Contacts \u2014 ${e.fullName || e.slug}`));
      for (const contact of list) {
        let mark;
        if (contact.status === "valid") {
          mark = chalk2.green("\u2713 working");
        } else if (contact.status === "invalid") {
          mark = chalk2.red("\u2717 not working");
        } else {
          // Any other status (e.g. not yet verified) is shown verbatim, dimmed.
          mark = chalk2.dim(contact.status);
        }
        console.log(` ${contact.type.padEnd(5)} ${contact.value.padEnd(36)} ${mark}`);
      }
    }
    db.close();
  });
|
|
2980
|
+
// CLI: `reindex` — rebuilds the graph, rescores, rebuilds persons, and prints
// one summary line.
program
  .command("reindex")
  .description("Rebuild the knowledge graph + authority scores (no network)")
  .action(() => {
    const db = openDb();
    requireData(db);
    const graph = db.rebuildGraph();
    const rescored = db.rescore();
    const identity = db.rebuildPersons();
    const summary = `\u2713 graph: ${graph.nodes} nodes/${graph.edges} edges; rescored ${rescored}; ${identity.persons} persons (${identity.experts} records)`;
    console.log(chalk2.green(summary));
    db.close();
  });
|
|
2989
|
+
// CLI: `changes` — prints recorded change rows: "added" rows show the new
// value, every other kind shows field, old value and new value.
program
  .command("changes")
  .description("What changed between crawls (price/title/bio edits, new experts)")
  .option("-s, --source <name>", "filter by source")
  .option("-n, --limit <n>", "max rows", (v) => parseInt(v, 10), 50)
  .action((cmdOpts) => {
    const db = openDb();
    requireData(db);
    const rows = db.changes({ source: cmdOpts.source, limit: cmdOpts.limit });
    const asJson = program.opts().json;
    if (asJson) {
      console.log(JSON.stringify(rows, null, 2));
    } else if (rows.length === 0) {
      console.log(chalk2.dim("No changes recorded yet (changes are detected on re-crawl)."));
    } else {
      for (const change of rows) {
        const when = chalk2.dim((change.detected_at || "").slice(0, 10));
        if (change.kind === "added") {
          const label = chalk2.green("\uFF0B added");
          const origin = chalk2.dim(`[${change.source}:${change.source_id}]`);
          console.log(`${when} ${label} ${change.new_value} ${origin}`);
        } else {
          const label = chalk2.yellow("~ " + change.field);
          const before = chalk2.dim(truncateMid(change.old_value));
          const after = truncateMid(change.new_value);
          const origin = chalk2.dim(`[${change.source}:${change.source_id}]`);
          console.log(`${when} ${label} ${before} \u2192 ${after} ${origin}`);
        }
      }
      console.log(chalk2.dim(`\n${rows.length} changes`));
    }
    db.close();
  });
|
|
1367
|
-
program.command("
|
|
3011
|
+
// CLI: `persons` — prints the counts returned by db.personStats().
program
  .command("persons")
  .description("Identity-resolution overview (canonical persons vs. records)")
  .action(() => {
    const db = openDb();
    requireData(db);
    const stats = db.personStats();
    if (program.opts().json) {
      console.log(JSON.stringify(stats, null, 2));
    } else {
      const lines = [
        chalk2.bold("Identity resolution"),
        ` Expert records ${stats.experts}`,
        ` Unique persons ${stats.persons}`,
        ` Merged (>1 src) ${stats.duplicated}`
      ];
      for (const line of lines) {
        console.log(line);
      }
    }
    db.close();
  });
|
|
3025
|
+
// CLI: `stale` — prints the rows from db.stalest(), each prefixed with the age
// in whole days of its lastSeen timestamp ("?" when lastSeen is empty).
program
  .command("stale")
  .description("List the experts whose data is oldest (refresh candidates)")
  .option("-s, --source <name>", "filter by source")
  .option("-n, --limit <n>", "max rows", (v) => parseInt(v, 10), 25)
  .action((cmdOpts) => {
    const db = openDb();
    requireData(db);
    const rows = db.stalest({ source: cmdOpts.source, limit: cmdOpts.limit });
    if (program.opts().json) {
      console.log(JSON.stringify(rows, null, 2));
    } else {
      const MS_PER_DAY = 86400000;
      const now = Date.now();
      for (const row of rows) {
        let days = "?";
        if (row.lastSeen) {
          days = Math.floor((now - Date.parse(row.lastSeen)) / MS_PER_DAY);
        }
        const age = chalk2.dim(String(days).padStart(4) + "d");
        console.log(`${age} ${formatRow(row.expert)}`);
      }
      console.log(chalk2.dim(`\n${rows.length} stalest experts`));
    }
    db.close();
  });
|
|
3042
|
+
program.command("list").description("List experts with filters").option("-s, --source <name>", "filter by source").option("-t, --topic <topic>", "filter by topic/category").option("--verified", "only verified experts").option("--top", "only featured/top experts").option("--min-price <n>", "minimum price", (v) => parseInt(v, 10)).option("--max-price <n>", "maximum price", (v) => parseInt(v, 10)).option("--min-rating <n>", "minimum rating", parseFloat).option("--sort <field>", "rating|price|name|reviews|authority", "rating").option("--asc", "ascending order").option("-n, --limit <n>", "max rows", (v) => parseInt(v, 10), 25).action((cmdOpts) => {
|
|
1368
3043
|
const db = openDb();
|
|
1369
3044
|
requireData(db);
|
|
1370
3045
|
const filters = {
|
|
@@ -1391,6 +3066,81 @@ ${rows.length} experts`));
|
|
|
1391
3066
|
}
|
|
1392
3067
|
db.close();
|
|
1393
3068
|
});
|
|
3069
|
+
// CLI: `embed [source]` — builds embeddings with the configured embedder and
// prints how many experts were embedded.
program
  .command("embed [source]")
  .description("Build the semantic search index (embeddings) for stored experts")
  .action(async (source) => {
    const db = openDb();
    requireData(db);
    const embedder = getEmbedder();
    console.error(chalk2.dim(`Embedding with ${embedder.id}\u2026`));
    // Each progress message starts with "\r" so it overwrites the previous one on stderr.
    const writeProgress = (m) => process.stderr.write(chalk2.dim(`\r${m}`.padEnd(40)));
    const count = await db.buildEmbeddings(embedder, { source, onLog: writeProgress });
    process.stderr.write("\n");
    console.log(chalk2.green(`\u2713 embedded ${count} experts (${embedder.id})`));
    db.close();
  });
|
|
3083
|
+
/**
 * `ask <query...>` command: natural-language semantic search over the
 * stored experts.
 *
 * Options:
 *   -s, --source <name>  only search one source
 *   -n, --limit <n>      maximum number of rows (default 15)
 *
 * Requires an existing semantic index: when `db.vectorCount()` is 0 a hint
 * to run `experts embed` is printed to stderr and the process exits with
 * status 1. With the global `--json` flag the raw search results are
 * printed as JSON; otherwise one "<score> <formatted expert>" line per
 * result, followed by a dimmed count line.
 */
program
  .command("ask <query...>")
  .description("Natural-language semantic search: 'who can help with X'")
  .option("-s, --source <name>", "filter by source")
  .option("-n, --limit <n>", "max rows", (v) => parseInt(v, 10), 15)
  .action(async (query, cmdOpts) => {
    const db = openDb();
    let hasIndex = true;
    try {
      requireData(db);
      hasIndex = db.vectorCount() !== 0;
      if (hasIndex) {
        // The variadic argument arrives as an array of words.
        const question = query.join(" ");
        const embedder = getEmbedder();
        const [qv] = await embedder.embed([question]);
        const results = db.semanticSearch(qv, { source: cmdOpts.source, limit: cmdOpts.limit });
        if (program.opts().json) {
          console.log(JSON.stringify(results, null, 2));
        } else {
          // Only show the source column when the results span several sources.
          const multi = new Set(results.map((r) => r.expert.source)).size > 1;
          for (const { expert, score } of results) {
            console.log(chalk2.dim(score.toFixed(3)) + " " + formatRow(expert, { showSource: multi }));
          }
          console.log(chalk2.dim(`\n${results.length} matches for "${question}"`));
        }
      }
    } finally {
      // Runs on every path, including a rejected embed() call.
      db.close();
    }
    if (!hasIndex) {
      // Exit only after the handle has been released above; process.exit()
      // would otherwise skip the cleanup.
      console.error(chalk2.yellow("No semantic index yet. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
      process.exit(1);
    }
  });
|
|
3105
|
+
/**
 * `brief <text...>` command: turn a pasted brief into a ranked,
 * de-duplicated shortlist of experts, each annotated with the tags that
 * literally occur in the brief.
 *
 * Options:
 *   -s, --source <name>  only search one source
 *   -n, --limit <n>      shortlist size (default 10)
 *
 * Requires an existing semantic index: when `db.vectorCount()` is 0 a hint
 * to run `experts embed` is printed to stderr and the process exits with
 * status 1. With the global `--json` flag the annotated shortlist is
 * printed as JSON; otherwise a numbered list with a "why:" line per entry.
 */
program
  .command("brief <text...>")
  .description("Paste a brief \u2192 ranked, de-duplicated expert shortlist with why each matched")
  .option("-s, --source <name>", "filter by source")
  .option("-n, --limit <n>", "shortlist size", (v) => parseInt(v, 10), 10)
  .action(async (text, cmdOpts) => {
    const db = openDb();
    let hasIndex = true;
    try {
      requireData(db);
      hasIndex = db.vectorCount() !== 0;
      if (hasIndex) {
        const brief = text.join(" ");
        const [qv] = await getEmbedder().embed([brief]);
        // Over-fetch so that enough candidates remain after de-duplication.
        const raw = db.semanticSearch(qv, { source: cmdOpts.source, limit: (cmdOpts.limit + 5) * 4 });

        // Keep the first (best-ranked) record per person.
        const seen = new Set();
        const shortlist = [];
        for (const r of raw) {
          // NOTE(review): personIdOf is defined elsewhere; if it can return
          // null/undefined for experts without a resolved identity, using it
          // directly as the key would collapse all of them into one entry.
          // Fall back to a per-record key in that case - confirm against db.
          const pid = db.personIdOf(r.expert.source, r.expert.sourceId);
          const key = pid ?? `record:${r.expert.source}:${r.expert.sourceId}`;
          if (seen.has(key)) {
            continue;
          }
          seen.add(key);
          shortlist.push(r);
          if (shortlist.length >= cmdOpts.limit) {
            break;
          }
        }

        // "why" = up to four of the expert's tags that appear in the brief
        // (case-insensitive substring match).
        const briefLc = brief.toLowerCase();
        const annotated = shortlist.map((r) => ({
          ...r,
          why: r.expert.tags.filter((t) => briefLc.includes(t.toLowerCase())).slice(0, 4),
        }));

        if (program.opts().json) {
          console.log(JSON.stringify(annotated, null, 2));
        } else {
          console.log(chalk2.bold(`Shortlist for: "${brief}"\n`));
          annotated.forEach((r, i) => {
            console.log(`${chalk2.cyan(`${i + 1}.`)} ${formatRow(r.expert)}`);
            const why = r.why.length ? r.why.join(", ") : "semantic match";
            console.log(`   ${chalk2.dim("why:")} ${why} ${chalk2.dim(`(${r.score.toFixed(3)})`)}`);
          });
        }
      }
    } finally {
      // Runs on every path, including a rejected embed() call.
      db.close();
    }
    if (!hasIndex) {
      // Exit only after the handle has been released above; process.exit()
      // would otherwise skip the cleanup.
      console.error(chalk2.yellow("No semantic index. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
      process.exit(1);
    }
  });
|
|
1394
3144
|
program.command("search <query...>").description("Full-text search across name, title and bio").option("-s, --source <name>", "filter by source").option("-n, --limit <n>", "max rows", (v) => parseInt(v, 10), 25).action((query, cmdOpts) => {
|
|
1395
3145
|
const db = openDb();
|
|
1396
3146
|
requireData(db);
|
|
@@ -1473,10 +3223,13 @@ program.command("show <idOrSlug>").description("Show full detail for one expert"
|
|
|
1473
3223
|
}
|
|
1474
3224
|
const xProfile = db.getXProfile(e.source, e.sourceId);
|
|
1475
3225
|
const tweets = db.recentTweets(e.source, e.sourceId, 10);
|
|
3226
|
+
const contacts = db.contacts(e.source, e.sourceId);
|
|
3227
|
+
const videos = db.recentVideos(e.source, e.sourceId, 5);
|
|
3228
|
+
const ext = db.allExt(e.source, e.sourceId);
|
|
1476
3229
|
if (program.opts().json) {
|
|
1477
|
-
console.log(JSON.stringify({ ...e, xProfile, tweets }, null, 2));
|
|
3230
|
+
console.log(JSON.stringify({ ...e, xProfile, tweets, contacts, videos, ext }, null, 2));
|
|
1478
3231
|
} else {
|
|
1479
|
-
console.log(formatDetail(e, { xProfile, tweets }));
|
|
3232
|
+
console.log(formatDetail(e, { xProfile, tweets, contacts, videos, ext }));
|
|
1480
3233
|
}
|
|
1481
3234
|
db.close();
|
|
1482
3235
|
});
|
|
@@ -1543,6 +3296,21 @@ program.command("export").description("Export experts as JSON or CSV").option("-
|
|
|
1543
3296
|
}
|
|
1544
3297
|
db.close();
|
|
1545
3298
|
});
|
|
3299
|
+
/**
 * `sync-contacts <idsOrSlugs...>` command: export the given experts and
 * their contacts to the contacts system.
 *
 * Options:
 *   -s, --source <name>  disambiguate ids/slugs by source
 *   --via-cli            push through a `CliSink` instead of emitting JSON
 *   -o, --out <file>     write the JSON to a file instead of stdout
 *
 * `--via-cli` takes precedence over `--out`. In JSON mode a missing
 * `res.output` is emitted as an empty array ("[]").
 */
program
  .command("sync-contacts <idsOrSlugs...>")
  .description("Export experts + their contacts to the contacts system (JSON, or --via-cli)")
  .option("-s, --source <name>", "disambiguate by source")
  .option("--via-cli", "push via the `contacts` CLI instead of emitting JSON")
  .option("-o, --out <file>", "write JSON to a file")
  .action(async (ids, cmdOpts) => {
    const db = openDb();
    try {
      requireData(db);
      const sink = cmdOpts.viaCli ? new CliSink() : new JsonSink();
      const res = await syncContacts(db, ids, { source: cmdOpts.source, sink });
      if (cmdOpts.viaCli) {
        console.log(chalk2.green(`\u2713 pushed ${res.ok} contacts`) + chalk2.dim(` (${res.failed} failed)`));
        return;
      }
      const json = res.output ?? "[]";
      if (cmdOpts.out) {
        await Bun.write(cmdOpts.out, json);
        // Status goes to stderr so stdout stays clean for piping.
        console.error(chalk2.green(`\u2713 wrote ${res.records.length} contact records to ${cmdOpts.out}`));
      } else {
        console.log(json);
      }
    } finally {
      // Close the database even if the sync or the file write rejects.
      db.close();
    }
  });
|
|
1546
3314
|
program.command("sources").description("List available marketplace sources").action(() => {
|
|
1547
3315
|
const db = openDb();
|
|
1548
3316
|
const inStore = new Map(db.sourcesInStore().map((r) => [r.source, r.count]));
|