@hasna/experts 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +353 -81
- package/dist/contacts.d.ts +15 -1
- package/dist/contacts.d.ts.map +1 -1
- package/dist/crawl.d.ts.map +1 -1
- package/dist/crypto.d.ts +6 -0
- package/dist/crypto.d.ts.map +1 -0
- package/dist/db.d.ts +16 -4
- package/dist/db.d.ts.map +1 -1
- package/dist/embed.d.ts +20 -2
- package/dist/embed.d.ts.map +1 -1
- package/dist/index.d.ts +4 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +302 -67
- package/dist/score.d.ts +28 -0
- package/dist/score.d.ts.map +1 -1
- package/dist/sdk.js +1 -0
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +218 -68
- package/dist/sources/mentorcruise.d.ts +26 -23
- package/dist/sources/mentorcruise.d.ts.map +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +5 -2
package/dist/cli/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
// @bun
|
|
3
|
+
var __require = import.meta.require;
|
|
3
4
|
|
|
4
5
|
// src/cli/index.ts
|
|
5
6
|
import { Command } from "commander";
|
|
@@ -53,6 +54,36 @@ function authorityScore(e, inputs = {}, weights = DEFAULT_WEIGHTS) {
|
|
|
53
54
|
const raw = weights.rating * rating + weights.reviews * reviews + weights.followers * followers + weights.featured * featured + weights.verified * verified + weights.recency * recency;
|
|
54
55
|
return Math.round(raw * 1000) / 10;
|
|
55
56
|
}
|
|
57
|
+
/**
 * Normalise a listed price to an hourly rate.
 *
 * @param {number} price - Listed price in the listing's own currency.
 * @param {string} priceUnit - Free-text unit label, e.g. "30 min", "per minute", "per hour", "free".
 * @returns {number|null} Hourly rate; 0 for free listings; null when the unit
 *   cannot be converted to an hourly figure (e.g. "per month", or no unit).
 */
function pricePerHour(price, priceUnit) {
  // Coerce once so a non-string unit (number, null, undefined) cannot throw below.
  const unit = String(priceUnit ?? "").toLowerCase();
  if (!price || price <= 0) {
    return unit.includes("free") ? 0 : null;
  }
  // "<N> min" session lengths: scale the session price up to 60 minutes.
  const sessionMatch = unit.match(/(\d+)\s*min/);
  if (sessionMatch) {
    const minutes = Number(sessionMatch[1]);
    // A zero-minute session has no meaningful hourly rate (and would divide by zero).
    if (minutes <= 0) {
      return null;
    }
    return Math.round(price * 60 / minutes);
  }
  if (/per\s*min|\/\s*min|minute/.test(unit)) {
    return price * 60;
  }
  if (/hour|\/\s*hr|per\s*hr/.test(unit)) {
    return price;
  }
  if (unit.includes("free")) {
    return 0;
  }
  return null;
}
|
|
72
|
+
// Default mix of semantic similarity and authority used to rank search hits.
var DEFAULT_BLEND = { semantic: 0.8, authority: 0.2 };
/**
 * Blend a semantic similarity with an authority score into one ranking value.
 *
 * @param {number} semantic - Similarity, clamped to [0, 1].
 * @param {number} authority - Authority on a 0-100 scale; falsy values count as 0.
 * @param {{semantic?: number, authority?: number}} [w] - Weights; missing keys
 *   fall back to DEFAULT_BLEND.
 * @returns {number} Weighted sum of the two clamped components.
 */
function blendScore(semantic, authority, w = DEFAULT_BLEND) {
  // NaN must not leak into the result: a NaN score makes sort comparators
  // return NaN and leaves the ranking in an unspecified order.
  const clamp01 = (value) => {
    const n = Number(value);
    return Number.isNaN(n) ? 0 : Math.max(0, Math.min(1, n));
  };
  const weights = { ...DEFAULT_BLEND, ...w };
  return weights.semantic * clamp01(semantic) + weights.authority * clamp01((authority || 0) / 100);
}
|
|
78
|
+
/**
 * List which of an expert's topics/tags are literally mentioned in the query,
 * used as a short "why" explanation next to a search hit.
 *
 * @param {string} query - The user's free-text query.
 * @param {{topics?: string[], tags?: string[]}} e - Expert record.
 * @returns {string[]} Up to six distinct labels (original casing) found in the query.
 */
function explainMatch(query, e) {
  const haystack = (query || "").toLowerCase();
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const hit = (label) => {
    const needle = String(label ?? "").trim().toLowerCase();
    // An empty label would otherwise match every query.
    if (!needle) {
      return false;
    }
    // Require a non-word character (or string edge) on both sides so "ai" does
    // not match inside "thai," while labels ending in symbols ("c++", "c#")
    // still match — plain \b cannot express that.
    const pattern = new RegExp(`(?<![\\p{L}\\p{N}_])${escapeRegExp(needle)}(?![\\p{L}\\p{N}_])`, "u");
    return pattern.test(haystack);
  };
  const labels = [...(e?.topics ?? []), ...(e?.tags ?? [])];
  return [...new Set(labels.filter(hit))].slice(0, 6);
}
|
|
56
87
|
|
|
57
88
|
// src/embed.ts
|
|
58
89
|
var STOPWORDS = new Set([
|
|
@@ -143,11 +174,40 @@ class OpenAIEmbedder {
|
|
|
143
174
|
return data.data.map((d) => d.embedding);
|
|
144
175
|
}
|
|
145
176
|
}
|
|
146
|
-
|
|
147
|
-
|
|
177
|
+
|
|
178
|
+
/**
 * Local sentence embedder backed by a feature-extraction pipeline from
 * `@huggingface/transformers`.
 *
 * NOTE(review): `id` and `dim` are fixed even when EXPERTS_EMBED_MODEL selects
 * a different model — confirm that any override is also a 384-dim model.
 */
class TransformersEmbedder {
  id = "minilm-l6-v2";
  dim = 384;
  model = process.env.EXPERTS_EMBED_MODEL || "Xenova/all-MiniLM-L6-v2";
  // Loaded pipeline function, or null until the first ensure() completes.
  extractor = null;
  // In-flight load, shared so concurrent callers do not each load the model.
  loading = null;
  /**
   * Load the pipeline once. Safe to call concurrently; a failed load is not
   * cached, so a later call retries.
   */
  async ensure() {
    if (this.extractor) {
      return;
    }
    this.loading ??= (async () => {
      const { pipeline } = await import("@huggingface/transformers");
      return pipeline("feature-extraction", this.model);
    })();
    try {
      this.extractor = await this.loading;
    } finally {
      this.loading = null;
    }
  }
  /**
   * Embed each text with mean pooling and normalisation.
   *
   * @param {string[]} texts - Inputs; empty strings are embedded as a single space.
   * @returns {Promise<number[][]>} One plain-array vector per input, in order.
   */
  async embed(texts) {
    await this.ensure();
    const vectors = [];
    for (const text of texts) {
      const result = await this.extractor(text || " ", { pooling: "mean", normalize: true });
      vectors.push(Array.from(result.data));
    }
    return vectors;
  }
}
|
|
199
|
+
/**
 * Pick the embedder implementation for this process.
 *
 * EXPERTS_EMBEDDER=hash selects the hashing embedder; EXPERTS_EMBEDDER=openai
 * selects OpenAI only when OPENAI_API_KEY is also set. Every other case tries
 * the local transformers embedder and falls back to hashing when
 * `@huggingface/transformers` cannot be imported.
 *
 * @returns {Promise<object>} An embedder instance.
 */
async function getEmbedder() {
  const { EXPERTS_EMBEDDER: requested, OPENAI_API_KEY: openaiKey } = process.env;
  if (requested === "hash") {
    return new HashingEmbedder();
  }
  if (requested === "openai" && openaiKey) {
    return new OpenAIEmbedder();
  }
  // Probe for the optional dependency; any import failure means "not available".
  const transformersAvailable = await import("@huggingface/transformers").then(() => true, () => false);
  return transformersAvailable ? new TransformersEmbedder() : new HashingEmbedder();
}
|
|
152
212
|
function cosine(a, b) {
|
|
153
213
|
let dot = 0;
|
|
@@ -232,6 +292,48 @@ function clusterPersons(experts) {
|
|
|
232
292
|
return out;
|
|
233
293
|
}
|
|
234
294
|
|
|
295
|
+
// src/crypto.ts
|
|
296
|
+
import { createCipheriv, createDecipheriv, createHmac, scryptSync } from "crypto";
|
|
297
|
+
// Marker that distinguishes encrypted values from legacy plaintext.
var PREFIX = "enc1:";
// Stored layout after the prefix (base64): 12-byte IV | 16-byte GCM tag | ciphertext.
var ENC_IV_BYTES = 12;
var ENC_TAG_BYTES = 16;
var cachedKey = null;
var cachedFrom = null;
/**
 * Derive (and cache) the 32-byte AES key from OPEN_EXPERTS_KEY.
 *
 * @returns {Buffer|null} The key, or null when no secret is configured.
 */
function key2() {
  const secret = process.env.OPEN_EXPERTS_KEY;
  if (!secret) {
    return null;
  }
  // scrypt is deliberately slow; reuse the key until the secret changes.
  if (cachedKey && cachedFrom === secret) {
    return cachedKey;
  }
  cachedKey = scryptSync(secret, "open-experts/contacts/v1", 32);
  cachedFrom = secret;
  return cachedKey;
}
/**
 * Encrypt a value with AES-256-GCM when a key is configured.
 *
 * The IV is derived from an HMAC of the plaintext, so equal plaintexts produce
 * equal ciphertexts under the same key. Null/undefined values and values that
 * already carry the prefix are returned untouched.
 *
 * @param {string|null|undefined} plaintext
 * @returns {string|null|undefined} `enc1:` + base64 payload, or the input unchanged.
 */
function maybeEncrypt(plaintext) {
  const k = key2();
  if (!k || plaintext == null) {
    return plaintext;
  }
  if (plaintext.startsWith(PREFIX)) {
    return plaintext;
  }
  const iv = createHmac("sha256", k).update(plaintext).digest().subarray(0, ENC_IV_BYTES);
  const cipher = createCipheriv("aes-256-gcm", k, iv, { authTagLength: ENC_TAG_BYTES });
  const enc = Buffer.concat([cipher.update(plaintext, "utf8"), cipher.final()]);
  const tag = cipher.getAuthTag();
  return PREFIX + Buffer.concat([iv, tag, enc]).toString("base64");
}
/**
 * Reverse maybeEncrypt. Values without the prefix, and every value when no key
 * is configured, are returned as stored.
 *
 * @param {string|null|undefined} stored
 * @returns {string|null|undefined} The decrypted UTF-8 text, or the input unchanged.
 * @throws {Error} When the payload is too short to hold an IV and a full tag,
 *   or when authentication fails (wrong key or modified data).
 */
function maybeDecrypt(stored) {
  if (stored == null || !stored.startsWith(PREFIX)) {
    return stored;
  }
  const k = key2();
  if (!k) {
    return stored;
  }
  const raw = Buffer.from(stored.slice(PREFIX.length), "base64");
  // Without this check a truncated payload yields a shortened tag, and GCM
  // accepts tags shorter than 16 bytes unless authTagLength is pinned.
  if (raw.length < ENC_IV_BYTES + ENC_TAG_BYTES) {
    throw new Error("maybeDecrypt: encrypted value is truncated or malformed");
  }
  const iv = raw.subarray(0, ENC_IV_BYTES);
  const tag = raw.subarray(ENC_IV_BYTES, ENC_IV_BYTES + ENC_TAG_BYTES);
  const enc = raw.subarray(ENC_IV_BYTES + ENC_TAG_BYTES);
  const decipher = createDecipheriv("aes-256-gcm", k, iv, { authTagLength: ENC_TAG_BYTES });
  decipher.setAuthTag(tag);
  return Buffer.concat([decipher.update(enc), decipher.final()]).toString("utf8");
}
|
|
336
|
+
|
|
235
337
|
// src/db.ts
|
|
236
338
|
function defaultDbPath() {
|
|
237
339
|
return process.env.OPEN_EXPERTS_DB || join(homedir(), ".hasna", "experts", "experts.db");
|
|
@@ -343,10 +445,11 @@ class ExpertsDB {
|
|
|
343
445
|
CREATE INDEX IF NOT EXISTS idx_contacts_expert ON contacts(source, source_id);
|
|
344
446
|
CREATE INDEX IF NOT EXISTS idx_contacts_status ON contacts(status);
|
|
345
447
|
|
|
346
|
-
-- Semantic search: one embedding vector per expert
|
|
448
|
+
-- Semantic search: one embedding vector per expert (text_hash enables
|
|
449
|
+
-- incremental re-embedding \u2014 skip unchanged experts).
|
|
347
450
|
CREATE TABLE IF NOT EXISTS vectors (
|
|
348
451
|
source TEXT NOT NULL, source_id TEXT NOT NULL,
|
|
349
|
-
embedder TEXT NOT NULL, dim INTEGER, vec BLOB,
|
|
452
|
+
embedder TEXT NOT NULL, dim INTEGER, vec BLOB, text_hash TEXT,
|
|
350
453
|
PRIMARY KEY (source, source_id)
|
|
351
454
|
);
|
|
352
455
|
|
|
@@ -368,6 +471,7 @@ class ExpertsDB {
|
|
|
368
471
|
`);
|
|
369
472
|
this.addColumnIfMissing("experts", "avatar_local", "TEXT");
|
|
370
473
|
this.addColumnIfMissing("experts", "authority", "REAL DEFAULT 0");
|
|
474
|
+
this.addColumnIfMissing("vectors", "text_hash", "TEXT");
|
|
371
475
|
}
|
|
372
476
|
addColumnIfMissing(table, column, type) {
|
|
373
477
|
const cols = this.db.query(`PRAGMA table_info(${table})`).all();
|
|
@@ -454,6 +558,7 @@ class ExpertsDB {
|
|
|
454
558
|
extra: JSON.parse(r.extra || "{}"),
|
|
455
559
|
avatarLocal: r.avatar_local || undefined,
|
|
456
560
|
authority: r.authority ?? 0,
|
|
561
|
+
pricePerHour: pricePerHour(r.price ?? 0, r.price_unit ?? ""),
|
|
457
562
|
crawledAt: r.crawled_at
|
|
458
563
|
};
|
|
459
564
|
}
|
|
@@ -577,11 +682,11 @@ class ExpertsDB {
|
|
|
577
682
|
sql += " ORDER BY name";
|
|
578
683
|
return this.db.query(sql).all(...params);
|
|
579
684
|
}
|
|
580
|
-
setMeta(
|
|
581
|
-
this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(
|
|
685
|
+
setMeta(key3, value) {
|
|
686
|
+
this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(key3, value);
|
|
582
687
|
}
|
|
583
|
-
getMeta(
|
|
584
|
-
const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(
|
|
688
|
+
getMeta(key3) {
|
|
689
|
+
const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(key3);
|
|
585
690
|
return row ? row.value : null;
|
|
586
691
|
}
|
|
587
692
|
stats(source) {
|
|
@@ -605,12 +710,12 @@ class ExpertsDB {
|
|
|
605
710
|
const nodeIds = new Map;
|
|
606
711
|
const insertNode = this.db.query("INSERT INTO kg_nodes (type, key, label) VALUES (?, ?, ?) ON CONFLICT(type, key) DO UPDATE SET label=excluded.label RETURNING id");
|
|
607
712
|
const insertEdge = this.db.query("INSERT OR REPLACE INTO kg_edges (src, dst, rel, weight) VALUES (?, ?, ?, ?)");
|
|
608
|
-
const node = (type,
|
|
609
|
-
const ck = `${type}\x00${
|
|
713
|
+
const node = (type, key3, label) => {
|
|
714
|
+
const ck = `${type}\x00${key3.toLowerCase()}`;
|
|
610
715
|
const cached = nodeIds.get(ck);
|
|
611
716
|
if (cached != null)
|
|
612
717
|
return cached;
|
|
613
|
-
const id = insertNode.get(type,
|
|
718
|
+
const id = insertNode.get(type, key3.toLowerCase(), label).id;
|
|
614
719
|
nodeIds.set(ck, id);
|
|
615
720
|
return id;
|
|
616
721
|
};
|
|
@@ -620,7 +725,7 @@ class ExpertsDB {
|
|
|
620
725
|
for (const topic of e.topics) {
|
|
621
726
|
insertEdge.run(eId, node("topic", topic, topic), "IN_TOPIC", 1);
|
|
622
727
|
}
|
|
623
|
-
const tweetText = this.recentTweets(e.source, e.sourceId, 30).map((t) => t.text).join(". ");
|
|
728
|
+
const tweetText = this.recentTweets(e.source, e.sourceId, 30).filter((t) => !t.isRetweet).map((t) => t.text).join(". ");
|
|
624
729
|
const tags = inferTags(expertText(e) + ". " + tweetText, vocabulary);
|
|
625
730
|
for (const tag of tags) {
|
|
626
731
|
insertEdge.run(eId, node("tag", tag, tag), "HAS_TAG", 1);
|
|
@@ -674,11 +779,11 @@ class ExpertsDB {
|
|
|
674
779
|
lastSeen: r.last_seen || ""
|
|
675
780
|
}));
|
|
676
781
|
}
|
|
677
|
-
expertFromNodeKey(
|
|
678
|
-
const idx =
|
|
782
|
+
expertFromNodeKey(key3) {
|
|
783
|
+
const idx = key3.indexOf(":");
|
|
679
784
|
if (idx < 0)
|
|
680
785
|
return null;
|
|
681
|
-
return this.get(
|
|
786
|
+
return this.get(key3.slice(idx + 1), key3.slice(0, idx));
|
|
682
787
|
}
|
|
683
788
|
findByNeeds(needs, opts = {}) {
|
|
684
789
|
const cleaned = needs.map((n) => n.trim().toLowerCase()).filter(Boolean);
|
|
@@ -805,6 +910,15 @@ class ExpertsDB {
|
|
|
805
910
|
};
|
|
806
911
|
}
|
|
807
912
|
replaceTweets(source, sourceId, tweets) {
|
|
913
|
+
const norm = (t) => (t || "").toLowerCase().replace(/^rt @\w+:\s*/, "").replace(/https?:\/\/\S+/g, "").replace(/[^a-z0-9 ]/g, "").replace(/\s+/g, " ").trim();
|
|
914
|
+
const seen = new Set;
|
|
915
|
+
const deduped = tweets.filter((t) => {
|
|
916
|
+
const k = norm(t.text);
|
|
917
|
+
if (!k || seen.has(k))
|
|
918
|
+
return false;
|
|
919
|
+
seen.add(k);
|
|
920
|
+
return true;
|
|
921
|
+
});
|
|
808
922
|
const tx = this.db.transaction((rows) => {
|
|
809
923
|
this.db.query("DELETE FROM tweets WHERE source = ? AND source_id = ?").run(source, sourceId);
|
|
810
924
|
const stmt = this.db.query(`
|
|
@@ -817,7 +931,7 @@ class ExpertsDB {
|
|
|
817
931
|
stmt.run(t.source, t.sourceId, t.tweetId, t.text, t.createdAt, t.retweetCount, t.replyCount, t.likeCount, t.quoteCount, t.impressionCount, t.isRetweet ? 1 : 0, t.isReply ? 1 : 0);
|
|
818
932
|
}
|
|
819
933
|
});
|
|
820
|
-
tx(
|
|
934
|
+
tx(deduped);
|
|
821
935
|
}
|
|
822
936
|
recentTweets(source, sourceId, limit = 10) {
|
|
823
937
|
const rows = this.db.query("SELECT * FROM tweets WHERE source = ? AND source_id = ? ORDER BY created_at DESC LIMIT ?").all(source, sourceId, limit);
|
|
@@ -935,17 +1049,24 @@ class ExpertsDB {
|
|
|
935
1049
|
const log = opts.onLog ?? (() => {});
|
|
936
1050
|
const experts = this.list({ source: opts.source });
|
|
937
1051
|
const batch = opts.batch ?? 64;
|
|
938
|
-
const stmt = this.db.query("INSERT OR REPLACE INTO vectors (source, source_id, embedder, dim, vec) VALUES (?, ?, ?, ?, ?)");
|
|
1052
|
+
const stmt = this.db.query("INSERT OR REPLACE INTO vectors (source, source_id, embedder, dim, vec, text_hash) VALUES (?, ?, ?, ?, ?, ?)");
|
|
1053
|
+
const existing = new Map(this.db.query("SELECT source, source_id, embedder, text_hash FROM vectors").all().map((r) => [`${r.source}:${r.source_id}`, { embedder: r.embedder, hash: r.text_hash || "" }]));
|
|
1054
|
+
const work = experts.map((e) => ({ e, text: expertEmbedText(e), hash: String(Bun.hash(expertEmbedText(e))) })).filter(({ e, hash }) => {
|
|
1055
|
+
if (opts.force)
|
|
1056
|
+
return true;
|
|
1057
|
+
const prev = existing.get(`${e.source}:${e.sourceId}`);
|
|
1058
|
+
return !prev || prev.embedder !== embedder.id || prev.hash !== hash;
|
|
1059
|
+
});
|
|
939
1060
|
let done = 0;
|
|
940
|
-
for (let i = 0;i <
|
|
941
|
-
const slice =
|
|
942
|
-
const vecs = await embedder.embed(slice.map((
|
|
1061
|
+
for (let i = 0;i < work.length; i += batch) {
|
|
1062
|
+
const slice = work.slice(i, i + batch);
|
|
1063
|
+
const vecs = await embedder.embed(slice.map((w) => w.text));
|
|
943
1064
|
const tx = this.db.transaction(() => {
|
|
944
|
-
slice.forEach((
|
|
1065
|
+
slice.forEach((w, j) => stmt.run(w.e.source, w.e.sourceId, embedder.id, embedder.dim, packVector(vecs[j]), w.hash));
|
|
945
1066
|
});
|
|
946
1067
|
tx();
|
|
947
1068
|
done += slice.length;
|
|
948
|
-
log(` embedded ${done}/${experts.length}`);
|
|
1069
|
+
log(` embedded ${done}/${work.length} (${experts.length - work.length} unchanged)`);
|
|
949
1070
|
}
|
|
950
1071
|
this.setMeta("embedder", embedder.id);
|
|
951
1072
|
this.setMeta("embedded_at", new Date().toISOString());
|
|
@@ -958,7 +1079,13 @@ class ExpertsDB {
|
|
|
958
1079
|
const where = opts.source ? "WHERE v.source = ?" : "";
|
|
959
1080
|
const params = opts.source ? [opts.source] : [];
|
|
960
1081
|
const rows = this.db.query(`SELECT e.*, v.vec AS _vec FROM vectors v JOIN experts e ON e.source=v.source AND e.source_id=v.source_id ${where}`).all(...params);
|
|
961
|
-
const
|
|
1082
|
+
const blend = opts.blend !== false;
|
|
1083
|
+
const scored = rows.map((r) => {
|
|
1084
|
+
const expert = this.rowToExpert(r);
|
|
1085
|
+
const semantic = cosine(queryVec, unpackVector(r._vec));
|
|
1086
|
+
const score = blend ? blendScore(semantic, expert.authority ?? 0) : semantic;
|
|
1087
|
+
return { expert, score, semantic };
|
|
1088
|
+
});
|
|
962
1089
|
scored.sort((a, b) => b.score - a.score);
|
|
963
1090
|
return scored.slice(0, opts.limit ?? 25);
|
|
964
1091
|
}
|
|
@@ -1048,7 +1175,7 @@ class ExpertsDB {
|
|
|
1048
1175
|
$source: c.source,
|
|
1049
1176
|
$source_id: c.sourceId,
|
|
1050
1177
|
$type: c.type,
|
|
1051
|
-
$value: c.value,
|
|
1178
|
+
$value: maybeEncrypt(c.value),
|
|
1052
1179
|
$label: c.label,
|
|
1053
1180
|
$provider: c.provider,
|
|
1054
1181
|
$confidence: c.confidence,
|
|
@@ -1058,7 +1185,7 @@ class ExpertsDB {
|
|
|
1058
1185
|
});
|
|
1059
1186
|
}
|
|
1060
1187
|
setContactStatus(source, sourceId, type, value, status) {
|
|
1061
|
-
this.db.query("UPDATE contacts SET status = ?, verified_at = ? WHERE source = ? AND source_id = ? AND type = ? AND value = ?").run(status, new Date().toISOString(), source, sourceId, type, value);
|
|
1188
|
+
this.db.query("UPDATE contacts SET status = ?, verified_at = ? WHERE source = ? AND source_id = ? AND type = ? AND value = ?").run(status, new Date().toISOString(), source, sourceId, type, maybeEncrypt(value));
|
|
1062
1189
|
}
|
|
1063
1190
|
contacts(source, sourceId) {
|
|
1064
1191
|
const rows = this.db.query("SELECT * FROM contacts WHERE source = ? AND source_id = ? ORDER BY type, confidence DESC").all(source, sourceId);
|
|
@@ -1066,7 +1193,7 @@ class ExpertsDB {
|
|
|
1066
1193
|
source: r.source,
|
|
1067
1194
|
sourceId: r.source_id,
|
|
1068
1195
|
type: r.type,
|
|
1069
|
-
value: r.value,
|
|
1196
|
+
value: maybeDecrypt(r.value),
|
|
1070
1197
|
label: r.label || "",
|
|
1071
1198
|
provider: r.provider || "",
|
|
1072
1199
|
confidence: r.confidence ?? 0,
|
|
@@ -1091,7 +1218,7 @@ class ExpertsDB {
|
|
|
1091
1218
|
source: r.source,
|
|
1092
1219
|
sourceId: r.source_id,
|
|
1093
1220
|
type: r.type,
|
|
1094
|
-
value: r.value,
|
|
1221
|
+
value: maybeDecrypt(r.value),
|
|
1095
1222
|
label: r.label || "",
|
|
1096
1223
|
provider: r.provider || "",
|
|
1097
1224
|
confidence: r.confidence ?? 0,
|
|
@@ -1401,34 +1528,37 @@ async function fetchJson(url, fetchFn, init = {}) {
|
|
|
1401
1528
|
}
|
|
1402
1529
|
|
|
1403
1530
|
// src/sources/mentorcruise.ts
|
|
1531
|
+
/**
 * Reduce an HTML fragment to plain text: drop tags, decode common entities,
 * collapse whitespace.
 *
 * Entities that would re-create markup (`&lt;`, `&gt;` and their numeric
 * forms) and unknown entities are replaced with a space rather than decoded,
 * so the result never contains angle brackets produced by decoding.
 *
 * @param {string|null|undefined} s - HTML fragment.
 * @returns {string} Plain text, trimmed.
 */
function stripHtml(s) {
  const named = new Map([
    ["amp", "&"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "]
  ]);
  const decodeEntity = (_match, body) => {
    if (body[0] !== "#") {
      return named.get(body.toLowerCase()) ?? " ";
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const digits = isHex ? body.slice(2) : body.slice(1);
    const valid = isHex ? /^[0-9a-f]+$/i.test(digits) : /^[0-9]+$/.test(digits);
    const code = valid ? Number.parseInt(digits, isHex ? 16 : 10) : NaN;
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    // Control characters, "<" (60), ">" (62), nbsp (160) and invalid code
    // points all degrade to a plain space.
    if (!(code >= 32 && code <= 0x10ffff) || isSurrogate || code === 60 || code === 62 || code === 160) {
      return " ";
    }
    return String.fromCodePoint(code);
  };
  return (s || "")
    .replace(/<[^>]+>/g, " ")
    .replace(/&([a-z#0-9]+);/gi, decodeEntity)
    .replace(/\s+/g, " ")
    .trim();
}
|
|
1404
1534
|
/**
 * Convert one MentorCruise search hit into the project's expert record.
 *
 * @param {object} m - Raw hit as returned by the MentorCruise search index.
 * @param {string} crawledAt - Timestamp stored on the record.
 * @returns {object} Whatever `makeExpert` returns for the normalised fields.
 */
function normalizeMentor(m, crawledAt) {
  const path = m.get_absolute_url || "";
  // Prefer the slug embedded in the profile URL; otherwise derive one.
  const slug = path.match(/\/mentor\/([^/]+)/)?.[1] || slugify(m.get_full_name || String(m.objectID ?? ""));
  const socials = {};
  if (m.twitter) {
    const handle = String(m.twitter).trim();
    // Handles may arrive as "@name"; keeping the "@" would build a broken URL.
    socials.twitter = handle.startsWith("http") ? handle : `https://x.com/${handle.replace(/^@+/, "")}`;
  }
  if (m.linkedin) {
    socials.linkedin = m.linkedin;
  }
  const planPrices = Array.isArray(m.all_prices) ? m.all_prices.map(Number).filter(Number.isFinite) : [];
  const hasPlanPrice = planPrices.length > 0;
  const price = hasPlanPrice ? Math.min(...planPrices) : Math.round(Number(m.avg_price_per_call) || 0);
  // Only plan prices are monthly; the fallback figure is a per-call average
  // and must not be labelled as a monthly price.
  let priceUnit = "";
  if (price) {
    priceUnit = hasPlanPrice ? "per month" : "per call";
  }
  const jobTitles = m.cleaned_job_title ?? [];
  return makeExpert({
    source: "mentorcruise",
    sourceId: String(m.objectID ?? slug),
    slug,
    url: path ? `https://mentorcruise.com${path}` : `https://mentorcruise.com/mentor/${slug}/`,
    fullName: m.get_full_name ?? "",
    title: Array.isArray(jobTitles) ? jobTitles.join(", ") : String(jobTitles),
    bio: stripHtml(m.bio_formatted ?? ""),
    avatar: m.get_profile_picture ?? "",
    price,
    priceCurrency: "USD",
    priceUnit,
    rating: m.avg_rating_float_one_decimal ?? 0,
    ratingCount: m.number_of_reviews ?? 0,
    featured: Boolean(m.is_top_mentor),
    topics: m.get_industries ?? [],
    tags: m.get_skills ?? [],
    socials,
    extra: { company: m.company ?? "", location: m.get_location_display ?? "", avgPricePerCall: m.avg_price_per_call ?? 0 },
    crawledAt
  });
}
|
|
@@ -1438,44 +1568,61 @@ class MentorCruiseSource {
|
|
|
1438
1568
|
description = "MentorCruise \u2014 long-term mentorship from vetted mentors";
|
|
1439
1569
|
website = "https://mentorcruise.com";
|
|
1440
1570
|
fetchFn;
|
|
1441
|
-
|
|
1571
|
+
appId;
|
|
1572
|
+
apiKey;
|
|
1573
|
+
index;
|
|
1442
1574
|
pageSize;
|
|
1443
1575
|
constructor(opts = {}) {
|
|
1444
1576
|
this.fetchFn = opts.fetchFn ?? fetch;
|
|
1445
|
-
this.
|
|
1446
|
-
this.
|
|
1577
|
+
this.appId = opts.appId ?? process.env.MENTORCRUISE_ALGOLIA_APP_ID ?? "YD3XA4V91L";
|
|
1578
|
+
this.apiKey = opts.apiKey ?? process.env.MENTORCRUISE_ALGOLIA_API_KEY ?? "454b55a2e50bc884225318d99b0dad1a";
|
|
1579
|
+
this.index = opts.index ?? process.env.MENTORCRUISE_ALGOLIA_INDEX ?? "MentorProfile_prod";
|
|
1580
|
+
this.pageSize = opts.pageSize ?? 200;
|
|
1447
1581
|
}
|
|
1448
1582
|
async crawl(opts = {}) {
|
|
1449
1583
|
const log = opts.onLog ?? (() => {});
|
|
1450
1584
|
const crawledAt = new Date().toISOString();
|
|
1585
|
+
const url = `https://${this.appId}-dsn.algolia.net/1/indexes/${this.index}/query`;
|
|
1451
1586
|
const experts = [];
|
|
1452
1587
|
const tags = new Set;
|
|
1453
|
-
let
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1588
|
+
let page = 0;
|
|
1589
|
+
let pages = 1;
|
|
1590
|
+
while (page < pages) {
|
|
1591
|
+
let data;
|
|
1592
|
+
try {
|
|
1593
|
+
const res = await this.fetchFn(url, {
|
|
1594
|
+
method: "POST",
|
|
1595
|
+
headers: {
|
|
1596
|
+
"X-Algolia-Application-Id": this.appId,
|
|
1597
|
+
"X-Algolia-API-Key": this.apiKey,
|
|
1598
|
+
"Content-Type": "application/json"
|
|
1599
|
+
},
|
|
1600
|
+
body: JSON.stringify({ params: `hitsPerPage=${this.pageSize}&page=${page}` })
|
|
1601
|
+
});
|
|
1602
|
+
if (!res.ok)
|
|
1603
|
+
break;
|
|
1604
|
+
data = await res.json();
|
|
1605
|
+
} catch {
|
|
1458
1606
|
break;
|
|
1459
|
-
|
|
1460
|
-
|
|
1607
|
+
}
|
|
1608
|
+
pages = data.nbPages ?? 1;
|
|
1609
|
+
for (const hit of data.hits ?? []) {
|
|
1610
|
+
const e = normalizeMentor(hit, crawledAt);
|
|
1461
1611
|
experts.push(e);
|
|
1462
1612
|
for (const t of e.tags)
|
|
1463
1613
|
tags.add(t);
|
|
1464
1614
|
}
|
|
1465
|
-
|
|
1466
|
-
|
|
1615
|
+
log(` mentorcruise: ${experts.length}/${data.nbHits ?? "?"}`);
|
|
1616
|
+
page++;
|
|
1467
1617
|
if (opts.max && experts.length >= opts.max)
|
|
1468
1618
|
break;
|
|
1469
|
-
if (items.length < this.pageSize)
|
|
1470
|
-
break;
|
|
1471
1619
|
}
|
|
1472
1620
|
if (experts.length === 0) {
|
|
1473
|
-
log("mentorcruise:
|
|
1621
|
+
log("mentorcruise: Algolia returned nothing (set MENTORCRUISE_ALGOLIA_* or inject fetchFn).");
|
|
1474
1622
|
}
|
|
1475
|
-
const topics = [];
|
|
1476
1623
|
return {
|
|
1477
1624
|
experts: opts.max ? experts.slice(0, opts.max) : experts,
|
|
1478
|
-
topics,
|
|
1625
|
+
topics: [],
|
|
1479
1626
|
tags: [...tags].map((name) => ({ name, topic: "" })),
|
|
1480
1627
|
total: experts.length
|
|
1481
1628
|
};
|
|
@@ -1696,6 +1843,9 @@ async function crawlSource(db, sourceName, opts = {}) {
|
|
|
1696
1843
|
throw new Error(`Unknown source "${sourceName}". Run \`experts sources\` to list options.`);
|
|
1697
1844
|
}
|
|
1698
1845
|
const data = await source.crawl(opts);
|
|
1846
|
+
if (data.experts.length === 0 && db.count(source.name) > 0) {
|
|
1847
|
+
opts.onLog?.(`\u26A0 ${source.name} returned 0 experts but ${db.count(source.name)} are stored \u2014 possible API drift; not overwriting.`);
|
|
1848
|
+
}
|
|
1699
1849
|
const changes = db.recordChanges(source.name, data.experts);
|
|
1700
1850
|
db.upsertExperts(data.experts);
|
|
1701
1851
|
if (data.topics.length)
|
|
@@ -2496,10 +2646,68 @@ function normalizePhone(v) {
|
|
|
2496
2646
|
return "+" + cleaned.slice(1).replace(/\D/g, "");
|
|
2497
2647
|
return cleaned;
|
|
2498
2648
|
}
|
|
2649
|
+
/**
 * Heuristically flag placeholder phone numbers: one repeated digit, the
 * 1234567890 / 0123456789 sequences, or a 2-3 digit group repeated three or
 * more times.
 *
 * The same checks are also applied to the trailing ten digits of longer
 * numbers, so a placeholder does not slip through merely because a country
 * code was prefixed (e.g. "+1 555-555-5555").
 *
 * @param {string} v - Phone number in any format accepted by normalizePhone.
 * @returns {boolean} True when the number looks like a placeholder.
 */
function looksFakePhone(v) {
  const digits = normalizePhone(v).replace(/\D/g, "");
  const isPlaceholder = (d) =>
    /^(\d)\1+$/.test(d) ||
    d === "1234567890" ||
    d === "0123456789" ||
    /^(\d{2,3})\1\1+$/.test(d);
  if (isPlaceholder(digits)) {
    return true;
  }
  return digits.length > 10 && isPlaceholder(digits.slice(-10));
}
|
|
2499
2659
|
/**
 * Classify a phone number as "valid" or "invalid".
 *
 * A number is valid when it has 7 to 15 digits after normalisation and is not
 * flagged by looksFakePhone.
 *
 * @param {string} v - Phone number in any format accepted by normalizePhone.
 * @returns {"valid"|"invalid"}
 */
function validatePhone(v) {
  const digitCount = normalizePhone(v).replace(/\D/g, "").length;
  const plausibleLength = digitCount >= 7 && digitCount <= 15;
  // Short-circuit keeps the placeholder check off numbers already rejected on length.
  return plausibleLength && !looksFakePhone(v) ? "valid" : "invalid";
}
|
|
2668
|
+
/**
 * Map the SMTP reply code of a RCPT TO command to a contact status.
 *
 * @param {number} code - Three-digit SMTP reply code.
 * @returns {"valid"|"invalid"|"unknown"} 250/251 are accepted recipients,
 *   550/551/553/554 are rejections; every other code is inconclusive.
 */
function interpretRcptCode(code) {
  switch (code) {
    case 250:
    case 251:
      return "valid";
    case 550:
    case 551:
    case 553:
    case 554:
      return "invalid";
    default:
      return "unknown";
  }
}
|
|
2675
|
+
/**
 * Ask a mail server whether it would accept mail for an address, without
 * sending any message: greeting, HELO, MAIL FROM, RCPT TO, then QUIT.
 *
 * @param {string} email - Address to probe.
 * @param {string} mxHost - Mail exchanger host to connect to.
 * @param {{port?: number, timeoutMs?: number}} [opts] - Connection overrides
 *   (defaults: port 25, 7000 ms idle timeout).
 * @returns {Promise<"valid"|"invalid"|"unknown">} Verdict from the RCPT TO
 *   reply; "unknown" on timeout, connection error, early disconnect, or any
 *   non-2xx reply before RCPT TO. Never rejects.
 */
async function smtpProbe(email, mxHost, opts = {}) {
  const { port = 25, timeoutMs = 7000 } = opts;
  // The address is interpolated into a protocol command: CR/LF or angle
  // brackets would allow extra SMTP commands to be smuggled in.
  if (typeof email !== "string" || /[\r\n<>]/.test(email)) {
    return "unknown";
  }
  const net = await import("net");
  return new Promise((resolve) => {
    // Sent in order, each one only after the previous reply was positive.
    const commands = [
      "HELO open-experts.local\r\n",
      "MAIL FROM:<probe@open-experts.local>\r\n",
      `RCPT TO:<${email}>\r\n`
    ];
    const maxPendingBytes = 8192;
    let step = 0;
    let settled = false;
    let pending = "";
    const sock = net.createConnection({ host: mxHost, port, timeout: timeoutMs });
    const done = (result) => {
      if (settled) {
        return;
      }
      settled = true;
      try {
        if (!sock.destroyed && sock.writable) {
          // Say goodbye, then drop the connection once the bytes are flushed.
          sock.end("QUIT\r\n", () => sock.destroy());
        }
      } catch {
        // Best effort: the verdict is already decided.
      }
      resolve(result);
    };
    const onReply = (code) => {
      if (step === commands.length) {
        done(interpretRcptCode(code));
        return;
      }
      // A rejected greeting/HELO/MAIL says nothing about the mailbox itself.
      if (code < 200 || code > 299) {
        done("unknown");
        return;
      }
      sock.write(commands[step]);
      step += 1;
    };
    sock.on("timeout", () => {
      done("unknown");
      sock.destroy();
    });
    sock.on("error", () => done("unknown"));
    // Covers servers that hang up without ever sending a final reply.
    sock.on("close", () => done("unknown"));
    sock.on("data", (chunk) => {
      if (settled) {
        return;
      }
      // TCP chunks do not align with replies: buffer and process whole lines.
      pending += chunk.toString();
      let eol = pending.indexOf("\n");
      while (!settled && eol !== -1) {
        const line = pending.slice(0, eol);
        pending = pending.slice(eol + 1);
        const reply = /^(\d{3})(-?)/.exec(line);
        // "250-..." is a continuation line; only "250 ..." ends a reply.
        if (reply && reply[2] !== "-") {
          onReply(Number(reply[1]));
        }
        eol = pending.indexOf("\n");
      }
      if (!settled && pending.length > maxPendingBytes) {
        done("unknown");
      }
    });
  });
}
|
|
2504
2712
|
async function validateEmail(email, resolver = resolveMx) {
|
|
2505
2713
|
const e = normalizeEmail(email);
|
|
@@ -2669,12 +2877,25 @@ async function verifyContacts(db, opts = {}) {
|
|
|
2669
2877
|
const delayMs = opts.delayMs ?? 50;
|
|
2670
2878
|
const targets = db.contactsToVerify({ source: opts.source, limit: opts.max });
|
|
2671
2879
|
const res = { checked: 0, valid: 0, invalid: 0, unknown: 0 };
|
|
2880
|
+
const resolver = opts.resolver;
|
|
2881
|
+
const prober = opts.prober ?? smtpProbe;
|
|
2672
2882
|
for (const c of targets) {
|
|
2673
2883
|
let status;
|
|
2674
|
-
if (c.type === "email")
|
|
2675
|
-
status = await validateEmail(c.value,
|
|
2676
|
-
|
|
2884
|
+
if (c.type === "email") {
|
|
2885
|
+
status = await validateEmail(c.value, resolver);
|
|
2886
|
+
if (opts.smtp && status === "valid") {
|
|
2887
|
+
try {
|
|
2888
|
+
const mx = await (resolver ?? (await import("dns/promises")).resolveMx)(c.value.split("@")[1]);
|
|
2889
|
+
if (mx[0]?.exchange) {
|
|
2890
|
+
const probed = await prober(c.value, mx[0].exchange);
|
|
2891
|
+
if (probed !== "unknown")
|
|
2892
|
+
status = probed;
|
|
2893
|
+
}
|
|
2894
|
+
} catch {}
|
|
2895
|
+
}
|
|
2896
|
+
} else {
|
|
2677
2897
|
status = validatePhone(c.value);
|
|
2898
|
+
}
|
|
2678
2899
|
db.setContactStatus(c.source, c.sourceId, c.type, c.value, status);
|
|
2679
2900
|
res.checked++;
|
|
2680
2901
|
res[status]++;
|
|
@@ -2751,7 +2972,7 @@ async function syncContacts(db, idsOrSlugs, opts = {}) {
|
|
|
2751
2972
|
}
|
|
2752
2973
|
|
|
2753
2974
|
// src/cli/index.ts
|
|
2754
|
-
var VERSION = "0.0.
|
|
2975
|
+
var VERSION = "0.0.8";
|
|
2755
2976
|
function openDb() {
|
|
2756
2977
|
const opts = program.opts();
|
|
2757
2978
|
return new ExpertsDB(opts.db || defaultDbPath());
|
|
@@ -2768,6 +2989,15 @@ function requireData(db) {
|
|
|
2768
2989
|
}
|
|
2769
2990
|
var program = new Command;
|
|
2770
2991
|
program.name("experts").description("Crawl and query expert marketplaces (intro.co and more).").version(VERSION, "-v, --version").option("--db <path>", "database file", undefined).option("--json", "output JSON where applicable", false);
|
|
2992
|
+
program.addHelpText("after", `
|
|
2993
|
+
Command groups:
|
|
2994
|
+
Query who \xB7 ask \xB7 brief \xB7 find \xB7 search \xB7 related \xB7 show \xB7 list
|
|
2995
|
+
Ingest crawl \xB7 enrich \xB7 enrich-youtube \xB7 enrich-linkedin \xB7 enrich-sites \xB7 avatars
|
|
2996
|
+
Contacts find-contacts \xB7 verify-contacts \xB7 contacts \xB7 sync-contacts
|
|
2997
|
+
Insight topics \xB7 tags \xB7 stats \xB7 graph \xB7 persons \xB7 changes \xB7 stale
|
|
2998
|
+
Admin embed \xB7 reindex \xB7 export \xB7 sources
|
|
2999
|
+
|
|
3000
|
+
Start with: experts who "someone who can help me scale a B2B AI product"`);
|
|
2771
3001
|
program.command("crawl [source]").description("Fetch experts from a source into the local store").option("--max <n>", "limit number of experts (quick runs)", (v) => parseInt(v, 10)).option("--no-topics", "skip topic-membership resolution (fewer requests)").option("--delay <ms>", "min delay between requests", (v) => parseInt(v, 10), 500).action(async (source, cmdOpts) => {
|
|
2772
3002
|
const db = openDb();
|
|
2773
3003
|
const name = source || "intro";
|
|
@@ -2944,12 +3174,13 @@ program.command("find-contacts [source]").description("Discover email + phone fo
|
|
|
2944
3174
|
console.log(chalk2.dim("Next: `experts verify-contacts` to set working/not status."));
|
|
2945
3175
|
db.close();
|
|
2946
3176
|
});
|
|
2947
|
-
program.command("verify-contacts [source]").description("Validate discovered contacts and set working/not status").option("--max <n>", "limit contacts checked", (v) => parseInt(v, 10)).action(async (source, cmdOpts) => {
|
|
3177
|
+
program.command("verify-contacts [source]").description("Validate discovered contacts and set working/not status").option("--max <n>", "limit contacts checked", (v) => parseInt(v, 10)).option("--smtp", "also SMTP-probe MX-valid emails (needs outbound port 25)").action(async (source, cmdOpts) => {
|
|
2948
3178
|
const db = openDb();
|
|
2949
3179
|
requireData(db);
|
|
2950
3180
|
const res = await verifyContacts(db, {
|
|
2951
3181
|
source,
|
|
2952
3182
|
max: cmdOpts.max,
|
|
3183
|
+
smtp: cmdOpts.smtp,
|
|
2953
3184
|
onLog: (m) => process.stderr.write(chalk2.dim(m + `
|
|
2954
3185
|
`))
|
|
2955
3186
|
});
|
|
@@ -3066,14 +3297,15 @@ ${rows.length} experts`));
|
|
|
3066
3297
|
}
|
|
3067
3298
|
db.close();
|
|
3068
3299
|
});
|
|
3069
|
-
program.command("embed [source]").description("Build the semantic search index (
|
|
3300
|
+
program.command("embed [source]").description("Build the semantic search index (incremental \u2014 only new/changed experts)").option("--force", "re-embed every expert even if unchanged").action(async (source, cmdOpts) => {
|
|
3070
3301
|
const db = openDb();
|
|
3071
3302
|
requireData(db);
|
|
3072
|
-
const embedder = getEmbedder();
|
|
3303
|
+
const embedder = await getEmbedder();
|
|
3073
3304
|
console.error(chalk2.dim(`Embedding with ${embedder.id}\u2026`));
|
|
3074
3305
|
const n = await db.buildEmbeddings(embedder, {
|
|
3075
3306
|
source,
|
|
3076
|
-
|
|
3307
|
+
force: cmdOpts.force,
|
|
3308
|
+
onLog: (m) => process.stderr.write(chalk2.dim(`\r${m}`.padEnd(50)))
|
|
3077
3309
|
});
|
|
3078
3310
|
process.stderr.write(`
|
|
3079
3311
|
`);
|
|
@@ -3087,18 +3319,62 @@ program.command("ask <query...>").description("Natural-language semantic search:
|
|
|
3087
3319
|
console.error(chalk2.yellow("No semantic index yet. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
|
|
3088
3320
|
process.exit(1);
|
|
3089
3321
|
}
|
|
3090
|
-
const embedder = getEmbedder();
|
|
3322
|
+
const embedder = await getEmbedder();
|
|
3323
|
+
const storedBy = db.getMeta("embedder");
|
|
3324
|
+
if (storedBy && storedBy !== embedder.id) {
|
|
3325
|
+
console.error(chalk2.yellow(`\u26A0 index built with "${storedBy}" but active embedder is "${embedder.id}" \u2014 run \`experts embed\` to re-index.`));
|
|
3326
|
+
}
|
|
3091
3327
|
const [qv] = await embedder.embed([query.join(" ")]);
|
|
3092
|
-
const
|
|
3328
|
+
const q = query.join(" ");
|
|
3329
|
+
const results = db.semanticSearch(qv, { source: cmdOpts.source, limit: cmdOpts.limit }).map((r) => ({
|
|
3330
|
+
...r,
|
|
3331
|
+
why: explainMatch(q, r.expert)
|
|
3332
|
+
}));
|
|
3093
3333
|
if (program.opts().json) {
|
|
3094
3334
|
console.log(JSON.stringify(results, null, 2));
|
|
3095
3335
|
} else {
|
|
3096
3336
|
const multi = new Set(results.map((r) => r.expert.source)).size > 1;
|
|
3097
|
-
for (const { expert, score } of results) {
|
|
3337
|
+
for (const { expert, score, why } of results) {
|
|
3098
3338
|
console.log(chalk2.dim(score.toFixed(3)) + " " + formatRow(expert, { showSource: multi }));
|
|
3339
|
+
if (why.length)
|
|
3340
|
+
console.log(` ${chalk2.dim("why: " + why.join(", "))}`);
|
|
3099
3341
|
}
|
|
3100
3342
|
console.log(chalk2.dim(`
|
|
3101
|
-
${results.length} matches for "${
|
|
3343
|
+
${results.length} matches for "${q}"`));
|
|
3344
|
+
}
|
|
3345
|
+
db.close();
|
|
3346
|
+
});
|
|
3347
|
+
program.command("who <query...>").description("Smart search \u2014 the one command: semantic + authority + person-dedup + why").option("-s, --source <name>", "filter by source").option("-n, --limit <n>", "results", (v) => parseInt(v, 10), 10).action(async (query, cmdOpts) => {
|
|
3348
|
+
const db = openDb();
|
|
3349
|
+
requireData(db);
|
|
3350
|
+
if (db.vectorCount() === 0) {
|
|
3351
|
+
console.error(chalk2.yellow("No semantic index. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
|
|
3352
|
+
process.exit(1);
|
|
3353
|
+
}
|
|
3354
|
+
const q = query.join(" ");
|
|
3355
|
+
const [qv] = await (await getEmbedder()).embed([q]);
|
|
3356
|
+
const raw = db.semanticSearch(qv, { source: cmdOpts.source, limit: (cmdOpts.limit + 5) * 4 });
|
|
3357
|
+
const seen = new Set;
|
|
3358
|
+
const out = [];
|
|
3359
|
+
for (const r of raw) {
|
|
3360
|
+
const pid = db.personIdOf(r.expert.source, r.expert.sourceId);
|
|
3361
|
+
if (seen.has(pid))
|
|
3362
|
+
continue;
|
|
3363
|
+
seen.add(pid);
|
|
3364
|
+
out.push({ expert: r.expert, score: r.score, why: explainMatch(q, r.expert) });
|
|
3365
|
+
if (out.length >= cmdOpts.limit)
|
|
3366
|
+
break;
|
|
3367
|
+
}
|
|
3368
|
+
if (program.opts().json) {
|
|
3369
|
+
console.log(JSON.stringify(out, null, 2));
|
|
3370
|
+
} else {
|
|
3371
|
+
const multi = new Set(out.map((r) => r.expert.source)).size > 1;
|
|
3372
|
+
out.forEach((r, i) => {
|
|
3373
|
+
console.log(`${chalk2.cyan(`${i + 1}.`)} ${formatRow(r.expert, { showSource: multi })}`);
|
|
3374
|
+
console.log(` ${chalk2.dim((r.why.length ? "why: " + r.why.join(", ") : "semantic match") + ` (${r.score.toFixed(3)})`)}`);
|
|
3375
|
+
});
|
|
3376
|
+
console.log(chalk2.dim(`
|
|
3377
|
+
${out.length} experts for "${q}"`));
|
|
3102
3378
|
}
|
|
3103
3379
|
db.close();
|
|
3104
3380
|
});
|
|
@@ -3110,7 +3386,7 @@ program.command("brief <text...>").description("Paste a brief \u2192 ranked, de-
|
|
|
3110
3386
|
process.exit(1);
|
|
3111
3387
|
}
|
|
3112
3388
|
const brief = text.join(" ");
|
|
3113
|
-
const [qv] = await getEmbedder().embed([brief]);
|
|
3389
|
+
const [qv] = await (await getEmbedder()).embed([brief]);
|
|
3114
3390
|
const raw = db.semanticSearch(qv, { source: cmdOpts.source, limit: (cmdOpts.limit + 5) * 4 });
|
|
3115
3391
|
const seen = new Set;
|
|
3116
3392
|
const shortlist = [];
|
|
@@ -3123,11 +3399,7 @@ program.command("brief <text...>").description("Paste a brief \u2192 ranked, de-
|
|
|
3123
3399
|
if (shortlist.length >= cmdOpts.limit)
|
|
3124
3400
|
break;
|
|
3125
3401
|
}
|
|
3126
|
-
const
|
|
3127
|
-
const annotated = shortlist.map((r) => ({
|
|
3128
|
-
...r,
|
|
3129
|
-
why: r.expert.tags.filter((t) => briefLc.includes(t.toLowerCase())).slice(0, 4)
|
|
3130
|
-
}));
|
|
3402
|
+
const annotated = shortlist.map((r) => ({ ...r, why: explainMatch(brief, r.expert) }));
|
|
3131
3403
|
if (program.opts().json) {
|
|
3132
3404
|
console.log(JSON.stringify(annotated, null, 2));
|
|
3133
3405
|
} else {
|