@hasna/experts 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env bun
2
2
  // @bun
3
+ var __require = import.meta.require;
3
4
 
4
5
  // src/cli/index.ts
5
6
  import { Command } from "commander";
@@ -53,6 +54,36 @@ function authorityScore(e, inputs = {}, weights = DEFAULT_WEIGHTS) {
53
54
  const raw = weights.rating * rating + weights.reviews * reviews + weights.followers * followers + weights.featured * featured + weights.verified * verified + weights.recency * recency;
54
55
  return Math.round(raw * 1000) / 10;
55
56
  }
57
/**
 * Convert a listed price and its free-text unit into an hourly rate.
 *
 * @param {number} price - Listed price; missing, zero, negative or non-finite
 *   values are treated as "no price".
 * @param {string} priceUnit - Free-text unit such as "per 30 min", "per minute",
 *   "/ hr", "free".
 * @returns {number|null} Hourly price, 0 when the listing is free, or null when
 *   the unit cannot be converted to an hourly rate (e.g. "per month").
 */
function pricePerHour(price, priceUnit) {
  const unit = String(priceUnit ?? "").toLowerCase();
  const amount = Number(price);
  // No usable price: only an explicit "free" unit means 0; everything else is unknown.
  if (!amount || amount <= 0)
    return /free/.test(unit) ? 0 : null;
  if (!Number.isFinite(amount))
    return null;
  // "<N> min" — scale the per-session price up to 60 minutes.
  const minMatch = unit.match(/(\d+)\s*min/);
  if (minMatch) {
    const minutes = Number(minMatch[1]);
    // "0 min" would divide by zero and report an infinite hourly rate.
    if (minutes <= 0)
      return null;
    return Math.round(amount * 60 / minutes);
  }
  if (/per\s*min|\/\s*min|minute/.test(unit))
    return amount * 60;
  if (/hour|\/\s*hr|per\s*hr/.test(unit))
    return amount;
  if (/free/.test(unit))
    return 0;
  return null;
}
72
// Default weighting between semantic similarity and the 0-100 authority score.
var DEFAULT_BLEND = { semantic: 0.8, authority: 0.2 };
/**
 * Blend a semantic similarity with an authority score into one ranking score.
 *
 * @param {number} semantic - Similarity, clamped to [0, 1].
 * @param {number} authority - Authority on a 0-100 scale; rescaled and clamped to [0, 1].
 * @param {{semantic?: number, authority?: number}} [w] - Weights; keys that are
 *   missing fall back to DEFAULT_BLEND.
 * @returns {number} Weighted sum of the two clamped components.
 */
function blendScore(semantic, authority, w = DEFAULT_BLEND) {
  // NaN would survive Math.min/Math.max and poison any sort that uses the score.
  const unit = (x) => Number.isNaN(x) ? 0 : Math.max(0, Math.min(1, x));
  // Merge so a partial weights object does not multiply by undefined.
  const weights = { ...DEFAULT_BLEND, ...w };
  const s = unit(Number(semantic));
  const a = unit(Number(authority ?? 0) / 100);
  return weights.semantic * s + weights.authority * a;
}
78
/**
 * List the expert's topics/tags that literally appear in the query, for a
 * short "why this matched" annotation.
 *
 * @param {string} query - Free-text query; null/undefined is treated as empty.
 * @param {{topics?: string[], tags?: string[]}} e - Expert record.
 * @returns {string[]} Up to 6 matching labels, topics first, original casing,
 *   de-duplicated case-insensitively.
 */
function explainMatch(query, e) {
  const haystack = String(query || "").toLowerCase();
  const seen = new Set();
  const matched = [];
  for (const label of [...(e?.topics ?? []), ...(e?.tags ?? [])]) {
    if (typeof label !== "string")
      continue;
    const needle = label.trim().toLowerCase();
    // An empty label would match every query; duplicates only differ by case.
    if (!needle || seen.has(needle))
      continue;
    seen.add(needle);
    const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Require a word boundary only on edges that are word characters, so "go"
    // does not match inside "django" while "c++" and ".net" still match.
    const lead = /^\w/.test(needle) ? "(?<!\\w)" : "";
    const trail = /\w$/.test(needle) ? "(?!\\w)" : "";
    if (new RegExp(`${lead}${escaped}${trail}`).test(haystack))
      matched.push(label);
    if (matched.length === 6)
      break;
  }
  return matched;
}
56
87
 
57
88
  // src/embed.ts
58
89
  var STOPWORDS = new Set([
@@ -143,11 +174,40 @@ class OpenAIEmbedder {
143
174
  return data.data.map((d) => d.embedding);
144
175
  }
145
176
  }
146
- function getEmbedder() {
147
- if (process.env.EXPERTS_EMBEDDER === "openai" && process.env.OPENAI_API_KEY) {
177
+
178
/**
 * Local embedder backed by the `feature-extraction` pipeline from
 * `@huggingface/transformers` (imported lazily on first use).
 *
 * NOTE(review): `id` is fixed even when EXPERTS_EMBED_MODEL selects another
 * model, so code that compares embedder ids cannot tell the two apart —
 * confirm whether the id should incorporate the model name.
 */
class TransformersEmbedder {
  id = "minilm-l6-v2";
  // Vector length of the default model; corrected after the first embedding.
  dim = 384;
  model = process.env.EXPERTS_EMBED_MODEL || "Xenova/all-MiniLM-L6-v2";
  extractor = null;
  // In-flight model load, shared by concurrent ensure() callers.
  loading = null;
  /** Load the pipeline once; concurrent callers await the same load. */
  async ensure() {
    if (this.extractor)
      return;
    this.loading ??= (async () => {
      const { pipeline } = await import("@huggingface/transformers");
      this.extractor = await pipeline("feature-extraction", this.model);
    })();
    try {
      await this.loading;
    } finally {
      // Clear the handle so a failed load can be retried by a later call.
      this.loading = null;
    }
  }
  /**
   * Embed each text with mean pooling and normalization.
   * @param {string[]} texts - Inputs; empty strings are replaced by a single space.
   * @returns {Promise<number[][]>} One vector per input, in order.
   */
  async embed(texts) {
    await this.ensure();
    const out = [];
    for (const t of texts) {
      const r = await this.extractor(t || " ", { pooling: "mean", normalize: true });
      out.push(Array.from(r.data));
    }
    // Keep `dim` truthful when the configured model is not 384-dimensional.
    if (out.length > 0)
      this.dim = out[0].length;
    return out;
  }
}
199
+ async function getEmbedder() {
200
+ const choice = process.env.EXPERTS_EMBEDDER;
201
+ if (choice === "openai" && process.env.OPENAI_API_KEY)
148
202
  return new OpenAIEmbedder;
203
+ if (choice === "hash")
204
+ return new HashingEmbedder;
205
+ try {
206
+ await import("@huggingface/transformers");
207
+ return new TransformersEmbedder;
208
+ } catch {
209
+ return new HashingEmbedder;
149
210
  }
150
- return new HashingEmbedder;
151
211
  }
152
212
  function cosine(a, b) {
153
213
  let dot = 0;
@@ -232,6 +292,48 @@ function clusterPersons(experts) {
232
292
  return out;
233
293
  }
234
294
 
295
+ // src/crypto.ts
296
+ import { createCipheriv, createDecipheriv, createHmac, scryptSync } from "crypto";
297
// Marker that distinguishes encrypted values from legacy plaintext ones.
var PREFIX = "enc1:";
// Derived key and the secret it was derived from, so scrypt runs once per secret.
var cachedKey = null;
var cachedFrom = null;
/**
 * Derive (and cache) the 32-byte AES key from OPEN_EXPERTS_KEY.
 * @returns {Buffer|null} The key, or null when no secret is configured.
 */
function key2() {
  const secret = process.env.OPEN_EXPERTS_KEY;
  if (!secret)
    return null;
  if (cachedKey && cachedFrom === secret)
    return cachedKey;
  cachedKey = scryptSync(secret, "open-experts/contacts/v1", 32);
  cachedFrom = secret;
  return cachedKey;
}
/**
 * Encrypt a string with AES-256-GCM when a key is configured.
 *
 * The IV is the first 12 bytes of HMAC-SHA256(key, plaintext), so equal
 * plaintexts always produce equal ciphertexts (presumably so stored values can
 * still be matched by equality — confirm against callers). Output layout is
 * PREFIX + base64(iv | 16-byte tag | ciphertext).
 *
 * @param {string|null|undefined} plaintext
 * @returns {string|null|undefined} Encrypted value, or the input unchanged when
 *   there is no key, the input is null/undefined, or it already carries PREFIX.
 */
function maybeEncrypt(plaintext) {
  const k = key2();
  if (!k || plaintext == null)
    return plaintext;
  if (plaintext.startsWith(PREFIX))
    return plaintext;
  const iv = createHmac("sha256", k).update(plaintext).digest().subarray(0, 12);
  const cipher = createCipheriv("aes-256-gcm", k, iv);
  const enc = Buffer.concat([cipher.update(plaintext, "utf8"), cipher.final()]);
  const tag = cipher.getAuthTag();
  return PREFIX + Buffer.concat([iv, tag, enc]).toString("base64");
}
/**
 * Reverse maybeEncrypt. Best-effort: anything that cannot be decrypted is
 * returned exactly as stored instead of throwing, matching the no-key path.
 *
 * @param {string|null|undefined} stored
 * @returns {string|null|undefined} Plaintext, or `stored` unchanged when it is
 *   not an encrypted value, no key is configured, the payload is truncated, or
 *   authentication fails (wrong key or tampered data).
 */
function maybeDecrypt(stored) {
  if (typeof stored !== "string" || !stored.startsWith(PREFIX))
    return stored;
  const k = key2();
  if (!k)
    return stored;
  const ivLength = 12;
  const tagLength = 16;
  const raw = Buffer.from(stored.slice(PREFIX.length), "base64");
  // Too short to hold iv + tag: setAuthTag would throw on the partial tag.
  if (raw.length < ivLength + tagLength)
    return stored;
  try {
    const iv = raw.subarray(0, ivLength);
    const tag = raw.subarray(ivLength, ivLength + tagLength);
    const enc = raw.subarray(ivLength + tagLength);
    const decipher = createDecipheriv("aes-256-gcm", k, iv);
    decipher.setAuthTag(tag);
    return Buffer.concat([decipher.update(enc), decipher.final()]).toString("utf8");
  } catch {
    // final() throws when the tag does not verify (rotated key, corrupt row);
    // one bad row must not abort the caller's whole read.
    return stored;
  }
}
336
+
235
337
  // src/db.ts
236
338
  function defaultDbPath() {
237
339
  return process.env.OPEN_EXPERTS_DB || join(homedir(), ".hasna", "experts", "experts.db");
@@ -343,10 +445,11 @@ class ExpertsDB {
343
445
  CREATE INDEX IF NOT EXISTS idx_contacts_expert ON contacts(source, source_id);
344
446
  CREATE INDEX IF NOT EXISTS idx_contacts_status ON contacts(status);
345
447
 
346
- -- Semantic search: one embedding vector per expert.
448
+ -- Semantic search: one embedding vector per expert (text_hash enables
449
+ -- incremental re-embedding \u2014 skip unchanged experts).
347
450
  CREATE TABLE IF NOT EXISTS vectors (
348
451
  source TEXT NOT NULL, source_id TEXT NOT NULL,
349
- embedder TEXT NOT NULL, dim INTEGER, vec BLOB,
452
+ embedder TEXT NOT NULL, dim INTEGER, vec BLOB, text_hash TEXT,
350
453
  PRIMARY KEY (source, source_id)
351
454
  );
352
455
 
@@ -368,6 +471,7 @@ class ExpertsDB {
368
471
  `);
369
472
  this.addColumnIfMissing("experts", "avatar_local", "TEXT");
370
473
  this.addColumnIfMissing("experts", "authority", "REAL DEFAULT 0");
474
+ this.addColumnIfMissing("vectors", "text_hash", "TEXT");
371
475
  }
372
476
  addColumnIfMissing(table, column, type) {
373
477
  const cols = this.db.query(`PRAGMA table_info(${table})`).all();
@@ -454,6 +558,7 @@ class ExpertsDB {
454
558
  extra: JSON.parse(r.extra || "{}"),
455
559
  avatarLocal: r.avatar_local || undefined,
456
560
  authority: r.authority ?? 0,
561
+ pricePerHour: pricePerHour(r.price ?? 0, r.price_unit ?? ""),
457
562
  crawledAt: r.crawled_at
458
563
  };
459
564
  }
@@ -577,11 +682,11 @@ class ExpertsDB {
577
682
  sql += " ORDER BY name";
578
683
  return this.db.query(sql).all(...params);
579
684
  }
580
- setMeta(key2, value) {
581
- this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(key2, value);
685
+ setMeta(key3, value) {
686
+ this.db.query("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value").run(key3, value);
582
687
  }
583
- getMeta(key2) {
584
- const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(key2);
688
+ getMeta(key3) {
689
+ const row = this.db.query("SELECT value FROM meta WHERE key = ?").get(key3);
585
690
  return row ? row.value : null;
586
691
  }
587
692
  stats(source) {
@@ -605,12 +710,12 @@ class ExpertsDB {
605
710
  const nodeIds = new Map;
606
711
  const insertNode = this.db.query("INSERT INTO kg_nodes (type, key, label) VALUES (?, ?, ?) ON CONFLICT(type, key) DO UPDATE SET label=excluded.label RETURNING id");
607
712
  const insertEdge = this.db.query("INSERT OR REPLACE INTO kg_edges (src, dst, rel, weight) VALUES (?, ?, ?, ?)");
608
- const node = (type, key2, label) => {
609
- const ck = `${type}\x00${key2.toLowerCase()}`;
713
+ const node = (type, key3, label) => {
714
+ const ck = `${type}\x00${key3.toLowerCase()}`;
610
715
  const cached = nodeIds.get(ck);
611
716
  if (cached != null)
612
717
  return cached;
613
- const id = insertNode.get(type, key2.toLowerCase(), label).id;
718
+ const id = insertNode.get(type, key3.toLowerCase(), label).id;
614
719
  nodeIds.set(ck, id);
615
720
  return id;
616
721
  };
@@ -620,7 +725,7 @@ class ExpertsDB {
620
725
  for (const topic of e.topics) {
621
726
  insertEdge.run(eId, node("topic", topic, topic), "IN_TOPIC", 1);
622
727
  }
623
- const tweetText = this.recentTweets(e.source, e.sourceId, 30).map((t) => t.text).join(". ");
728
+ const tweetText = this.recentTweets(e.source, e.sourceId, 30).filter((t) => !t.isRetweet).map((t) => t.text).join(". ");
624
729
  const tags = inferTags(expertText(e) + ". " + tweetText, vocabulary);
625
730
  for (const tag of tags) {
626
731
  insertEdge.run(eId, node("tag", tag, tag), "HAS_TAG", 1);
@@ -674,11 +779,11 @@ class ExpertsDB {
674
779
  lastSeen: r.last_seen || ""
675
780
  }));
676
781
  }
677
- expertFromNodeKey(key2) {
678
- const idx = key2.indexOf(":");
782
+ expertFromNodeKey(key3) {
783
+ const idx = key3.indexOf(":");
679
784
  if (idx < 0)
680
785
  return null;
681
- return this.get(key2.slice(idx + 1), key2.slice(0, idx));
786
+ return this.get(key3.slice(idx + 1), key3.slice(0, idx));
682
787
  }
683
788
  findByNeeds(needs, opts = {}) {
684
789
  const cleaned = needs.map((n) => n.trim().toLowerCase()).filter(Boolean);
@@ -805,6 +910,15 @@ class ExpertsDB {
805
910
  };
806
911
  }
807
912
  replaceTweets(source, sourceId, tweets) {
913
+ const norm = (t) => (t || "").toLowerCase().replace(/^rt @\w+:\s*/, "").replace(/https?:\/\/\S+/g, "").replace(/[^a-z0-9 ]/g, "").replace(/\s+/g, " ").trim();
914
+ const seen = new Set;
915
+ const deduped = tweets.filter((t) => {
916
+ const k = norm(t.text);
917
+ if (!k || seen.has(k))
918
+ return false;
919
+ seen.add(k);
920
+ return true;
921
+ });
808
922
  const tx = this.db.transaction((rows) => {
809
923
  this.db.query("DELETE FROM tweets WHERE source = ? AND source_id = ?").run(source, sourceId);
810
924
  const stmt = this.db.query(`
@@ -817,7 +931,7 @@ class ExpertsDB {
817
931
  stmt.run(t.source, t.sourceId, t.tweetId, t.text, t.createdAt, t.retweetCount, t.replyCount, t.likeCount, t.quoteCount, t.impressionCount, t.isRetweet ? 1 : 0, t.isReply ? 1 : 0);
818
932
  }
819
933
  });
820
- tx(tweets);
934
+ tx(deduped);
821
935
  }
822
936
  recentTweets(source, sourceId, limit = 10) {
823
937
  const rows = this.db.query("SELECT * FROM tweets WHERE source = ? AND source_id = ? ORDER BY created_at DESC LIMIT ?").all(source, sourceId, limit);
@@ -935,17 +1049,24 @@ class ExpertsDB {
935
1049
  const log = opts.onLog ?? (() => {});
936
1050
  const experts = this.list({ source: opts.source });
937
1051
  const batch = opts.batch ?? 64;
938
- const stmt = this.db.query("INSERT OR REPLACE INTO vectors (source, source_id, embedder, dim, vec) VALUES (?, ?, ?, ?, ?)");
1052
+ const stmt = this.db.query("INSERT OR REPLACE INTO vectors (source, source_id, embedder, dim, vec, text_hash) VALUES (?, ?, ?, ?, ?, ?)");
1053
+ const existing = new Map(this.db.query("SELECT source, source_id, embedder, text_hash FROM vectors").all().map((r) => [`${r.source}:${r.source_id}`, { embedder: r.embedder, hash: r.text_hash || "" }]));
1054
+ const work = experts.map((e) => ({ e, text: expertEmbedText(e), hash: String(Bun.hash(expertEmbedText(e))) })).filter(({ e, hash }) => {
1055
+ if (opts.force)
1056
+ return true;
1057
+ const prev = existing.get(`${e.source}:${e.sourceId}`);
1058
+ return !prev || prev.embedder !== embedder.id || prev.hash !== hash;
1059
+ });
939
1060
  let done = 0;
940
- for (let i = 0;i < experts.length; i += batch) {
941
- const slice = experts.slice(i, i + batch);
942
- const vecs = await embedder.embed(slice.map((e) => expertEmbedText(e)));
1061
+ for (let i = 0;i < work.length; i += batch) {
1062
+ const slice = work.slice(i, i + batch);
1063
+ const vecs = await embedder.embed(slice.map((w) => w.text));
943
1064
  const tx = this.db.transaction(() => {
944
- slice.forEach((e, j) => stmt.run(e.source, e.sourceId, embedder.id, embedder.dim, packVector(vecs[j])));
1065
+ slice.forEach((w, j) => stmt.run(w.e.source, w.e.sourceId, embedder.id, embedder.dim, packVector(vecs[j]), w.hash));
945
1066
  });
946
1067
  tx();
947
1068
  done += slice.length;
948
- log(` embedded ${done}/${experts.length}`);
1069
+ log(` embedded ${done}/${work.length} (${experts.length - work.length} unchanged)`);
949
1070
  }
950
1071
  this.setMeta("embedder", embedder.id);
951
1072
  this.setMeta("embedded_at", new Date().toISOString());
@@ -958,7 +1079,13 @@ class ExpertsDB {
958
1079
  const where = opts.source ? "WHERE v.source = ?" : "";
959
1080
  const params = opts.source ? [opts.source] : [];
960
1081
  const rows = this.db.query(`SELECT e.*, v.vec AS _vec FROM vectors v JOIN experts e ON e.source=v.source AND e.source_id=v.source_id ${where}`).all(...params);
961
- const scored = rows.map((r) => ({ expert: this.rowToExpert(r), score: cosine(queryVec, unpackVector(r._vec)) }));
1082
+ const blend = opts.blend !== false;
1083
+ const scored = rows.map((r) => {
1084
+ const expert = this.rowToExpert(r);
1085
+ const semantic = cosine(queryVec, unpackVector(r._vec));
1086
+ const score = blend ? blendScore(semantic, expert.authority ?? 0) : semantic;
1087
+ return { expert, score, semantic };
1088
+ });
962
1089
  scored.sort((a, b) => b.score - a.score);
963
1090
  return scored.slice(0, opts.limit ?? 25);
964
1091
  }
@@ -1048,7 +1175,7 @@ class ExpertsDB {
1048
1175
  $source: c.source,
1049
1176
  $source_id: c.sourceId,
1050
1177
  $type: c.type,
1051
- $value: c.value,
1178
+ $value: maybeEncrypt(c.value),
1052
1179
  $label: c.label,
1053
1180
  $provider: c.provider,
1054
1181
  $confidence: c.confidence,
@@ -1058,7 +1185,7 @@ class ExpertsDB {
1058
1185
  });
1059
1186
  }
1060
1187
  setContactStatus(source, sourceId, type, value, status) {
1061
- this.db.query("UPDATE contacts SET status = ?, verified_at = ? WHERE source = ? AND source_id = ? AND type = ? AND value = ?").run(status, new Date().toISOString(), source, sourceId, type, value);
1188
+ this.db.query("UPDATE contacts SET status = ?, verified_at = ? WHERE source = ? AND source_id = ? AND type = ? AND value = ?").run(status, new Date().toISOString(), source, sourceId, type, maybeEncrypt(value));
1062
1189
  }
1063
1190
  contacts(source, sourceId) {
1064
1191
  const rows = this.db.query("SELECT * FROM contacts WHERE source = ? AND source_id = ? ORDER BY type, confidence DESC").all(source, sourceId);
@@ -1066,7 +1193,7 @@ class ExpertsDB {
1066
1193
  source: r.source,
1067
1194
  sourceId: r.source_id,
1068
1195
  type: r.type,
1069
- value: r.value,
1196
+ value: maybeDecrypt(r.value),
1070
1197
  label: r.label || "",
1071
1198
  provider: r.provider || "",
1072
1199
  confidence: r.confidence ?? 0,
@@ -1091,7 +1218,7 @@ class ExpertsDB {
1091
1218
  source: r.source,
1092
1219
  sourceId: r.source_id,
1093
1220
  type: r.type,
1094
- value: r.value,
1221
+ value: maybeDecrypt(r.value),
1095
1222
  label: r.label || "",
1096
1223
  provider: r.provider || "",
1097
1224
  confidence: r.confidence ?? 0,
@@ -1401,34 +1528,37 @@ async function fetchJson(url, fetchFn, init = {}) {
1401
1528
  }
1402
1529
 
1403
1530
  // src/sources/mentorcruise.ts
1531
/**
 * Reduce an HTML fragment to single-spaced plain text.
 *
 * Tags are replaced by a space. Common named entities and numeric character
 * references are decoded so text such as "R&amp;D" survives; any other entity
 * is replaced by a space. The result is plain text, not sanitized HTML — a
 * decoded "&lt;" yields a literal "<".
 *
 * @param {string|null|undefined} s - HTML input; falsy values yield "".
 * @returns {string} Trimmed text with whitespace runs collapsed.
 */
function stripHtml(s) {
  const named = new Map([["amp", "&"], ["lt", "<"], ["gt", ">"], ["quot", '"'], ["apos", "'"], ["nbsp", " "]]);
  const decode = (entity) => {
    const body = entity.slice(1, -1).toLowerCase();
    if (named.has(body))
      return named.get(body);
    const numeric = /^#(x[0-9a-f]+|\d+)$/.exec(body);
    if (!numeric)
      return " ";
    const cp = numeric[1].startsWith("x") ? parseInt(numeric[1].slice(1), 16) : parseInt(numeric[1], 10);
    // Control characters and out-of-range code points carry no readable text.
    return cp >= 32 && cp <= 1114111 ? String.fromCodePoint(cp) : " ";
  };
  return String(s || "").replace(/<[^>]+>/g, " ").replace(/&[a-z#0-9]+;/gi, decode).replace(/\s+/g, " ").trim();
}
1404
1534
  function normalizeMentor(m, crawledAt) {
1405
- const slug = m.slug || slugify(m.name || String(m.id ?? ""));
1535
+ const path = m.get_absolute_url || "";
1536
+ const slug = path.match(/\/mentor\/([^/]+)/)?.[1] || slugify(m.get_full_name || String(m.objectID ?? ""));
1406
1537
  const socials = {};
1407
1538
  if (m.twitter)
1408
1539
  socials.twitter = m.twitter.startsWith("http") ? m.twitter : `https://x.com/${m.twitter}`;
1409
1540
  if (m.linkedin)
1410
1541
  socials.linkedin = m.linkedin;
1542
+ const price = m.all_prices?.length ? Math.min(...m.all_prices) : Math.round(m.avg_price_per_call ?? 0);
1411
1543
  return makeExpert({
1412
1544
  source: "mentorcruise",
1413
- sourceId: String(m.id ?? slug),
1545
+ sourceId: String(m.objectID ?? slug),
1414
1546
  slug,
1415
- url: `https://mentorcruise.com/mentor/${slug}/`,
1416
- fullName: m.name ?? [m.first_name, m.last_name].filter(Boolean).join(" "),
1417
- firstName: m.first_name ?? "",
1418
- lastName: m.last_name ?? "",
1419
- title: m.job_title ?? "",
1420
- bio: m.bio ?? "",
1421
- avatar: m.avatar ?? m.photo ?? "",
1422
- price: m.price ?? 0,
1423
- priceCurrency: m.currency ?? "USD",
1424
- priceUnit: m.price ? "per month" : "",
1425
- rating: m.rating ?? 0,
1426
- ratingCount: m.reviews_count ?? 0,
1427
- verified: Boolean(m.verified),
1547
+ url: path ? `https://mentorcruise.com${path}` : `https://mentorcruise.com/mentor/${slug}/`,
1548
+ fullName: m.get_full_name ?? "",
1549
+ title: (m.cleaned_job_title ?? []).join(", "),
1550
+ bio: stripHtml(m.bio_formatted ?? ""),
1551
+ avatar: m.get_profile_picture ?? "",
1552
+ price,
1553
+ priceCurrency: "USD",
1554
+ priceUnit: price ? "per month" : "",
1555
+ rating: m.avg_rating_float_one_decimal ?? 0,
1556
+ ratingCount: m.number_of_reviews ?? 0,
1428
1557
  featured: Boolean(m.is_top_mentor),
1429
- topics: m.categories ?? [],
1430
- tags: m.skills ?? [],
1558
+ topics: m.get_industries ?? [],
1559
+ tags: m.get_skills ?? [],
1431
1560
  socials,
1561
+ extra: { company: m.company ?? "", location: m.get_location_display ?? "", avgPricePerCall: m.avg_price_per_call ?? 0 },
1432
1562
  crawledAt
1433
1563
  });
1434
1564
  }
@@ -1438,44 +1568,61 @@ class MentorCruiseSource {
1438
1568
  description = "MentorCruise \u2014 long-term mentorship from vetted mentors";
1439
1569
  website = "https://mentorcruise.com";
1440
1570
  fetchFn;
1441
- apiBase;
1571
+ appId;
1572
+ apiKey;
1573
+ index;
1442
1574
  pageSize;
1443
1575
  constructor(opts = {}) {
1444
1576
  this.fetchFn = opts.fetchFn ?? fetch;
1445
- this.apiBase = opts.apiBase ?? process.env.MENTORCRUISE_API_BASE ?? "https://mentorcruise.com/api";
1446
- this.pageSize = opts.pageSize ?? 50;
1577
+ this.appId = opts.appId ?? process.env.MENTORCRUISE_ALGOLIA_APP_ID ?? "YD3XA4V91L";
1578
+ this.apiKey = opts.apiKey ?? process.env.MENTORCRUISE_ALGOLIA_API_KEY ?? "454b55a2e50bc884225318d99b0dad1a";
1579
+ this.index = opts.index ?? process.env.MENTORCRUISE_ALGOLIA_INDEX ?? "MentorProfile_prod";
1580
+ this.pageSize = opts.pageSize ?? 200;
1447
1581
  }
1448
1582
  async crawl(opts = {}) {
1449
1583
  const log = opts.onLog ?? (() => {});
1450
1584
  const crawledAt = new Date().toISOString();
1585
+ const url = `https://${this.appId}-dsn.algolia.net/1/indexes/${this.index}/query`;
1451
1586
  const experts = [];
1452
1587
  const tags = new Set;
1453
- let offset = 0;
1454
- for (;; ) {
1455
- const data = await fetchJson(`${this.apiBase}/mentors/?limit=${this.pageSize}&offset=${offset}`, this.fetchFn);
1456
- const items = data?.results ?? data?.data ?? (Array.isArray(data) ? data : []);
1457
- if (!items.length)
1588
+ let page = 0;
1589
+ let pages = 1;
1590
+ while (page < pages) {
1591
+ let data;
1592
+ try {
1593
+ const res = await this.fetchFn(url, {
1594
+ method: "POST",
1595
+ headers: {
1596
+ "X-Algolia-Application-Id": this.appId,
1597
+ "X-Algolia-API-Key": this.apiKey,
1598
+ "Content-Type": "application/json"
1599
+ },
1600
+ body: JSON.stringify({ params: `hitsPerPage=${this.pageSize}&page=${page}` })
1601
+ });
1602
+ if (!res.ok)
1603
+ break;
1604
+ data = await res.json();
1605
+ } catch {
1458
1606
  break;
1459
- for (const m of items) {
1460
- const e = normalizeMentor(m, crawledAt);
1607
+ }
1608
+ pages = data.nbPages ?? 1;
1609
+ for (const hit of data.hits ?? []) {
1610
+ const e = normalizeMentor(hit, crawledAt);
1461
1611
  experts.push(e);
1462
1612
  for (const t of e.tags)
1463
1613
  tags.add(t);
1464
1614
  }
1465
- offset += items.length;
1466
- log(` mentorcruise: ${experts.length}`);
1615
+ log(` mentorcruise: ${experts.length}/${data.nbHits ?? "?"}`);
1616
+ page++;
1467
1617
  if (opts.max && experts.length >= opts.max)
1468
1618
  break;
1469
- if (items.length < this.pageSize)
1470
- break;
1471
1619
  }
1472
1620
  if (experts.length === 0) {
1473
- log("mentorcruise: no public listing reachable (set MENTORCRUISE_API_BASE or provide a fetchFn).");
1621
+ log("mentorcruise: Algolia returned nothing (set MENTORCRUISE_ALGOLIA_* or inject fetchFn).");
1474
1622
  }
1475
- const topics = [];
1476
1623
  return {
1477
1624
  experts: opts.max ? experts.slice(0, opts.max) : experts,
1478
- topics,
1625
+ topics: [],
1479
1626
  tags: [...tags].map((name) => ({ name, topic: "" })),
1480
1627
  total: experts.length
1481
1628
  };
@@ -1696,6 +1843,9 @@ async function crawlSource(db, sourceName, opts = {}) {
1696
1843
  throw new Error(`Unknown source "${sourceName}". Run \`experts sources\` to list options.`);
1697
1844
  }
1698
1845
  const data = await source.crawl(opts);
1846
+ if (data.experts.length === 0 && db.count(source.name) > 0) {
1847
+ opts.onLog?.(`\u26A0 ${source.name} returned 0 experts but ${db.count(source.name)} are stored \u2014 possible API drift; not overwriting.`);
1848
+ }
1699
1849
  const changes = db.recordChanges(source.name, data.experts);
1700
1850
  db.upsertExperts(data.experts);
1701
1851
  if (data.topics.length)
@@ -2496,10 +2646,68 @@ function normalizePhone(v) {
2496
2646
  return "+" + cleaned.slice(1).replace(/\D/g, "");
2497
2647
  return cleaned;
2498
2648
  }
2649
/**
 * Heuristic filter for placeholder phone numbers.
 *
 * Works on the digits of the normalized number and flags: one digit repeated
 * throughout, the two literal counting sequences, and a 2-3 digit group
 * repeated three or more times.
 *
 * @param {string} v - Raw phone value.
 * @returns {boolean} true when the number looks like a placeholder.
 */
function looksFakePhone(v) {
  const digits = normalizePhone(v).replace(/\D/g, "");
  const countingSequences = ["1234567890", "0123456789"];
  const repeatPatterns = [/^(\d)\1+$/, /^(\d{2,3})\1\1+$/];
  return countingSequences.includes(digits) || repeatPatterns.some((pattern) => pattern.test(digits));
}
2499
2659
  function validatePhone(v) {
2500
2660
  const n = normalizePhone(v);
2501
2661
  const digits = n.replace(/\D/g, "");
2502
- return digits.length >= 7 && digits.length <= 15 ? "valid" : "invalid";
2662
+ if (digits.length < 7 || digits.length > 15)
2663
+ return "invalid";
2664
+ if (looksFakePhone(v))
2665
+ return "invalid";
2666
+ return "valid";
2667
+ }
2668
/**
 * Map the numeric reply to an SMTP `RCPT TO` command onto a contact status.
 *
 * @param {number} code - Three-digit SMTP reply code.
 * @returns {"valid"|"invalid"|"unknown"} "valid" for 250/251, "invalid" for
 *   550/551/553/554, "unknown" for every other value.
 */
function interpretRcptCode(code) {
  switch (code) {
    case 250:
    case 251:
      return "valid";
    case 550:
    case 551:
    case 553:
    case 554:
      return "invalid";
    default:
      return "unknown";
  }
}
2675
/**
 * Ask a mail server whether it would accept mail for `email`, without sending
 * any: greeting, HELO, MAIL FROM, RCPT TO, then QUIT.
 *
 * Replies are parsed line by line. A reply is complete only at a line of the
 * form "ddd<space>text"; "ddd-text" lines are continuations. This keeps the
 * dialogue in step when a reply is multi-line or arrives in several chunks.
 *
 * @param {string} email - Address to probe. Values containing whitespace,
 *   control characters or angle brackets are not sent (they could smuggle
 *   extra SMTP commands) and yield "unknown".
 * @param {string} mxHost - Mail exchanger host to connect to.
 * @param {{port?: number, timeoutMs?: number}} [opts] - Overrides; defaults are
 *   port 25 and a 7000 ms socket idle timeout.
 * @returns {Promise<"valid"|"invalid"|"unknown">} Verdict from the RCPT reply;
 *   "unknown" on timeout, socket error, early close or an unexpected reply.
 *   Never rejects.
 */
async function smtpProbe(email, mxHost, opts = {}) {
  const port = opts.port ?? 25;
  const timeoutMs = opts.timeoutMs ?? 7000;
  if (typeof email !== "string" || email === "" || /[\s<>\u0000-\u001f\u007f]/.test(email))
    return "unknown";
  const net = await import("net");
  return new Promise((resolve) => {
    // 0 = awaiting greeting, 1 = HELO sent, 2 = MAIL sent, 3 = RCPT sent.
    let step = 0;
    let settled = false;
    let pending = "";
    const sock = net.createConnection({ host: mxHost, port, timeout: timeoutMs });
    const done = (result, graceful = false) => {
      if (settled)
        return;
      settled = true;
      try {
        if (graceful && sock.writable)
          sock.end("QUIT\r\n");
        else
          sock.destroy();
      } catch {
        // Teardown is best-effort; the verdict is already decided.
      }
      resolve(result);
    };
    const onReply = (code) => {
      if (step === 3) {
        done(interpretRcptCode(code), true);
        return;
      }
      // A refused greeting, HELO or MAIL says nothing about the mailbox.
      if (code !== (step === 0 ? 220 : 250)) {
        done("unknown", true);
        return;
      }
      const commands = ["HELO open-experts.local", "MAIL FROM:<probe@open-experts.local>", `RCPT TO:<${email}>`];
      sock.write(`${commands[step]}\r\n`);
      step += 1;
    };
    sock.on("timeout", () => {
      done("unknown");
      // The timeout event does not close the socket by itself.
      sock.destroy();
    });
    sock.on("error", () => done("unknown"));
    // A server that hangs up without an error must still settle the promise.
    sock.on("close", () => done("unknown"));
    sock.on("data", (chunk) => {
      pending += chunk.toString();
      // Guard against a peer streaming data that never contains a line break.
      if (pending.length > 65536) {
        done("unknown");
        return;
      }
      let newline = pending.indexOf("\n");
      while (!settled && newline !== -1) {
        const line = pending.slice(0, newline);
        pending = pending.slice(newline + 1);
        const reply = /^(\d{3})([ -]?)/.exec(line);
        if (!reply) {
          done("unknown", true);
          return;
        }
        if (reply[2] !== "-")
          onReply(Number(reply[1]));
        newline = pending.indexOf("\n");
      }
    });
  });
}
2504
2712
  async function validateEmail(email, resolver = resolveMx) {
2505
2713
  const e = normalizeEmail(email);
@@ -2669,12 +2877,25 @@ async function verifyContacts(db, opts = {}) {
2669
2877
  const delayMs = opts.delayMs ?? 50;
2670
2878
  const targets = db.contactsToVerify({ source: opts.source, limit: opts.max });
2671
2879
  const res = { checked: 0, valid: 0, invalid: 0, unknown: 0 };
2880
+ const resolver = opts.resolver;
2881
+ const prober = opts.prober ?? smtpProbe;
2672
2882
  for (const c of targets) {
2673
2883
  let status;
2674
- if (c.type === "email")
2675
- status = await validateEmail(c.value, opts.resolver);
2676
- else
2884
+ if (c.type === "email") {
2885
+ status = await validateEmail(c.value, resolver);
2886
+ if (opts.smtp && status === "valid") {
2887
+ try {
2888
+ const mx = await (resolver ?? (await import("dns/promises")).resolveMx)(c.value.split("@")[1]);
2889
+ if (mx[0]?.exchange) {
2890
+ const probed = await prober(c.value, mx[0].exchange);
2891
+ if (probed !== "unknown")
2892
+ status = probed;
2893
+ }
2894
+ } catch {}
2895
+ }
2896
+ } else {
2677
2897
  status = validatePhone(c.value);
2898
+ }
2678
2899
  db.setContactStatus(c.source, c.sourceId, c.type, c.value, status);
2679
2900
  res.checked++;
2680
2901
  res[status]++;
@@ -2751,7 +2972,7 @@ async function syncContacts(db, idsOrSlugs, opts = {}) {
2751
2972
  }
2752
2973
 
2753
2974
  // src/cli/index.ts
2754
- var VERSION = "0.0.7";
2975
+ var VERSION = "0.0.8";
2755
2976
  function openDb() {
2756
2977
  const opts = program.opts();
2757
2978
  return new ExpertsDB(opts.db || defaultDbPath());
@@ -2768,6 +2989,15 @@ function requireData(db) {
2768
2989
  }
2769
2990
  var program = new Command;
2770
2991
  program.name("experts").description("Crawl and query expert marketplaces (intro.co and more).").version(VERSION, "-v, --version").option("--db <path>", "database file", undefined).option("--json", "output JSON where applicable", false);
2992
+ program.addHelpText("after", `
2993
+ Command groups:
2994
+ Query who \xB7 ask \xB7 brief \xB7 find \xB7 search \xB7 related \xB7 show \xB7 list
2995
+ Ingest crawl \xB7 enrich \xB7 enrich-youtube \xB7 enrich-linkedin \xB7 enrich-sites \xB7 avatars
2996
+ Contacts find-contacts \xB7 verify-contacts \xB7 contacts \xB7 sync-contacts
2997
+ Insight topics \xB7 tags \xB7 stats \xB7 graph \xB7 persons \xB7 changes \xB7 stale
2998
+ Admin embed \xB7 reindex \xB7 export \xB7 sources
2999
+
3000
+ Start with: experts who "someone who can help me scale a B2B AI product"`);
2771
3001
  program.command("crawl [source]").description("Fetch experts from a source into the local store").option("--max <n>", "limit number of experts (quick runs)", (v) => parseInt(v, 10)).option("--no-topics", "skip topic-membership resolution (fewer requests)").option("--delay <ms>", "min delay between requests", (v) => parseInt(v, 10), 500).action(async (source, cmdOpts) => {
2772
3002
  const db = openDb();
2773
3003
  const name = source || "intro";
@@ -2944,12 +3174,13 @@ program.command("find-contacts [source]").description("Discover email + phone fo
2944
3174
  console.log(chalk2.dim("Next: `experts verify-contacts` to set working/not status."));
2945
3175
  db.close();
2946
3176
  });
2947
- program.command("verify-contacts [source]").description("Validate discovered contacts and set working/not status").option("--max <n>", "limit contacts checked", (v) => parseInt(v, 10)).action(async (source, cmdOpts) => {
3177
+ program.command("verify-contacts [source]").description("Validate discovered contacts and set working/not status").option("--max <n>", "limit contacts checked", (v) => parseInt(v, 10)).option("--smtp", "also SMTP-probe MX-valid emails (needs outbound port 25)").action(async (source, cmdOpts) => {
2948
3178
  const db = openDb();
2949
3179
  requireData(db);
2950
3180
  const res = await verifyContacts(db, {
2951
3181
  source,
2952
3182
  max: cmdOpts.max,
3183
+ smtp: cmdOpts.smtp,
2953
3184
  onLog: (m) => process.stderr.write(chalk2.dim(m + `
2954
3185
  `))
2955
3186
  });
@@ -3066,14 +3297,15 @@ ${rows.length} experts`));
3066
3297
  }
3067
3298
  db.close();
3068
3299
  });
3069
- program.command("embed [source]").description("Build the semantic search index (embeddings) for stored experts").action(async (source) => {
3300
+ program.command("embed [source]").description("Build the semantic search index (incremental \u2014 only new/changed experts)").option("--force", "re-embed every expert even if unchanged").action(async (source, cmdOpts) => {
3070
3301
  const db = openDb();
3071
3302
  requireData(db);
3072
- const embedder = getEmbedder();
3303
+ const embedder = await getEmbedder();
3073
3304
  console.error(chalk2.dim(`Embedding with ${embedder.id}\u2026`));
3074
3305
  const n = await db.buildEmbeddings(embedder, {
3075
3306
  source,
3076
- onLog: (m) => process.stderr.write(chalk2.dim(`\r${m}`.padEnd(40)))
3307
+ force: cmdOpts.force,
3308
+ onLog: (m) => process.stderr.write(chalk2.dim(`\r${m}`.padEnd(50)))
3077
3309
  });
3078
3310
  process.stderr.write(`
3079
3311
  `);
@@ -3087,18 +3319,62 @@ program.command("ask <query...>").description("Natural-language semantic search:
3087
3319
  console.error(chalk2.yellow("No semantic index yet. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
3088
3320
  process.exit(1);
3089
3321
  }
3090
- const embedder = getEmbedder();
3322
+ const embedder = await getEmbedder();
3323
+ const storedBy = db.getMeta("embedder");
3324
+ if (storedBy && storedBy !== embedder.id) {
3325
+ console.error(chalk2.yellow(`\u26A0 index built with "${storedBy}" but active embedder is "${embedder.id}" \u2014 run \`experts embed\` to re-index.`));
3326
+ }
3091
3327
  const [qv] = await embedder.embed([query.join(" ")]);
3092
- const results = db.semanticSearch(qv, { source: cmdOpts.source, limit: cmdOpts.limit });
3328
+ const q = query.join(" ");
3329
+ const results = db.semanticSearch(qv, { source: cmdOpts.source, limit: cmdOpts.limit }).map((r) => ({
3330
+ ...r,
3331
+ why: explainMatch(q, r.expert)
3332
+ }));
3093
3333
  if (program.opts().json) {
3094
3334
  console.log(JSON.stringify(results, null, 2));
3095
3335
  } else {
3096
3336
  const multi = new Set(results.map((r) => r.expert.source)).size > 1;
3097
- for (const { expert, score } of results) {
3337
+ for (const { expert, score, why } of results) {
3098
3338
  console.log(chalk2.dim(score.toFixed(3)) + " " + formatRow(expert, { showSource: multi }));
3339
+ if (why.length)
3340
+ console.log(` ${chalk2.dim("why: " + why.join(", "))}`);
3099
3341
  }
3100
3342
  console.log(chalk2.dim(`
3101
- ${results.length} matches for "${query.join(" ")}"`));
3343
+ ${results.length} matches for "${q}"`));
3344
+ }
3345
+ db.close();
3346
+ });
3347
// `who`: semantic search over the stored vectors, collapsed to one result per
// person id, each annotated with the topics/tags that literally appear in the query.
program.command("who <query...>").description("Smart search \u2014 the one command: semantic + authority + person-dedup + why").option("-s, --source <name>", "filter by source").option("-n, --limit <n>", "results", (v) => parseInt(v, 10), 10).action(async (query, cmdOpts) => {
  const db = openDb();
  requireData(db);
  if (db.vectorCount() === 0) {
    console.error(chalk2.yellow("No semantic index. Run ") + chalk2.bold("experts embed") + chalk2.yellow(" first."));
    process.exit(1);
  }
  // A non-numeric --limit parses to NaN, which would silently yield no results.
  const limit = Number.isInteger(cmdOpts.limit) && cmdOpts.limit > 0 ? cmdOpts.limit : 10;
  const q = query.join(" ");
  const embedder = await getEmbedder();
  // Same guard as `ask`: vectors from a different embedder are not comparable.
  const storedBy = db.getMeta("embedder");
  if (storedBy && storedBy !== embedder.id) {
    console.error(chalk2.yellow(`\u26A0 index built with "${storedBy}" but active embedder is "${embedder.id}" \u2014 run \`experts embed\` to re-index.`));
  }
  const [qv] = await embedder.embed([q]);
  // Over-fetch so person de-duplication can still fill `limit` slots.
  const raw = db.semanticSearch(qv, { source: cmdOpts.source, limit: (limit + 5) * 4 });
  const seen = new Set;
  const out = [];
  for (const r of raw) {
    const pid = db.personIdOf(r.expert.source, r.expert.sourceId);
    if (seen.has(pid))
      continue;
    seen.add(pid);
    out.push({ expert: r.expert, score: r.score, why: explainMatch(q, r.expert) });
    if (out.length >= limit)
      break;
  }
  if (program.opts().json) {
    console.log(JSON.stringify(out, null, 2));
  } else {
    const multi = new Set(out.map((r) => r.expert.source)).size > 1;
    out.forEach((r, i) => {
      console.log(`${chalk2.cyan(`${i + 1}.`)} ${formatRow(r.expert, { showSource: multi })}`);
      console.log(` ${chalk2.dim((r.why.length ? "why: " + r.why.join(", ") : "semantic match") + ` (${r.score.toFixed(3)})`)}`);
    });
    console.log(chalk2.dim(`
${out.length} experts for "${q}"`));
  }
  db.close();
});
@@ -3110,7 +3386,7 @@ program.command("brief <text...>").description("Paste a brief \u2192 ranked, de-
3110
3386
  process.exit(1);
3111
3387
  }
3112
3388
  const brief = text.join(" ");
3113
- const [qv] = await getEmbedder().embed([brief]);
3389
+ const [qv] = await (await getEmbedder()).embed([brief]);
3114
3390
  const raw = db.semanticSearch(qv, { source: cmdOpts.source, limit: (cmdOpts.limit + 5) * 4 });
3115
3391
  const seen = new Set;
3116
3392
  const shortlist = [];
@@ -3123,11 +3399,7 @@ program.command("brief <text...>").description("Paste a brief \u2192 ranked, de-
3123
3399
  if (shortlist.length >= cmdOpts.limit)
3124
3400
  break;
3125
3401
  }
3126
- const briefLc = brief.toLowerCase();
3127
- const annotated = shortlist.map((r) => ({
3128
- ...r,
3129
- why: r.expert.tags.filter((t) => briefLc.includes(t.toLowerCase())).slice(0, 4)
3130
- }));
3402
+ const annotated = shortlist.map((r) => ({ ...r, why: explainMatch(brief, r.expert) }));
3131
3403
  if (program.opts().json) {
3132
3404
  console.log(JSON.stringify(annotated, null, 2));
3133
3405
  } else {