@vespermcp/mcp-server 1.2.24 → 1.2.26

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -101,6 +101,18 @@ export class GithubSource {
101
101
  qualityWarnings.push("Low star count; may be low-signal");
102
102
  if (description.length < 80)
103
103
  qualityWarnings.push("Short description; relevance may be weaker");
104
+ const lowSignalPatterns = [
105
+ /\bawesome\b/i,
106
+ /\bresources?\b/i,
107
+ /\bcurated\b/i,
108
+ /\blist\b/i,
109
+ /\bcollection\b/i,
110
+ ];
111
+ const lowSignalText = `${fullName} ${description}`;
112
+ const looksResourceList = lowSignalPatterns.some((rx) => rx.test(lowSignalText));
113
+ if (looksResourceList) {
114
+ qualityWarnings.push("Repository appears to be a resource/list collection; relevance may be indirect.");
115
+ }
104
116
  const abstractLength = input.readmeText ? input.readmeText.length : description.length;
105
117
  const authorsPresent = !!owner;
106
118
  const datePresent = !!updatedAt;
@@ -111,6 +123,19 @@ export class GithubSource {
111
123
  datePresent,
112
124
  contentDepth,
113
125
  });
126
+ let adjustedQuality01 = quality01;
127
+ // Calibrate GitHub quality so resource-list repos don't dominate.
128
+ // Keep penalty moderate; readme-rich/long-form repos still score well.
129
+ if (looksResourceList) {
130
+ adjustedQuality01 -= 0.14;
131
+ }
132
+ if (!input.readmeText && description.length < 140) {
133
+ adjustedQuality01 -= 0.08;
134
+ }
135
+ if (stars < 50) {
136
+ adjustedQuality01 -= 0.04;
137
+ }
138
+ adjustedQuality01 = Math.max(0.3, Math.min(1.0, adjustedQuality01));
114
139
  return {
115
140
  id: fullName,
116
141
  source: "github",
@@ -133,7 +158,7 @@ export class GithubSource {
133
158
  usage_restrictions: [],
134
159
  warnings: [],
135
160
  },
136
- quality_score: Math.round(quality01 * 100),
161
+ quality_score: Math.round(adjustedQuality01 * 100),
137
162
  quality_warnings: qualityWarnings,
138
163
  download_url: String(repo.html_url || `https://github.com/${fullName}`),
139
164
  format: "GIT",
@@ -43,14 +43,70 @@ function tokenize(content) {
43
43
  .filter((w) => w.length >= 3);
44
44
  return new Set(words);
45
45
  }
46
+ function titleTokens(doc) {
47
+ const mj = doc.metadata_json || {};
48
+ const raw = typeof mj.title === "string" ? mj.title : "";
49
+ return tokenize(raw);
50
+ }
51
+ function semanticHintTokens(doc) {
52
+ const mj = doc.metadata_json || {};
53
+ const fields = [];
54
+ if (typeof mj.title === "string")
55
+ fields.push(mj.title);
56
+ if (typeof mj.name === "string")
57
+ fields.push(mj.name);
58
+ if (typeof mj.description === "string")
59
+ fields.push(mj.description);
60
+ if (typeof mj.abstract === "string")
61
+ fields.push(mj.abstract);
62
+ if (Array.isArray(mj.tags))
63
+ fields.push(mj.tags.join(" "));
64
+ if (Array.isArray(mj.topics))
65
+ fields.push(mj.topics.join(" "));
66
+ fields.push(doc.source_url || "");
67
+ return tokenize(fields.join(" "));
68
+ }
46
69
  function isSuspiciousPair(a, b) {
47
70
  // semantic fallback should be selective; do cheap prefilter first
71
+ // Metadata/topic overlap can indicate same object even with very different body lengths.
72
+ const aHints = semanticHintTokens(a);
73
+ const bHints = semanticHintTokens(b);
74
+ if (aHints.size > 0 && bHints.size > 0) {
75
+ let hInter = 0;
76
+ for (const t of aHints)
77
+ if (bHints.has(t))
78
+ hInter++;
79
+ const hUnion = aHints.size + bHints.size - hInter;
80
+ const hJaccard = hUnion > 0 ? hInter / hUnion : 0;
81
+ if (hJaccard >= 0.2)
82
+ return true;
83
+ }
48
84
  const aLen = a.content.length;
49
85
  const bLen = b.content.length;
50
86
  const maxLen = Math.max(aLen, bLen, 1);
51
87
  const lenRatio = Math.abs(aLen - bLen) / maxLen;
52
- if (lenRatio > 0.45)
88
+ // Loosened again to allow abstract-vs-summary style comparisons.
89
+ if (lenRatio > 0.9)
53
90
  return false;
91
+ // Fast path: same normalized title-like prefix often indicates same research object.
92
+ const aPrefix = a.content.slice(0, 140).toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
93
+ const bPrefix = b.content.slice(0, 140).toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
94
+ if (aPrefix && bPrefix && (aPrefix.includes(bPrefix) || bPrefix.includes(aPrefix))) {
95
+ return true;
96
+ }
97
+ // Cross-source papers often have close titles even if abstracts differ.
98
+ const aTitle = titleTokens(a);
99
+ const bTitle = titleTokens(b);
100
+ if (aTitle.size > 0 && bTitle.size > 0) {
101
+ let tInter = 0;
102
+ for (const t of aTitle)
103
+ if (bTitle.has(t))
104
+ tInter++;
105
+ const tUnion = aTitle.size + bTitle.size - tInter;
106
+ const tJaccard = tUnion > 0 ? tInter / tUnion : 0;
107
+ if (tJaccard >= 0.25)
108
+ return true;
109
+ }
54
110
  const aTokens = tokenize(a.content);
55
111
  const bTokens = tokenize(b.content);
56
112
  if (aTokens.size === 0 || bTokens.size === 0)
@@ -61,7 +117,8 @@ function isSuspiciousPair(a, b) {
61
117
  inter++;
62
118
  const union = aTokens.size + bTokens.size - inter;
63
119
  const jaccard = union > 0 ? inter / union : 0;
64
- return jaccard >= 0.18;
120
+ // Loosened from 0.12 -> 0.08 to let semantic stage inspect more borderline matches.
121
+ return jaccard >= 0.08;
65
122
  }
66
123
  function normalizeStars(doc) {
67
124
  const mj = doc.metadata_json || {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.24",
3
+ "version": "1.2.26",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",