@vespermcp/mcp-server 1.2.25 → 1.2.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +92 -0
- package/build/metadata/semantic-scholar-source.js +64 -11
- package/build/python/normalize_schema_engine.py +224 -0
- package/build/web/fusion-engine.js +35 -1
- package/build/web/web-core.js +2 -0
- package/package.json +1 -1
- package/src/python/normalize_schema_engine.py +224 -0
package/build/index.js
CHANGED
|
@@ -1540,6 +1540,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1540
1540
|
required: ["file_path", "target_format"],
|
|
1541
1541
|
},
|
|
1542
1542
|
},
|
|
1543
|
+
{
|
|
1544
|
+
name: "vesper_normalize_schema",
|
|
1545
|
+
description: "Normalize ragged JSON/JSONL rows into a schema-uniform JSONL (or JSON) by flattening metadata_json into stable columns (fills missing values with null). Useful before converting fused WebCore JSON to Parquet.",
|
|
1546
|
+
inputSchema: {
|
|
1547
|
+
type: "object",
|
|
1548
|
+
properties: {
|
|
1549
|
+
file_path: {
|
|
1550
|
+
type: "string",
|
|
1551
|
+
description: "Absolute path to the input file (.json or .jsonl). If it's a fused Vesper output JSON, tool will extract results[].",
|
|
1552
|
+
},
|
|
1553
|
+
output_format: {
|
|
1554
|
+
type: "string",
|
|
1555
|
+
enum: ["jsonl", "json"],
|
|
1556
|
+
description: "Output format for normalized rows. Default: jsonl.",
|
|
1557
|
+
},
|
|
1558
|
+
output_dir: {
|
|
1559
|
+
type: "string",
|
|
1560
|
+
description: "Directory to write normalized output. Default: ~/.vesper/data/normalized_schema",
|
|
1561
|
+
},
|
|
1562
|
+
flatten_metadata_json: {
|
|
1563
|
+
type: "boolean",
|
|
1564
|
+
description: "Flatten metadata_json into metadata__* columns. Default: true.",
|
|
1565
|
+
},
|
|
1566
|
+
max_keys: {
|
|
1567
|
+
type: "number",
|
|
1568
|
+
description: "Max number of metadata_json keys to materialize as columns. Extra keys go into metadata_json_blob (if extras_mode='blob'). Default: 200.",
|
|
1569
|
+
},
|
|
1570
|
+
extras_mode: {
|
|
1571
|
+
type: "string",
|
|
1572
|
+
enum: ["blob", "drop"],
|
|
1573
|
+
description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
|
|
1574
|
+
},
|
|
1575
|
+
},
|
|
1576
|
+
required: ["file_path"],
|
|
1577
|
+
},
|
|
1578
|
+
},
|
|
1543
1579
|
{
|
|
1544
1580
|
name: "fuse_datasets",
|
|
1545
1581
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -2637,6 +2673,62 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2637
2673
|
};
|
|
2638
2674
|
}
|
|
2639
2675
|
}
|
|
2676
|
+
case "vesper_normalize_schema": {
|
|
2677
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2678
|
+
const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
|
|
2679
|
+
const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
|
|
2680
|
+
const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
|
|
2681
|
+
const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
|
|
2682
|
+
const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
|
|
2683
|
+
if (!filePath) {
|
|
2684
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
2685
|
+
}
|
|
2686
|
+
if (!["jsonl", "json"].includes(outputFormat)) {
|
|
2687
|
+
throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
|
|
2688
|
+
}
|
|
2689
|
+
if (!fs.existsSync(filePath)) {
|
|
2690
|
+
return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
|
|
2691
|
+
}
|
|
2692
|
+
const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
|
|
2693
|
+
if (!fs.existsSync(outDir))
|
|
2694
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
2695
|
+
const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
|
|
2696
|
+
const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
|
|
2697
|
+
try {
|
|
2698
|
+
const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
|
|
2699
|
+
const options = {
|
|
2700
|
+
flatten_metadata_json: !!flattenMetadataJson,
|
|
2701
|
+
max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
|
|
2702
|
+
extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
|
|
2703
|
+
};
|
|
2704
|
+
const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
|
|
2705
|
+
if (!result.ok) {
|
|
2706
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
|
|
2707
|
+
}
|
|
2708
|
+
// Register normalized file to make follow-up conversion easier.
|
|
2709
|
+
try {
|
|
2710
|
+
const datasetId = path.basename(outputPath, path.extname(outputPath));
|
|
2711
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
2712
|
+
}
|
|
2713
|
+
catch (e) {
|
|
2714
|
+
console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
|
|
2715
|
+
}
|
|
2716
|
+
let msg = `**Schema normalization complete**\n`;
|
|
2717
|
+
msg += `- **Input**: ${filePath}\n`;
|
|
2718
|
+
msg += `- **Output**: ${result.output_path}\n`;
|
|
2719
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
|
|
2720
|
+
msg += `- **Columns**: ${result.columns}\n`;
|
|
2721
|
+
msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
|
|
2722
|
+
msg += `- **Extras mode**: ${result.extras_mode}\n`;
|
|
2723
|
+
if (result.extras_rows !== undefined)
|
|
2724
|
+
msg += `- **Rows with extras**: ${result.extras_rows}\n`;
|
|
2725
|
+
msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
|
|
2726
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2727
|
+
}
|
|
2728
|
+
catch (error) {
|
|
2729
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
|
|
2730
|
+
}
|
|
2731
|
+
}
|
|
2640
2732
|
case "fuse_datasets": {
|
|
2641
2733
|
const rawSources = request.params.arguments?.sources;
|
|
2642
2734
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
package/build/metadata/semantic-scholar-source.js
CHANGED
|
@@ -1,6 +1,19 @@
|
|
|
1
|
-
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
1
|
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
2
|
import { estimateQualityScore } from "./quality.js";
|
|
3
|
+
/**
 * Returns a promise that resolves (with no meaningful value) after `ms` milliseconds.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
6
|
+
/**
 * Converts a Retry-After header value into a wait time in milliseconds.
 *
 * - Falsy input (missing header, empty string) -> null.
 * - A non-negative finite number is treated as seconds and scaled to ms.
 * - Otherwise the value is parsed with Date.parse; the result is the time
 *   remaining until that instant, clamped to 0 for instants in the past.
 * - Anything Date.parse cannot read -> null.
 */
function parseRetryAfterMs(value) {
    if (!value) {
        return null;
    }
    const seconds = Number(value);
    const isDeltaSeconds = Number.isFinite(seconds) && seconds >= 0;
    if (isDeltaSeconds) {
        return seconds * 1000;
    }
    const retryAt = Date.parse(value);
    return Number.isFinite(retryAt) ? Math.max(0, retryAt - Date.now()) : null;
}
|
|
4
17
|
export class SemanticScholarSource {
|
|
5
18
|
cache;
|
|
6
19
|
breaker = new CircuitBreaker("semantic_scholar", {
|
|
@@ -19,12 +32,12 @@ export class SemanticScholarSource {
|
|
|
19
32
|
const start = Date.now();
|
|
20
33
|
const cleanQuery = String(query || "").trim();
|
|
21
34
|
if (!cleanQuery)
|
|
22
|
-
return { results: [], cacheHit: false, latencyMs: Date.now() - start };
|
|
35
|
+
return { results: [], cacheHit: false, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
|
|
23
36
|
const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
|
|
24
37
|
const cacheKey = `webcore:semantic_scholar:discover:${cleanQuery.toLowerCase()}:limit=${perPage}`;
|
|
25
38
|
const cached = await this.cache?.getJson(cacheKey);
|
|
26
39
|
if (cached)
|
|
27
|
-
return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
|
|
40
|
+
return { results: cached, cacheHit: true, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
|
|
28
41
|
if (!this.breaker.canAttempt()) {
|
|
29
42
|
throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
|
|
30
43
|
}
|
|
@@ -43,16 +56,32 @@ export class SemanticScholarSource {
|
|
|
43
56
|
"publicationTypes",
|
|
44
57
|
"openAccessPdf",
|
|
45
58
|
].join(","));
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
59
|
+
const headers = {
|
|
60
|
+
"Accept": "application/json",
|
|
61
|
+
"User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
|
|
62
|
+
};
|
|
63
|
+
// Handle S2 429s gracefully: respect Retry-After and avoid failing whole fusion runs.
|
|
64
|
+
const fetched = await this.fetchWith429Retry(url.toString(), headers).catch((e) => {
|
|
65
|
+
const msg = String(e?.message || e || "");
|
|
66
|
+
if (msg.includes("429")) {
|
|
67
|
+
this.breaker.onFailure();
|
|
68
|
+
return { response: null, rateLimited: true, attempts: 6 };
|
|
69
|
+
}
|
|
52
70
|
this.breaker.onFailure();
|
|
53
71
|
throw e;
|
|
54
72
|
});
|
|
55
|
-
|
|
73
|
+
if (!fetched?.response) {
|
|
74
|
+
// Cache short empty result to avoid immediate retry storms on repeated identical queries.
|
|
75
|
+
await this.cache?.setJson(cacheKey, [], 120);
|
|
76
|
+
return {
|
|
77
|
+
results: [],
|
|
78
|
+
cacheHit: false,
|
|
79
|
+
latencyMs: Date.now() - start,
|
|
80
|
+
rateLimited: !!fetched?.rateLimited,
|
|
81
|
+
rateLimitAttempts: fetched?.attempts ?? 0,
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
const data = await fetched.response.json().catch((e) => {
|
|
56
85
|
this.breaker.onFailure();
|
|
57
86
|
throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
|
|
58
87
|
});
|
|
@@ -60,7 +89,31 @@ export class SemanticScholarSource {
|
|
|
60
89
|
const result = papers.map((p) => this.toDatasetMetadata(p)).filter(Boolean);
|
|
61
90
|
this.breaker.onSuccess();
|
|
62
91
|
await this.cache?.setJson(cacheKey, result, 86400); // 24h
|
|
63
|
-
return {
|
|
92
|
+
return {
|
|
93
|
+
results: result,
|
|
94
|
+
cacheHit: false,
|
|
95
|
+
latencyMs: Date.now() - start,
|
|
96
|
+
rateLimited: false,
|
|
97
|
+
rateLimitAttempts: fetched.attempts,
|
|
98
|
+
};
|
|
99
|
+
}
|
|
100
|
+
async fetchWith429Retry(url, headers) {
|
|
101
|
+
const maxAttempts = 6;
|
|
102
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
103
|
+
const response = await fetch(url, { headers });
|
|
104
|
+
if (response.ok)
|
|
105
|
+
return { response, rateLimited: false, attempts: attempt };
|
|
106
|
+
if (response.status !== 429) {
|
|
107
|
+
const error = new Error(`HTTP error: ${response.status}`);
|
|
108
|
+
error.status = response.status;
|
|
109
|
+
throw error;
|
|
110
|
+
}
|
|
111
|
+
const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
|
|
112
|
+
const backoffMs = Math.min(30000, 1500 * Math.pow(2, attempt - 1));
|
|
113
|
+
const jitterMs = Math.floor(Math.random() * 400);
|
|
114
|
+
await sleep((retryAfterMs ?? backoffMs) + jitterMs);
|
|
115
|
+
}
|
|
116
|
+
return { response: null, rateLimited: true, attempts: maxAttempts };
|
|
64
117
|
}
|
|
65
118
|
toDatasetMetadata(paper) {
|
|
66
119
|
const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();
|
|
package/build/python/normalize_schema_engine.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
normalize_schema_engine.py <input_path> <output_path> [options_json]
|
|
6
|
+
|
|
7
|
+
options_json (optional):
|
|
8
|
+
{
|
|
9
|
+
"flatten_metadata_json": true,
|
|
10
|
+
"max_keys": 200,
|
|
11
|
+
"extras_mode": "blob" | "drop"
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
Outputs JSON:
|
|
15
|
+
{"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
|
|
16
|
+
or {"ok": false, "error": "..."}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_col(name: str) -> str:
|
|
27
|
+
s = str(name or "").strip()
|
|
28
|
+
s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
|
|
29
|
+
s = re.sub(r"_+", "_", s).strip("_")
|
|
30
|
+
if not s:
|
|
31
|
+
return "unknown"
|
|
32
|
+
if len(s) > 64:
|
|
33
|
+
s = s[:64]
|
|
34
|
+
return s
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _coerce_cell(v: Any) -> Any:
|
|
38
|
+
if v is None:
|
|
39
|
+
return None
|
|
40
|
+
if isinstance(v, (str, int, float, bool)):
|
|
41
|
+
return v
|
|
42
|
+
# Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
|
|
43
|
+
try:
|
|
44
|
+
return json.dumps(v, ensure_ascii=False)
|
|
45
|
+
except Exception:
|
|
46
|
+
return str(v)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _load_records(src: str) -> List[Dict[str, Any]]:
|
|
50
|
+
ext = os.path.splitext(src)[1].lower()
|
|
51
|
+
if ext in (".jsonl", ".ndjson"):
|
|
52
|
+
rows: List[Dict[str, Any]] = []
|
|
53
|
+
with open(src, "r", encoding="utf-8") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if not line:
|
|
57
|
+
continue
|
|
58
|
+
obj = json.loads(line)
|
|
59
|
+
if isinstance(obj, dict):
|
|
60
|
+
rows.append(obj)
|
|
61
|
+
return rows
|
|
62
|
+
|
|
63
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
64
|
+
if not raw:
|
|
65
|
+
return []
|
|
66
|
+
obj = json.loads(raw)
|
|
67
|
+
|
|
68
|
+
if isinstance(obj, list):
|
|
69
|
+
return [r for r in obj if isinstance(r, dict)]
|
|
70
|
+
if isinstance(obj, dict):
|
|
71
|
+
for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
|
|
72
|
+
v = obj.get(key)
|
|
73
|
+
if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
|
|
74
|
+
return [r for r in v if isinstance(r, dict)]
|
|
75
|
+
# Sometimes the dict itself is the record
|
|
76
|
+
return [obj]
|
|
77
|
+
|
|
78
|
+
return []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
|
|
82
|
+
keys: List[str] = []
|
|
83
|
+
seen = set()
|
|
84
|
+
for r in records:
|
|
85
|
+
mj = r.get("metadata_json")
|
|
86
|
+
if not isinstance(mj, dict):
|
|
87
|
+
continue
|
|
88
|
+
for k in mj.keys():
|
|
89
|
+
col = f"metadata__{_safe_col(k)}"
|
|
90
|
+
if col in seen:
|
|
91
|
+
continue
|
|
92
|
+
seen.add(col)
|
|
93
|
+
keys.append(col)
|
|
94
|
+
if len(keys) >= max_keys:
|
|
95
|
+
return keys
|
|
96
|
+
return keys
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_records(
|
|
100
|
+
records: List[Dict[str, Any]],
|
|
101
|
+
flat_keys: List[str],
|
|
102
|
+
extras_mode: str,
|
|
103
|
+
) -> Tuple[List[Dict[str, Any]], int]:
|
|
104
|
+
flat_set = set(flat_keys)
|
|
105
|
+
extras_count = 0
|
|
106
|
+
out: List[Dict[str, Any]] = []
|
|
107
|
+
|
|
108
|
+
for r in records:
|
|
109
|
+
base: Dict[str, Any] = {}
|
|
110
|
+
# Keep top-level stable fields as-is.
|
|
111
|
+
for k in (
|
|
112
|
+
"source_type",
|
|
113
|
+
"source_url",
|
|
114
|
+
"content",
|
|
115
|
+
"quality_score",
|
|
116
|
+
"collected_at",
|
|
117
|
+
"content_type",
|
|
118
|
+
):
|
|
119
|
+
if k in r:
|
|
120
|
+
base[k] = _coerce_cell(r.get(k))
|
|
121
|
+
|
|
122
|
+
# Preserve source_chain as a JSON string (it is nested).
|
|
123
|
+
if "source_chain" in r:
|
|
124
|
+
base["source_chain"] = _coerce_cell(r.get("source_chain"))
|
|
125
|
+
|
|
126
|
+
mj = r.get("metadata_json")
|
|
127
|
+
extras: Dict[str, Any] = {}
|
|
128
|
+
if isinstance(mj, dict):
|
|
129
|
+
for k, v in mj.items():
|
|
130
|
+
col = f"metadata__{_safe_col(k)}"
|
|
131
|
+
if col in flat_set:
|
|
132
|
+
base[col] = _coerce_cell(v)
|
|
133
|
+
else:
|
|
134
|
+
extras[k] = v
|
|
135
|
+
|
|
136
|
+
# Fill missing flattened keys with nulls for uniform schema.
|
|
137
|
+
for col in flat_keys:
|
|
138
|
+
if col not in base:
|
|
139
|
+
base[col] = None
|
|
140
|
+
|
|
141
|
+
if extras:
|
|
142
|
+
extras_count += 1
|
|
143
|
+
if extras_mode == "blob":
|
|
144
|
+
base["metadata_json_blob"] = _coerce_cell(extras)
|
|
145
|
+
# extras_mode == "drop" => ignore
|
|
146
|
+
else:
|
|
147
|
+
if extras_mode == "blob":
|
|
148
|
+
base["metadata_json_blob"] = None
|
|
149
|
+
|
|
150
|
+
out.append(base)
|
|
151
|
+
|
|
152
|
+
return out, extras_count
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main():
    """CLI entry point: normalize <input> into <output> and report as JSON.

    argv: ``<input_path> <output_path> [options_json]``. Prints exactly one JSON
    object to stdout: ``{"ok": true, ...stats}`` on success, or
    ``{"ok": false, "error": ...}`` followed by exit status 1 on failure.

    Option handling:
      - options_json that is unparseable or not a JSON object -> defaults.
      - flatten_metadata_json: enabled only when the value is exactly ``true``
        (default true).
      - max_keys: integer clamped to [0, 2000]; 0 disables flattening. Missing,
        null, boolean or unparseable values fall back to 200.
      - extras_mode: "blob" or "drop"; anything else falls back to "blob".

    The output format follows the output path extension (.jsonl/.ndjson or
    .json). Input with no records produces an empty .jsonl file or ``[]``.
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]") has no .get(); use defaults.
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True

    raw_max_keys = options.get("max_keys", 200)
    if raw_max_keys is None or isinstance(raw_max_keys, bool):
        max_keys = 200
    else:
        # An explicit 0 must survive (it disables flattening); `x or 200` would
        # silently turn it into 200.
        try:
            max_keys = int(raw_max_keys)
        except (TypeError, ValueError, OverflowError):
            max_keys = 200
    max_keys = max(0, min(2000, max_keys))

    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        ext = os.path.splitext(output_path)[1].lower()
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            # Also taken for empty input, so the file holds "[]" rather than
            # zero bytes (which is not valid JSON).
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        columns = len(normalized[0].keys()) if normalized else 0
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": columns,
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
224
|
+
|
|
package/build/web/fusion-engine.js
CHANGED
|
@@ -48,14 +48,45 @@ function titleTokens(doc) {
|
|
|
48
48
|
const raw = typeof mj.title === "string" ? mj.title : "";
|
|
49
49
|
return tokenize(raw);
|
|
50
50
|
}
|
|
51
|
+
/**
 * Tokenizes the descriptive hints of a document: the string-valued
 * title/name/description/abstract and the array-valued tags/topics from
 * `doc.metadata_json`, followed by `doc.source_url`. The pieces are joined
 * with single spaces and passed to `tokenize`.
 */
function semanticHintTokens(doc) {
    const meta = doc.metadata_json || {};
    const parts = [];
    for (const key of ["title", "name", "description", "abstract"]) {
        if (typeof meta[key] === "string") {
            parts.push(meta[key]);
        }
    }
    for (const key of ["tags", "topics"]) {
        if (Array.isArray(meta[key])) {
            parts.push(meta[key].join(" "));
        }
    }
    // Always contributes one entry, even when the URL is missing.
    parts.push(doc.source_url || "");
    return tokenize(parts.join(" "));
}
|
|
51
69
|
function isSuspiciousPair(a, b) {
|
|
52
70
|
// semantic fallback should be selective; do cheap prefilter first
|
|
71
|
+
// Metadata/topic overlap can indicate same object even with very different body lengths.
|
|
72
|
+
const aHints = semanticHintTokens(a);
|
|
73
|
+
const bHints = semanticHintTokens(b);
|
|
74
|
+
if (aHints.size > 0 && bHints.size > 0) {
|
|
75
|
+
let hInter = 0;
|
|
76
|
+
for (const t of aHints)
|
|
77
|
+
if (bHints.has(t))
|
|
78
|
+
hInter++;
|
|
79
|
+
const hUnion = aHints.size + bHints.size - hInter;
|
|
80
|
+
const hJaccard = hUnion > 0 ? hInter / hUnion : 0;
|
|
81
|
+
if (hJaccard >= 0.2)
|
|
82
|
+
return true;
|
|
83
|
+
}
|
|
53
84
|
const aLen = a.content.length;
|
|
54
85
|
const bLen = b.content.length;
|
|
55
86
|
const maxLen = Math.max(aLen, bLen, 1);
|
|
56
87
|
const lenRatio = Math.abs(aLen - bLen) / maxLen;
|
|
57
88
|
// Loosened again to allow abstract-vs-summary style comparisons.
|
|
58
|
-
if (lenRatio > 0.
|
|
89
|
+
if (lenRatio > 0.9)
|
|
59
90
|
return false;
|
|
60
91
|
// Fast path: same normalized title-like prefix often indicates same research object.
|
|
61
92
|
const aPrefix = a.content.slice(0, 140).toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
|
|
@@ -145,6 +176,7 @@ export class WebFusionEngine {
|
|
|
145
176
|
let docs = [];
|
|
146
177
|
let cacheHit = false;
|
|
147
178
|
let latencyMs = 0;
|
|
179
|
+
let rateLimited = false;
|
|
148
180
|
if (spec.type === "s3") {
|
|
149
181
|
const out = await this.collectFromS3(spec);
|
|
150
182
|
docs = out.docs;
|
|
@@ -163,6 +195,7 @@ export class WebFusionEngine {
|
|
|
163
195
|
const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
|
|
164
196
|
cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
|
|
165
197
|
latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
|
|
198
|
+
rateLimited = perSrcTel ? !!perSrcTel.rate_limited : false;
|
|
166
199
|
}
|
|
167
200
|
const filtered = spec.min_stars !== undefined
|
|
168
201
|
? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
|
|
@@ -173,6 +206,7 @@ export class WebFusionEngine {
|
|
|
173
206
|
cache_hit: cacheHit,
|
|
174
207
|
latency_ms: latencyMs || (Date.now() - start),
|
|
175
208
|
result_count: filtered.length,
|
|
209
|
+
...(spec.type === "s3" ? {} : { rate_limited: rateLimited }),
|
|
176
210
|
});
|
|
177
211
|
}
|
|
178
212
|
catch (e) {
|
package/build/web/web-core.js
CHANGED
|
@@ -87,6 +87,7 @@ export class WebCoreEngine {
|
|
|
87
87
|
cache_hit: out.cacheHit,
|
|
88
88
|
latency_ms: out.latencyMs || (Date.now() - t0),
|
|
89
89
|
result_count: docs.length,
|
|
90
|
+
rate_limited: !!out.rateLimited,
|
|
90
91
|
});
|
|
91
92
|
}
|
|
92
93
|
catch (e) {
|
|
@@ -96,6 +97,7 @@ export class WebCoreEngine {
|
|
|
96
97
|
latency_ms: Date.now() - t0,
|
|
97
98
|
result_count: 0,
|
|
98
99
|
error: e?.message || String(e),
|
|
100
|
+
rate_limited: String(e?.message || "").includes("429"),
|
|
99
101
|
});
|
|
100
102
|
}
|
|
101
103
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.25",
|
|
3
|
+
"version": "1.2.27",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
package/src/python/normalize_schema_engine.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
normalize_schema_engine.py <input_path> <output_path> [options_json]
|
|
6
|
+
|
|
7
|
+
options_json (optional):
|
|
8
|
+
{
|
|
9
|
+
"flatten_metadata_json": true,
|
|
10
|
+
"max_keys": 200,
|
|
11
|
+
"extras_mode": "blob" | "drop"
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
Outputs JSON:
|
|
15
|
+
{"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
|
|
16
|
+
or {"ok": false, "error": "..."}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_col(name: str) -> str:
|
|
27
|
+
s = str(name or "").strip()
|
|
28
|
+
s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
|
|
29
|
+
s = re.sub(r"_+", "_", s).strip("_")
|
|
30
|
+
if not s:
|
|
31
|
+
return "unknown"
|
|
32
|
+
if len(s) > 64:
|
|
33
|
+
s = s[:64]
|
|
34
|
+
return s
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _coerce_cell(v: Any) -> Any:
|
|
38
|
+
if v is None:
|
|
39
|
+
return None
|
|
40
|
+
if isinstance(v, (str, int, float, bool)):
|
|
41
|
+
return v
|
|
42
|
+
# Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
|
|
43
|
+
try:
|
|
44
|
+
return json.dumps(v, ensure_ascii=False)
|
|
45
|
+
except Exception:
|
|
46
|
+
return str(v)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _load_records(src: str) -> List[Dict[str, Any]]:
|
|
50
|
+
ext = os.path.splitext(src)[1].lower()
|
|
51
|
+
if ext in (".jsonl", ".ndjson"):
|
|
52
|
+
rows: List[Dict[str, Any]] = []
|
|
53
|
+
with open(src, "r", encoding="utf-8") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if not line:
|
|
57
|
+
continue
|
|
58
|
+
obj = json.loads(line)
|
|
59
|
+
if isinstance(obj, dict):
|
|
60
|
+
rows.append(obj)
|
|
61
|
+
return rows
|
|
62
|
+
|
|
63
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
64
|
+
if not raw:
|
|
65
|
+
return []
|
|
66
|
+
obj = json.loads(raw)
|
|
67
|
+
|
|
68
|
+
if isinstance(obj, list):
|
|
69
|
+
return [r for r in obj if isinstance(r, dict)]
|
|
70
|
+
if isinstance(obj, dict):
|
|
71
|
+
for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
|
|
72
|
+
v = obj.get(key)
|
|
73
|
+
if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
|
|
74
|
+
return [r for r in v if isinstance(r, dict)]
|
|
75
|
+
# Sometimes the dict itself is the record
|
|
76
|
+
return [obj]
|
|
77
|
+
|
|
78
|
+
return []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
|
|
82
|
+
keys: List[str] = []
|
|
83
|
+
seen = set()
|
|
84
|
+
for r in records:
|
|
85
|
+
mj = r.get("metadata_json")
|
|
86
|
+
if not isinstance(mj, dict):
|
|
87
|
+
continue
|
|
88
|
+
for k in mj.keys():
|
|
89
|
+
col = f"metadata__{_safe_col(k)}"
|
|
90
|
+
if col in seen:
|
|
91
|
+
continue
|
|
92
|
+
seen.add(col)
|
|
93
|
+
keys.append(col)
|
|
94
|
+
if len(keys) >= max_keys:
|
|
95
|
+
return keys
|
|
96
|
+
return keys
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_records(
|
|
100
|
+
records: List[Dict[str, Any]],
|
|
101
|
+
flat_keys: List[str],
|
|
102
|
+
extras_mode: str,
|
|
103
|
+
) -> Tuple[List[Dict[str, Any]], int]:
|
|
104
|
+
flat_set = set(flat_keys)
|
|
105
|
+
extras_count = 0
|
|
106
|
+
out: List[Dict[str, Any]] = []
|
|
107
|
+
|
|
108
|
+
for r in records:
|
|
109
|
+
base: Dict[str, Any] = {}
|
|
110
|
+
# Keep top-level stable fields as-is.
|
|
111
|
+
for k in (
|
|
112
|
+
"source_type",
|
|
113
|
+
"source_url",
|
|
114
|
+
"content",
|
|
115
|
+
"quality_score",
|
|
116
|
+
"collected_at",
|
|
117
|
+
"content_type",
|
|
118
|
+
):
|
|
119
|
+
if k in r:
|
|
120
|
+
base[k] = _coerce_cell(r.get(k))
|
|
121
|
+
|
|
122
|
+
# Preserve source_chain as a JSON string (it is nested).
|
|
123
|
+
if "source_chain" in r:
|
|
124
|
+
base["source_chain"] = _coerce_cell(r.get("source_chain"))
|
|
125
|
+
|
|
126
|
+
mj = r.get("metadata_json")
|
|
127
|
+
extras: Dict[str, Any] = {}
|
|
128
|
+
if isinstance(mj, dict):
|
|
129
|
+
for k, v in mj.items():
|
|
130
|
+
col = f"metadata__{_safe_col(k)}"
|
|
131
|
+
if col in flat_set:
|
|
132
|
+
base[col] = _coerce_cell(v)
|
|
133
|
+
else:
|
|
134
|
+
extras[k] = v
|
|
135
|
+
|
|
136
|
+
# Fill missing flattened keys with nulls for uniform schema.
|
|
137
|
+
for col in flat_keys:
|
|
138
|
+
if col not in base:
|
|
139
|
+
base[col] = None
|
|
140
|
+
|
|
141
|
+
if extras:
|
|
142
|
+
extras_count += 1
|
|
143
|
+
if extras_mode == "blob":
|
|
144
|
+
base["metadata_json_blob"] = _coerce_cell(extras)
|
|
145
|
+
# extras_mode == "drop" => ignore
|
|
146
|
+
else:
|
|
147
|
+
if extras_mode == "blob":
|
|
148
|
+
base["metadata_json_blob"] = None
|
|
149
|
+
|
|
150
|
+
out.append(base)
|
|
151
|
+
|
|
152
|
+
return out, extras_count
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main():
    """CLI entry point: normalize <input> into <output> and report as JSON.

    argv: ``<input_path> <output_path> [options_json]``. Prints exactly one JSON
    object to stdout: ``{"ok": true, ...stats}`` on success, or
    ``{"ok": false, "error": ...}`` followed by exit status 1 on failure.

    Option handling:
      - options_json that is unparseable or not a JSON object -> defaults.
      - flatten_metadata_json: enabled only when the value is exactly ``true``
        (default true).
      - max_keys: integer clamped to [0, 2000]; 0 disables flattening. Missing,
        null, boolean or unparseable values fall back to 200.
      - extras_mode: "blob" or "drop"; anything else falls back to "blob".

    The output format follows the output path extension (.jsonl/.ndjson or
    .json). Input with no records produces an empty .jsonl file or ``[]``.
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]") has no .get(); use defaults.
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True

    raw_max_keys = options.get("max_keys", 200)
    if raw_max_keys is None or isinstance(raw_max_keys, bool):
        max_keys = 200
    else:
        # An explicit 0 must survive (it disables flattening); `x or 200` would
        # silently turn it into 200.
        try:
            max_keys = int(raw_max_keys)
        except (TypeError, ValueError, OverflowError):
            max_keys = 200
    max_keys = max(0, min(2000, max_keys))

    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        ext = os.path.splitext(output_path)[1].lower()
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            # Also taken for empty input, so the file holds "[]" rather than
            # zero bytes (which is not valid JSON).
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        columns = len(normalized[0].keys()) if normalized else 0
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": columns,
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
224
|
+
|