@vespermcp/mcp-server 1.2.26 → 1.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -0
- package/build/index.js +904 -4
- package/build/metadata/semantic-scholar-source.js +64 -11
- package/build/python/normalize_schema_engine.py +224 -0
- package/build/web/fusion-engine.js +3 -0
- package/build/web/web-core.js +2 -0
- package/package.json +6 -1
- package/src/python/normalize_schema_engine.py +224 -0
|
@@ -1,6 +1,19 @@
|
|
|
1
|
-
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
1
|
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
2
|
import { estimateQualityScore } from "./quality.js";
|
|
3
|
+
/**
 * Pause asynchronously.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
6
|
+
/**
 * Convert a `Retry-After` header value into a delay in milliseconds.
 *
 * The value is first tried as a non-negative number of seconds and then as a
 * date string understood by `Date.parse`.
 *
 * @param {string | null | undefined} value - Raw header value.
 * @returns {number | null} Milliseconds to wait (never negative), or `null`
 *   when the value is missing or cannot be interpreted.
 */
function parseRetryAfterMs(value) {
    if (!value) {
        return null;
    }
    const seconds = Number(value);
    const isDeltaSeconds = Number.isFinite(seconds) && seconds >= 0;
    if (isDeltaSeconds) {
        return seconds * 1000;
    }
    // Not a plain number: treat it as an absolute date and measure from now.
    const retryAt = Date.parse(value);
    return Number.isFinite(retryAt) ? Math.max(0, retryAt - Date.now()) : null;
}
|
|
4
17
|
export class SemanticScholarSource {
|
|
5
18
|
cache;
|
|
6
19
|
breaker = new CircuitBreaker("semantic_scholar", {
|
|
@@ -19,12 +32,12 @@ export class SemanticScholarSource {
|
|
|
19
32
|
const start = Date.now();
|
|
20
33
|
const cleanQuery = String(query || "").trim();
|
|
21
34
|
if (!cleanQuery)
|
|
22
|
-
return { results: [], cacheHit: false, latencyMs: Date.now() - start };
|
|
35
|
+
return { results: [], cacheHit: false, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
|
|
23
36
|
const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
|
|
24
37
|
const cacheKey = `webcore:semantic_scholar:discover:${cleanQuery.toLowerCase()}:limit=${perPage}`;
|
|
25
38
|
const cached = await this.cache?.getJson(cacheKey);
|
|
26
39
|
if (cached)
|
|
27
|
-
return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
|
|
40
|
+
return { results: cached, cacheHit: true, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
|
|
28
41
|
if (!this.breaker.canAttempt()) {
|
|
29
42
|
throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
|
|
30
43
|
}
|
|
@@ -43,16 +56,32 @@ export class SemanticScholarSource {
|
|
|
43
56
|
"publicationTypes",
|
|
44
57
|
"openAccessPdf",
|
|
45
58
|
].join(","));
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
59
|
+
const headers = {
|
|
60
|
+
"Accept": "application/json",
|
|
61
|
+
"User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
|
|
62
|
+
};
|
|
63
|
+
// Handle S2 429s gracefully: respect Retry-After and avoid failing whole fusion runs.
|
|
64
|
+
const fetched = await this.fetchWith429Retry(url.toString(), headers).catch((e) => {
|
|
65
|
+
const msg = String(e?.message || e || "");
|
|
66
|
+
if (msg.includes("429")) {
|
|
67
|
+
this.breaker.onFailure();
|
|
68
|
+
return { response: null, rateLimited: true, attempts: 6 };
|
|
69
|
+
}
|
|
52
70
|
this.breaker.onFailure();
|
|
53
71
|
throw e;
|
|
54
72
|
});
|
|
55
|
-
|
|
73
|
+
if (!fetched?.response) {
|
|
74
|
+
// Cache short empty result to avoid immediate retry storms on repeated identical queries.
|
|
75
|
+
await this.cache?.setJson(cacheKey, [], 120);
|
|
76
|
+
return {
|
|
77
|
+
results: [],
|
|
78
|
+
cacheHit: false,
|
|
79
|
+
latencyMs: Date.now() - start,
|
|
80
|
+
rateLimited: !!fetched?.rateLimited,
|
|
81
|
+
rateLimitAttempts: fetched?.attempts ?? 0,
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
const data = await fetched.response.json().catch((e) => {
|
|
56
85
|
this.breaker.onFailure();
|
|
57
86
|
throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
|
|
58
87
|
});
|
|
@@ -60,7 +89,31 @@ export class SemanticScholarSource {
|
|
|
60
89
|
const result = papers.map((p) => this.toDatasetMetadata(p)).filter(Boolean);
|
|
61
90
|
this.breaker.onSuccess();
|
|
62
91
|
await this.cache?.setJson(cacheKey, result, 86400); // 24h
|
|
63
|
-
return {
|
|
92
|
+
return {
|
|
93
|
+
results: result,
|
|
94
|
+
cacheHit: false,
|
|
95
|
+
latencyMs: Date.now() - start,
|
|
96
|
+
rateLimited: false,
|
|
97
|
+
rateLimitAttempts: fetched.attempts,
|
|
98
|
+
};
|
|
99
|
+
}
|
|
100
|
+
async fetchWith429Retry(url, headers) {
|
|
101
|
+
const maxAttempts = 6;
|
|
102
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
103
|
+
const response = await fetch(url, { headers });
|
|
104
|
+
if (response.ok)
|
|
105
|
+
return { response, rateLimited: false, attempts: attempt };
|
|
106
|
+
if (response.status !== 429) {
|
|
107
|
+
const error = new Error(`HTTP error: ${response.status}`);
|
|
108
|
+
error.status = response.status;
|
|
109
|
+
throw error;
|
|
110
|
+
}
|
|
111
|
+
const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
|
|
112
|
+
const backoffMs = Math.min(30000, 1500 * Math.pow(2, attempt - 1));
|
|
113
|
+
const jitterMs = Math.floor(Math.random() * 400);
|
|
114
|
+
await sleep((retryAfterMs ?? backoffMs) + jitterMs);
|
|
115
|
+
}
|
|
116
|
+
return { response: null, rateLimited: true, attempts: maxAttempts };
|
|
64
117
|
}
|
|
65
118
|
toDatasetMetadata(paper) {
|
|
66
119
|
const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
normalize_schema_engine.py <input_path> <output_path> [options_json]
|
|
6
|
+
|
|
7
|
+
options_json (optional):
|
|
8
|
+
{
|
|
9
|
+
"flatten_metadata_json": true,
|
|
10
|
+
"max_keys": 200,
|
|
11
|
+
"extras_mode": "blob" | "drop"
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
Outputs JSON:
|
|
15
|
+
{"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "...", "extras_rows": R}
|
|
16
|
+
or {"ok": false, "error": "..."}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_col(name: str) -> str:
|
|
27
|
+
s = str(name or "").strip()
|
|
28
|
+
s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
|
|
29
|
+
s = re.sub(r"_+", "_", s).strip("_")
|
|
30
|
+
if not s:
|
|
31
|
+
return "unknown"
|
|
32
|
+
if len(s) > 64:
|
|
33
|
+
s = s[:64]
|
|
34
|
+
return s
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _coerce_cell(v: Any) -> Any:
|
|
38
|
+
if v is None:
|
|
39
|
+
return None
|
|
40
|
+
if isinstance(v, (str, int, float, bool)):
|
|
41
|
+
return v
|
|
42
|
+
# Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
|
|
43
|
+
try:
|
|
44
|
+
return json.dumps(v, ensure_ascii=False)
|
|
45
|
+
except Exception:
|
|
46
|
+
return str(v)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _load_records(src: str) -> List[Dict[str, Any]]:
|
|
50
|
+
ext = os.path.splitext(src)[1].lower()
|
|
51
|
+
if ext in (".jsonl", ".ndjson"):
|
|
52
|
+
rows: List[Dict[str, Any]] = []
|
|
53
|
+
with open(src, "r", encoding="utf-8") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if not line:
|
|
57
|
+
continue
|
|
58
|
+
obj = json.loads(line)
|
|
59
|
+
if isinstance(obj, dict):
|
|
60
|
+
rows.append(obj)
|
|
61
|
+
return rows
|
|
62
|
+
|
|
63
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
64
|
+
if not raw:
|
|
65
|
+
return []
|
|
66
|
+
obj = json.loads(raw)
|
|
67
|
+
|
|
68
|
+
if isinstance(obj, list):
|
|
69
|
+
return [r for r in obj if isinstance(r, dict)]
|
|
70
|
+
if isinstance(obj, dict):
|
|
71
|
+
for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
|
|
72
|
+
v = obj.get(key)
|
|
73
|
+
if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
|
|
74
|
+
return [r for r in v if isinstance(r, dict)]
|
|
75
|
+
# Sometimes the dict itself is the record
|
|
76
|
+
return [obj]
|
|
77
|
+
|
|
78
|
+
return []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
|
|
82
|
+
keys: List[str] = []
|
|
83
|
+
seen = set()
|
|
84
|
+
for r in records:
|
|
85
|
+
mj = r.get("metadata_json")
|
|
86
|
+
if not isinstance(mj, dict):
|
|
87
|
+
continue
|
|
88
|
+
for k in mj.keys():
|
|
89
|
+
col = f"metadata__{_safe_col(k)}"
|
|
90
|
+
if col in seen:
|
|
91
|
+
continue
|
|
92
|
+
seen.add(col)
|
|
93
|
+
keys.append(col)
|
|
94
|
+
if len(keys) >= max_keys:
|
|
95
|
+
return keys
|
|
96
|
+
return keys
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_records(
|
|
100
|
+
records: List[Dict[str, Any]],
|
|
101
|
+
flat_keys: List[str],
|
|
102
|
+
extras_mode: str,
|
|
103
|
+
) -> Tuple[List[Dict[str, Any]], int]:
|
|
104
|
+
flat_set = set(flat_keys)
|
|
105
|
+
extras_count = 0
|
|
106
|
+
out: List[Dict[str, Any]] = []
|
|
107
|
+
|
|
108
|
+
for r in records:
|
|
109
|
+
base: Dict[str, Any] = {}
|
|
110
|
+
# Keep top-level stable fields as-is.
|
|
111
|
+
for k in (
|
|
112
|
+
"source_type",
|
|
113
|
+
"source_url",
|
|
114
|
+
"content",
|
|
115
|
+
"quality_score",
|
|
116
|
+
"collected_at",
|
|
117
|
+
"content_type",
|
|
118
|
+
):
|
|
119
|
+
if k in r:
|
|
120
|
+
base[k] = _coerce_cell(r.get(k))
|
|
121
|
+
|
|
122
|
+
# Preserve source_chain as a JSON string (it is nested).
|
|
123
|
+
if "source_chain" in r:
|
|
124
|
+
base["source_chain"] = _coerce_cell(r.get("source_chain"))
|
|
125
|
+
|
|
126
|
+
mj = r.get("metadata_json")
|
|
127
|
+
extras: Dict[str, Any] = {}
|
|
128
|
+
if isinstance(mj, dict):
|
|
129
|
+
for k, v in mj.items():
|
|
130
|
+
col = f"metadata__{_safe_col(k)}"
|
|
131
|
+
if col in flat_set:
|
|
132
|
+
base[col] = _coerce_cell(v)
|
|
133
|
+
else:
|
|
134
|
+
extras[k] = v
|
|
135
|
+
|
|
136
|
+
# Fill missing flattened keys with nulls for uniform schema.
|
|
137
|
+
for col in flat_keys:
|
|
138
|
+
if col not in base:
|
|
139
|
+
base[col] = None
|
|
140
|
+
|
|
141
|
+
if extras:
|
|
142
|
+
extras_count += 1
|
|
143
|
+
if extras_mode == "blob":
|
|
144
|
+
base["metadata_json_blob"] = _coerce_cell(extras)
|
|
145
|
+
# extras_mode == "drop" => ignore
|
|
146
|
+
else:
|
|
147
|
+
if extras_mode == "blob":
|
|
148
|
+
base["metadata_json_blob"] = None
|
|
149
|
+
|
|
150
|
+
out.append(base)
|
|
151
|
+
|
|
152
|
+
return out, extras_count
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main():
    """Command-line entry point.

    Reads ``<input_path> <output_path> [options_json]`` from ``sys.argv``,
    normalizes the input records and writes them to ``output_path`` as JSON
    Lines (``.jsonl``/``.ndjson``) or a JSON array (``.json``).

    Exactly one JSON object is printed to stdout: ``{"ok": true, ...}`` with
    row/column statistics on success, or ``{"ok": false, "error": ...}``
    followed by exit status 1 on failure. Malformed or non-object
    ``options_json`` and unusable ``max_keys`` values fall back to the
    defaults instead of aborting.
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]" or "3") has no .get();
    # treat it like unparseable options rather than crashing.
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
    try:
        max_keys = int(options.get("max_keys", 200) or 200)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric max_keys: use the default so the JSON output contract
        # is kept instead of dying with a traceback.
        max_keys = 200
    max_keys = max(0, min(2000, max_keys))
    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)
        ext = os.path.splitext(output_path)[1].lower()
        if not records:
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                # A zero-byte file is not valid JSON; emit an empty array for
                # .json targets so the output stays parseable.
                f.write("[]" if ext == ".json" else "")
            print(json.dumps({
                "ok": True,
                "output_path": output_path,
                "rows": 0,
                "columns": 0,
                "flattened_keys": 0,
                "extras_mode": extras_mode,
                "extras_rows": 0,
            }))
            return

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        # Column count is taken from the first normalized row.
        columns = len(normalized[0].keys()) if normalized else 0
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": columns,
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
if __name__ == "__main__":
|
|
223
|
+
main()
|
|
224
|
+
|
|
@@ -176,6 +176,7 @@ export class WebFusionEngine {
|
|
|
176
176
|
let docs = [];
|
|
177
177
|
let cacheHit = false;
|
|
178
178
|
let latencyMs = 0;
|
|
179
|
+
let rateLimited = false;
|
|
179
180
|
if (spec.type === "s3") {
|
|
180
181
|
const out = await this.collectFromS3(spec);
|
|
181
182
|
docs = out.docs;
|
|
@@ -194,6 +195,7 @@ export class WebFusionEngine {
|
|
|
194
195
|
const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
|
|
195
196
|
cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
|
|
196
197
|
latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
|
|
198
|
+
rateLimited = perSrcTel ? !!perSrcTel.rate_limited : false;
|
|
197
199
|
}
|
|
198
200
|
const filtered = spec.min_stars !== undefined
|
|
199
201
|
? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
|
|
@@ -204,6 +206,7 @@ export class WebFusionEngine {
|
|
|
204
206
|
cache_hit: cacheHit,
|
|
205
207
|
latency_ms: latencyMs || (Date.now() - start),
|
|
206
208
|
result_count: filtered.length,
|
|
209
|
+
...(spec.type === "s3" ? {} : { rate_limited: rateLimited }),
|
|
207
210
|
});
|
|
208
211
|
}
|
|
209
212
|
catch (e) {
|
package/build/web/web-core.js
CHANGED
|
@@ -87,6 +87,7 @@ export class WebCoreEngine {
|
|
|
87
87
|
cache_hit: out.cacheHit,
|
|
88
88
|
latency_ms: out.latencyMs || (Date.now() - t0),
|
|
89
89
|
result_count: docs.length,
|
|
90
|
+
rate_limited: !!out.rateLimited,
|
|
90
91
|
});
|
|
91
92
|
}
|
|
92
93
|
catch (e) {
|
|
@@ -96,6 +97,7 @@ export class WebCoreEngine {
|
|
|
96
97
|
latency_ms: Date.now() - t0,
|
|
97
98
|
result_count: 0,
|
|
98
99
|
error: e?.message || String(e),
|
|
100
|
+
rate_limited: String(e?.message || "").includes("429"),
|
|
99
101
|
});
|
|
100
102
|
}
|
|
101
103
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.26",
|
|
3
|
+
"version": "1.2.28",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
"setup": "node build/index.js --setup",
|
|
38
38
|
"setup:silent": "node build/index.js --setup --silent",
|
|
39
39
|
"refresh-index": "node scripts/refresh-index.cjs",
|
|
40
|
+
"telemetry:receiver": "tsx telemetry/lineage-receiver.ts",
|
|
40
41
|
"test": "vitest",
|
|
41
42
|
"start": "node build/index.js"
|
|
42
43
|
},
|
|
@@ -79,9 +80,13 @@
|
|
|
79
80
|
"ajv": "^8.17.1",
|
|
80
81
|
"ajv-formats": "^3.0.1",
|
|
81
82
|
"better-sqlite3": "^12.6.0",
|
|
83
|
+
"chalk": "^5.6.2",
|
|
84
|
+
"cli-table3": "^0.6.5",
|
|
85
|
+
"express": "^5.1.0",
|
|
82
86
|
"inquirer": "^13.3.0",
|
|
83
87
|
"lodash": "^4.17.21",
|
|
84
88
|
"pdf-parse": "^2.4.5",
|
|
89
|
+
"pg": "^8.16.3",
|
|
85
90
|
"uuid": "^13.0.0",
|
|
86
91
|
"zod": "^4.3.5",
|
|
87
92
|
"zod-to-json-schema": "^3.25.1"
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
normalize_schema_engine.py <input_path> <output_path> [options_json]
|
|
6
|
+
|
|
7
|
+
options_json (optional):
|
|
8
|
+
{
|
|
9
|
+
"flatten_metadata_json": true,
|
|
10
|
+
"max_keys": 200,
|
|
11
|
+
"extras_mode": "blob" | "drop"
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
Outputs JSON:
|
|
15
|
+
{"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "...", "extras_rows": R}
|
|
16
|
+
or {"ok": false, "error": "..."}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_col(name: str) -> str:
|
|
27
|
+
s = str(name or "").strip()
|
|
28
|
+
s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
|
|
29
|
+
s = re.sub(r"_+", "_", s).strip("_")
|
|
30
|
+
if not s:
|
|
31
|
+
return "unknown"
|
|
32
|
+
if len(s) > 64:
|
|
33
|
+
s = s[:64]
|
|
34
|
+
return s
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _coerce_cell(v: Any) -> Any:
|
|
38
|
+
if v is None:
|
|
39
|
+
return None
|
|
40
|
+
if isinstance(v, (str, int, float, bool)):
|
|
41
|
+
return v
|
|
42
|
+
# Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
|
|
43
|
+
try:
|
|
44
|
+
return json.dumps(v, ensure_ascii=False)
|
|
45
|
+
except Exception:
|
|
46
|
+
return str(v)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _load_records(src: str) -> List[Dict[str, Any]]:
|
|
50
|
+
ext = os.path.splitext(src)[1].lower()
|
|
51
|
+
if ext in (".jsonl", ".ndjson"):
|
|
52
|
+
rows: List[Dict[str, Any]] = []
|
|
53
|
+
with open(src, "r", encoding="utf-8") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if not line:
|
|
57
|
+
continue
|
|
58
|
+
obj = json.loads(line)
|
|
59
|
+
if isinstance(obj, dict):
|
|
60
|
+
rows.append(obj)
|
|
61
|
+
return rows
|
|
62
|
+
|
|
63
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
64
|
+
if not raw:
|
|
65
|
+
return []
|
|
66
|
+
obj = json.loads(raw)
|
|
67
|
+
|
|
68
|
+
if isinstance(obj, list):
|
|
69
|
+
return [r for r in obj if isinstance(r, dict)]
|
|
70
|
+
if isinstance(obj, dict):
|
|
71
|
+
for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
|
|
72
|
+
v = obj.get(key)
|
|
73
|
+
if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
|
|
74
|
+
return [r for r in v if isinstance(r, dict)]
|
|
75
|
+
# Sometimes the dict itself is the record
|
|
76
|
+
return [obj]
|
|
77
|
+
|
|
78
|
+
return []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
|
|
82
|
+
keys: List[str] = []
|
|
83
|
+
seen = set()
|
|
84
|
+
for r in records:
|
|
85
|
+
mj = r.get("metadata_json")
|
|
86
|
+
if not isinstance(mj, dict):
|
|
87
|
+
continue
|
|
88
|
+
for k in mj.keys():
|
|
89
|
+
col = f"metadata__{_safe_col(k)}"
|
|
90
|
+
if col in seen:
|
|
91
|
+
continue
|
|
92
|
+
seen.add(col)
|
|
93
|
+
keys.append(col)
|
|
94
|
+
if len(keys) >= max_keys:
|
|
95
|
+
return keys
|
|
96
|
+
return keys
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_records(
|
|
100
|
+
records: List[Dict[str, Any]],
|
|
101
|
+
flat_keys: List[str],
|
|
102
|
+
extras_mode: str,
|
|
103
|
+
) -> Tuple[List[Dict[str, Any]], int]:
|
|
104
|
+
flat_set = set(flat_keys)
|
|
105
|
+
extras_count = 0
|
|
106
|
+
out: List[Dict[str, Any]] = []
|
|
107
|
+
|
|
108
|
+
for r in records:
|
|
109
|
+
base: Dict[str, Any] = {}
|
|
110
|
+
# Keep top-level stable fields as-is.
|
|
111
|
+
for k in (
|
|
112
|
+
"source_type",
|
|
113
|
+
"source_url",
|
|
114
|
+
"content",
|
|
115
|
+
"quality_score",
|
|
116
|
+
"collected_at",
|
|
117
|
+
"content_type",
|
|
118
|
+
):
|
|
119
|
+
if k in r:
|
|
120
|
+
base[k] = _coerce_cell(r.get(k))
|
|
121
|
+
|
|
122
|
+
# Preserve source_chain as a JSON string (it is nested).
|
|
123
|
+
if "source_chain" in r:
|
|
124
|
+
base["source_chain"] = _coerce_cell(r.get("source_chain"))
|
|
125
|
+
|
|
126
|
+
mj = r.get("metadata_json")
|
|
127
|
+
extras: Dict[str, Any] = {}
|
|
128
|
+
if isinstance(mj, dict):
|
|
129
|
+
for k, v in mj.items():
|
|
130
|
+
col = f"metadata__{_safe_col(k)}"
|
|
131
|
+
if col in flat_set:
|
|
132
|
+
base[col] = _coerce_cell(v)
|
|
133
|
+
else:
|
|
134
|
+
extras[k] = v
|
|
135
|
+
|
|
136
|
+
# Fill missing flattened keys with nulls for uniform schema.
|
|
137
|
+
for col in flat_keys:
|
|
138
|
+
if col not in base:
|
|
139
|
+
base[col] = None
|
|
140
|
+
|
|
141
|
+
if extras:
|
|
142
|
+
extras_count += 1
|
|
143
|
+
if extras_mode == "blob":
|
|
144
|
+
base["metadata_json_blob"] = _coerce_cell(extras)
|
|
145
|
+
# extras_mode == "drop" => ignore
|
|
146
|
+
else:
|
|
147
|
+
if extras_mode == "blob":
|
|
148
|
+
base["metadata_json_blob"] = None
|
|
149
|
+
|
|
150
|
+
out.append(base)
|
|
151
|
+
|
|
152
|
+
return out, extras_count
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main():
    """Command-line entry point.

    Reads ``<input_path> <output_path> [options_json]`` from ``sys.argv``,
    normalizes the input records and writes them to ``output_path`` as JSON
    Lines (``.jsonl``/``.ndjson``) or a JSON array (``.json``).

    Exactly one JSON object is printed to stdout: ``{"ok": true, ...}`` with
    row/column statistics on success, or ``{"ok": false, "error": ...}``
    followed by exit status 1 on failure. Malformed or non-object
    ``options_json`` and unusable ``max_keys`` values fall back to the
    defaults instead of aborting.
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]" or "3") has no .get();
    # treat it like unparseable options rather than crashing.
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
    try:
        max_keys = int(options.get("max_keys", 200) or 200)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric max_keys: use the default so the JSON output contract
        # is kept instead of dying with a traceback.
        max_keys = 200
    max_keys = max(0, min(2000, max_keys))
    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)
        ext = os.path.splitext(output_path)[1].lower()
        if not records:
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                # A zero-byte file is not valid JSON; emit an empty array for
                # .json targets so the output stays parseable.
                f.write("[]" if ext == ".json" else "")
            print(json.dumps({
                "ok": True,
                "output_path": output_path,
                "rows": 0,
                "columns": 0,
                "flattened_keys": 0,
                "extras_mode": extras_mode,
                "extras_rows": 0,
            }))
            return

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        # Column count is taken from the first normalized row.
        columns = len(normalized[0].keys()) if normalized else 0
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": columns,
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
if __name__ == "__main__":
|
|
223
|
+
main()
|
|
224
|
+
|