@vespermcp/mcp-server 1.2.26 → 1.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1540,6 +1540,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1540
1540
  required: ["file_path", "target_format"],
1541
1541
  },
1542
1542
  },
1543
+ {
1544
+ name: "vesper_normalize_schema",
1545
+ description: "Normalize ragged JSON/JSONL rows into a schema-uniform JSONL (or JSON) by flattening metadata_json into stable columns (fills missing values with null). Useful before converting fused WebCore JSON to Parquet.",
1546
+ inputSchema: {
1547
+ type: "object",
1548
+ properties: {
1549
+ file_path: {
1550
+ type: "string",
1551
+ description: "Absolute path to the input file (.json or .jsonl). If it's a fused Vesper output JSON, tool will extract results[].",
1552
+ },
1553
+ output_format: {
1554
+ type: "string",
1555
+ enum: ["jsonl", "json"],
1556
+ description: "Output format for normalized rows. Default: jsonl.",
1557
+ },
1558
+ output_dir: {
1559
+ type: "string",
1560
+ description: "Directory to write normalized output. Default: ~/.vesper/data/normalized_schema",
1561
+ },
1562
+ flatten_metadata_json: {
1563
+ type: "boolean",
1564
+ description: "Flatten metadata_json into metadata__* columns. Default: true.",
1565
+ },
1566
+ max_keys: {
1567
+ type: "number",
1568
+ description: "Max number of metadata_json keys to materialize as columns. Extra keys go into metadata_json_blob (if extras_mode='blob'). Default: 200.",
1569
+ },
1570
+ extras_mode: {
1571
+ type: "string",
1572
+ enum: ["blob", "drop"],
1573
+ description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
1574
+ },
1575
+ },
1576
+ required: ["file_path"],
1577
+ },
1578
+ },
1543
1579
  {
1544
1580
  name: "fuse_datasets",
1545
1581
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -2637,6 +2673,62 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2637
2673
  };
2638
2674
  }
2639
2675
  }
2676
+ case "vesper_normalize_schema": {
2677
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2678
+ const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
2679
+ const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
2680
+ const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
2681
+ const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
2682
+ const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
2683
+ if (!filePath) {
2684
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
2685
+ }
2686
+ if (!["jsonl", "json"].includes(outputFormat)) {
2687
+ throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
2688
+ }
2689
+ if (!fs.existsSync(filePath)) {
2690
+ return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
2691
+ }
2692
+ const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
2693
+ if (!fs.existsSync(outDir))
2694
+ fs.mkdirSync(outDir, { recursive: true });
2695
+ const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
2696
+ const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
2697
+ try {
2698
+ const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
2699
+ const options = {
2700
+ flatten_metadata_json: !!flattenMetadataJson,
2701
+ max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
2702
+ extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
2703
+ };
2704
+ const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
2705
+ if (!result.ok) {
2706
+ return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
2707
+ }
2708
+ // Register normalized file to make follow-up conversion easier.
2709
+ try {
2710
+ const datasetId = path.basename(outputPath, path.extname(outputPath));
2711
+ upsertRegistry(datasetId, outputPath, "completed");
2712
+ }
2713
+ catch (e) {
2714
+ console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
2715
+ }
2716
+ let msg = `**Schema normalization complete**\n`;
2717
+ msg += `- **Input**: ${filePath}\n`;
2718
+ msg += `- **Output**: ${result.output_path}\n`;
2719
+ msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
2720
+ msg += `- **Columns**: ${result.columns}\n`;
2721
+ msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
2722
+ msg += `- **Extras mode**: ${result.extras_mode}\n`;
2723
+ if (result.extras_rows !== undefined)
2724
+ msg += `- **Rows with extras**: ${result.extras_rows}\n`;
2725
+ msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
2726
+ return { content: [{ type: "text", text: msg }] };
2727
+ }
2728
+ catch (error) {
2729
+ return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
2730
+ }
2731
+ }
2640
2732
  case "fuse_datasets": {
2641
2733
  const rawSources = request.params.arguments?.sources;
2642
2734
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -1,6 +1,19 @@
1
- import { rateLimitedFetch } from "./rate-limiter.js";
2
1
  import { CircuitBreaker } from "./circuit-breaker.js";
3
2
  import { estimateQualityScore } from "./quality.js";
3
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
6
/**
 * Convert a `Retry-After` header value into a wait time in milliseconds.
 *
 * Two forms are accepted: a non-negative number of seconds, or a date string
 * understood by `Date.parse`.
 *
 * @param {string|null|undefined} value - Raw header value.
 * @returns {number|null} Milliseconds to wait (never negative), or `null` when the
 *   value is missing, blank, or cannot be interpreted.
 */
function parseRetryAfterMs(value) {
    if (!value)
        return null;
    // Trim first: Number("   ") is 0, which would otherwise be read as "retry immediately".
    const text = String(value).trim();
    if (text === "")
        return null;
    const seconds = Number(text);
    if (!Number.isNaN(seconds)) {
        // The value is numeric, so it must not fall through to the lenient date parser:
        // negative or infinite delays are simply invalid.
        return Number.isFinite(seconds) && seconds >= 0 ? seconds * 1000 : null;
    }
    const timestamp = Date.parse(text);
    if (!Number.isFinite(timestamp))
        return null;
    // A date in the past means "retry now", not a negative wait.
    return Math.max(0, timestamp - Date.now());
}
4
17
  export class SemanticScholarSource {
5
18
  cache;
6
19
  breaker = new CircuitBreaker("semantic_scholar", {
@@ -19,12 +32,12 @@ export class SemanticScholarSource {
19
32
  const start = Date.now();
20
33
  const cleanQuery = String(query || "").trim();
21
34
  if (!cleanQuery)
22
- return { results: [], cacheHit: false, latencyMs: Date.now() - start };
35
+ return { results: [], cacheHit: false, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
23
36
  const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
24
37
  const cacheKey = `webcore:semantic_scholar:discover:${cleanQuery.toLowerCase()}:limit=${perPage}`;
25
38
  const cached = await this.cache?.getJson(cacheKey);
26
39
  if (cached)
27
- return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
40
+ return { results: cached, cacheHit: true, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
28
41
  if (!this.breaker.canAttempt()) {
29
42
  throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
30
43
  }
@@ -43,16 +56,32 @@ export class SemanticScholarSource {
43
56
  "publicationTypes",
44
57
  "openAccessPdf",
45
58
  ].join(","));
46
- const response = await rateLimitedFetch(url.toString(), {
47
- headers: {
48
- "Accept": "application/json",
49
- "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
50
- },
51
- }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 }).catch((e) => {
59
+ const headers = {
60
+ "Accept": "application/json",
61
+ "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
62
+ };
63
+ // Handle S2 429s gracefully: respect Retry-After and avoid failing whole fusion runs.
64
+ const fetched = await this.fetchWith429Retry(url.toString(), headers).catch((e) => {
65
+ const msg = String(e?.message || e || "");
66
+ if (msg.includes("429")) {
67
+ this.breaker.onFailure();
68
+ return { response: null, rateLimited: true, attempts: 6 };
69
+ }
52
70
  this.breaker.onFailure();
53
71
  throw e;
54
72
  });
55
- const data = await response.json().catch((e) => {
73
+ if (!fetched?.response) {
74
+ // Cache short empty result to avoid immediate retry storms on repeated identical queries.
75
+ await this.cache?.setJson(cacheKey, [], 120);
76
+ return {
77
+ results: [],
78
+ cacheHit: false,
79
+ latencyMs: Date.now() - start,
80
+ rateLimited: !!fetched?.rateLimited,
81
+ rateLimitAttempts: fetched?.attempts ?? 0,
82
+ };
83
+ }
84
+ const data = await fetched.response.json().catch((e) => {
56
85
  this.breaker.onFailure();
57
86
  throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
58
87
  });
@@ -60,7 +89,31 @@ export class SemanticScholarSource {
60
89
  const result = papers.map((p) => this.toDatasetMetadata(p)).filter(Boolean);
61
90
  this.breaker.onSuccess();
62
91
  await this.cache?.setJson(cacheKey, result, 86400); // 24h
63
- return { results: result, cacheHit: false, latencyMs: Date.now() - start };
92
+ return {
93
+ results: result,
94
+ cacheHit: false,
95
+ latencyMs: Date.now() - start,
96
+ rateLimited: false,
97
+ rateLimitAttempts: fetched.attempts,
98
+ };
99
+ }
100
+ async fetchWith429Retry(url, headers) {
101
+ const maxAttempts = 6;
102
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
103
+ const response = await fetch(url, { headers });
104
+ if (response.ok)
105
+ return { response, rateLimited: false, attempts: attempt };
106
+ if (response.status !== 429) {
107
+ const error = new Error(`HTTP error: ${response.status}`);
108
+ error.status = response.status;
109
+ throw error;
110
+ }
111
+ const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
112
+ const backoffMs = Math.min(30000, 1500 * Math.pow(2, attempt - 1));
113
+ const jitterMs = Math.floor(Math.random() * 400);
114
+ await sleep((retryAfterMs ?? backoffMs) + jitterMs);
115
+ }
116
+ return { response: null, rateLimited: true, attempts: maxAttempts };
64
117
  }
65
118
  toDatasetMetadata(paper) {
66
119
  const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();
@@ -0,0 +1,224 @@
1
+ """
2
+ Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
3
+
4
+ Usage:
5
+ normalize_schema_engine.py <input_path> <output_path> [options_json]
6
+
7
+ options_json (optional):
8
+ {
9
+ "flatten_metadata_json": true,
10
+ "max_keys": 200,
11
+ "extras_mode": "blob" | "drop"
12
+ }
13
+
14
+ Outputs JSON:
15
+ {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
16
+ or {"ok": false, "error": "..."}
17
+ """
18
+
19
+ import sys
20
+ import json
21
+ import os
22
+ import re
23
+ from typing import Any, Dict, List, Tuple
24
+
25
+
26
+ def _safe_col(name: str) -> str:
27
+ s = str(name or "").strip()
28
+ s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
29
+ s = re.sub(r"_+", "_", s).strip("_")
30
+ if not s:
31
+ return "unknown"
32
+ if len(s) > 64:
33
+ s = s[:64]
34
+ return s
35
+
36
+
37
+ def _coerce_cell(v: Any) -> Any:
38
+ if v is None:
39
+ return None
40
+ if isinstance(v, (str, int, float, bool)):
41
+ return v
42
+ # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
43
+ try:
44
+ return json.dumps(v, ensure_ascii=False)
45
+ except Exception:
46
+ return str(v)
47
+
48
+
49
+ def _load_records(src: str) -> List[Dict[str, Any]]:
50
+ ext = os.path.splitext(src)[1].lower()
51
+ if ext in (".jsonl", ".ndjson"):
52
+ rows: List[Dict[str, Any]] = []
53
+ with open(src, "r", encoding="utf-8") as f:
54
+ for line in f:
55
+ line = line.strip()
56
+ if not line:
57
+ continue
58
+ obj = json.loads(line)
59
+ if isinstance(obj, dict):
60
+ rows.append(obj)
61
+ return rows
62
+
63
+ raw = open(src, "r", encoding="utf-8").read().strip()
64
+ if not raw:
65
+ return []
66
+ obj = json.loads(raw)
67
+
68
+ if isinstance(obj, list):
69
+ return [r for r in obj if isinstance(r, dict)]
70
+ if isinstance(obj, dict):
71
+ for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
72
+ v = obj.get(key)
73
+ if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
74
+ return [r for r in v if isinstance(r, dict)]
75
+ # Sometimes the dict itself is the record
76
+ return [obj]
77
+
78
+ return []
79
+
80
+
81
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
    """Collect the ``metadata__*`` column names to materialize.

    Walks the records in order and, for each one whose ``metadata_json`` is a
    dict, maps every key through ``_safe_col``. Column names are returned in
    first-seen order without duplicates; collection stops as soon as the list
    holds ``max_keys`` entries (the size check runs after each append).
    """
    ordered: List[str] = []
    known = set()
    for record in records:
        metadata = record.get("metadata_json")
        if isinstance(metadata, dict):
            for raw_key in metadata:
                column = "metadata__" + _safe_col(raw_key)
                if column not in known:
                    known.add(column)
                    ordered.append(column)
                    if len(ordered) >= max_keys:
                        return ordered
    return ordered
97
+
98
+
99
def _normalize_records(
    records: List[Dict[str, Any]],
    flat_keys: List[str],
    extras_mode: str,
) -> Tuple[List[Dict[str, Any]], int]:
    """Project each record onto the flattened column layout.

    For every record the output row contains:
      * the known top-level fields that are present in the record (including
        ``source_chain``), each passed through ``_coerce_cell``;
      * one entry per name in ``flat_keys``, filled from ``metadata_json`` when
        the sanitized key matches and ``None`` otherwise;
      * when ``extras_mode == "blob"``, a ``metadata_json_blob`` entry holding
        the JSON-encoded unmatched metadata keys, or ``None`` if there are none.

    Returns:
        ``(rows, rows_with_extras)`` where the second item counts records that
        had at least one metadata key outside ``flat_keys``.
    """
    passthrough_fields = (
        "source_type",
        "source_url",
        "content",
        "quality_score",
        "collected_at",
        "content_type",
        "source_chain",
    )
    allowed_columns = set(flat_keys)
    keep_blob = extras_mode == "blob"
    rows_with_extras = 0
    normalized: List[Dict[str, Any]] = []

    for record in records:
        row: Dict[str, Any] = {
            field: _coerce_cell(record[field])
            for field in passthrough_fields
            if field in record
        }

        metadata = record.get("metadata_json")
        leftovers: Dict[str, Any] = {}
        pairs = metadata.items() if isinstance(metadata, dict) else ()
        for raw_key, raw_value in pairs:
            column = "metadata__" + _safe_col(raw_key)
            if column in allowed_columns:
                row[column] = _coerce_cell(raw_value)
            else:
                leftovers[raw_key] = raw_value

        # Every row carries every flattened column, defaulting to None.
        for column in flat_keys:
            row.setdefault(column, None)

        if leftovers:
            rows_with_extras += 1
        if keep_blob:
            row["metadata_json_blob"] = _coerce_cell(leftovers) if leftovers else None

        normalized.append(row)

    return normalized, rows_with_extras
153
+
154
+
155
def main():
    """CLI entry point: ``normalize_schema_engine.py <input> <output> [options_json]``.

    Prints exactly one JSON object to stdout. On success it is
    ``{"ok": true, "output_path", "rows", "columns", "flattened_keys",
    "extras_mode", "extras_rows"}``; on failure it is ``{"ok": false, "error"}``
    and the process exits with status 1.

    Options (all optional, invalid values fall back to the default):
      * ``flatten_metadata_json`` - only a literal ``true`` enables flattening (default true).
      * ``max_keys`` - integer clamped to 0..2000 (default 200); 0 flattens nothing.
      * ``extras_mode`` - ``"blob"`` or ``"drop"`` (default ``"blob"``).
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]") has no .get(); treat it as "no options".
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True

    # Only a missing/None value means "use the default": an explicit 0 must stay 0.
    # Non-numeric, NaN or infinite values also fall back instead of crashing with a traceback.
    raw_max_keys = options.get("max_keys")
    try:
        max_keys = 200 if raw_max_keys is None else int(raw_max_keys)
    except (TypeError, ValueError, OverflowError):
        max_keys = 200
    max_keys = max(0, min(2000, max_keys))

    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)
        if not records:
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            # A zero-byte file is valid (empty) JSONL but not valid JSON, so a
            # .json target gets an empty array instead.
            empty_payload = "[]" if os.path.splitext(output_path)[1].lower() == ".json" else ""
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(empty_payload)
            print(json.dumps({
                "ok": True,
                "output_path": output_path,
                "rows": 0,
                "columns": 0,
                "flattened_keys": 0,
                "extras_mode": extras_mode,
                "extras_rows": 0,
            }))
            return

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        ext = os.path.splitext(output_path)[1].lower()
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        # Top-level fields are copied only when present, so rows can differ in
        # width; report the number of distinct columns across all rows.
        column_names = set()
        for r in normalized:
            column_names.update(r.keys())
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": len(column_names),
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
220
+
221
+
222
+ if __name__ == "__main__":
223
+ main()
224
+
@@ -176,6 +176,7 @@ export class WebFusionEngine {
176
176
  let docs = [];
177
177
  let cacheHit = false;
178
178
  let latencyMs = 0;
179
+ let rateLimited = false;
179
180
  if (spec.type === "s3") {
180
181
  const out = await this.collectFromS3(spec);
181
182
  docs = out.docs;
@@ -194,6 +195,7 @@ export class WebFusionEngine {
194
195
  const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
195
196
  cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
196
197
  latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
198
+ rateLimited = perSrcTel ? !!perSrcTel.rate_limited : false;
197
199
  }
198
200
  const filtered = spec.min_stars !== undefined
199
201
  ? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
@@ -204,6 +206,7 @@ export class WebFusionEngine {
204
206
  cache_hit: cacheHit,
205
207
  latency_ms: latencyMs || (Date.now() - start),
206
208
  result_count: filtered.length,
209
+ ...(spec.type === "s3" ? {} : { rate_limited: rateLimited }),
207
210
  });
208
211
  }
209
212
  catch (e) {
@@ -87,6 +87,7 @@ export class WebCoreEngine {
87
87
  cache_hit: out.cacheHit,
88
88
  latency_ms: out.latencyMs || (Date.now() - t0),
89
89
  result_count: docs.length,
90
+ rate_limited: !!out.rateLimited,
90
91
  });
91
92
  }
92
93
  catch (e) {
@@ -96,6 +97,7 @@ export class WebCoreEngine {
96
97
  latency_ms: Date.now() - t0,
97
98
  result_count: 0,
98
99
  error: e?.message || String(e),
100
+ rate_limited: String(e?.message || "").includes("429"),
99
101
  });
100
102
  }
101
103
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.26",
3
+ "version": "1.2.27",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -0,0 +1,224 @@
1
+ """
2
+ Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
3
+
4
+ Usage:
5
+ normalize_schema_engine.py <input_path> <output_path> [options_json]
6
+
7
+ options_json (optional):
8
+ {
9
+ "flatten_metadata_json": true,
10
+ "max_keys": 200,
11
+ "extras_mode": "blob" | "drop"
12
+ }
13
+
14
+ Outputs JSON:
15
+ {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
16
+ or {"ok": false, "error": "..."}
17
+ """
18
+
19
+ import sys
20
+ import json
21
+ import os
22
+ import re
23
+ from typing import Any, Dict, List, Tuple
24
+
25
+
26
+ def _safe_col(name: str) -> str:
27
+ s = str(name or "").strip()
28
+ s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
29
+ s = re.sub(r"_+", "_", s).strip("_")
30
+ if not s:
31
+ return "unknown"
32
+ if len(s) > 64:
33
+ s = s[:64]
34
+ return s
35
+
36
+
37
+ def _coerce_cell(v: Any) -> Any:
38
+ if v is None:
39
+ return None
40
+ if isinstance(v, (str, int, float, bool)):
41
+ return v
42
+ # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
43
+ try:
44
+ return json.dumps(v, ensure_ascii=False)
45
+ except Exception:
46
+ return str(v)
47
+
48
+
49
+ def _load_records(src: str) -> List[Dict[str, Any]]:
50
+ ext = os.path.splitext(src)[1].lower()
51
+ if ext in (".jsonl", ".ndjson"):
52
+ rows: List[Dict[str, Any]] = []
53
+ with open(src, "r", encoding="utf-8") as f:
54
+ for line in f:
55
+ line = line.strip()
56
+ if not line:
57
+ continue
58
+ obj = json.loads(line)
59
+ if isinstance(obj, dict):
60
+ rows.append(obj)
61
+ return rows
62
+
63
+ raw = open(src, "r", encoding="utf-8").read().strip()
64
+ if not raw:
65
+ return []
66
+ obj = json.loads(raw)
67
+
68
+ if isinstance(obj, list):
69
+ return [r for r in obj if isinstance(r, dict)]
70
+ if isinstance(obj, dict):
71
+ for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
72
+ v = obj.get(key)
73
+ if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
74
+ return [r for r in v if isinstance(r, dict)]
75
+ # Sometimes the dict itself is the record
76
+ return [obj]
77
+
78
+ return []
79
+
80
+
81
def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
    """Collect the ``metadata__*`` column names to materialize.

    Walks the records in order and, for each one whose ``metadata_json`` is a
    dict, maps every key through ``_safe_col``. Column names are returned in
    first-seen order without duplicates; collection stops as soon as the list
    holds ``max_keys`` entries (the size check runs after each append).
    """
    ordered: List[str] = []
    known = set()
    for record in records:
        metadata = record.get("metadata_json")
        if isinstance(metadata, dict):
            for raw_key in metadata:
                column = "metadata__" + _safe_col(raw_key)
                if column not in known:
                    known.add(column)
                    ordered.append(column)
                    if len(ordered) >= max_keys:
                        return ordered
    return ordered
97
+
98
+
99
def _normalize_records(
    records: List[Dict[str, Any]],
    flat_keys: List[str],
    extras_mode: str,
) -> Tuple[List[Dict[str, Any]], int]:
    """Project each record onto the flattened column layout.

    For every record the output row contains:
      * the known top-level fields that are present in the record (including
        ``source_chain``), each passed through ``_coerce_cell``;
      * one entry per name in ``flat_keys``, filled from ``metadata_json`` when
        the sanitized key matches and ``None`` otherwise;
      * when ``extras_mode == "blob"``, a ``metadata_json_blob`` entry holding
        the JSON-encoded unmatched metadata keys, or ``None`` if there are none.

    Returns:
        ``(rows, rows_with_extras)`` where the second item counts records that
        had at least one metadata key outside ``flat_keys``.
    """
    passthrough_fields = (
        "source_type",
        "source_url",
        "content",
        "quality_score",
        "collected_at",
        "content_type",
        "source_chain",
    )
    allowed_columns = set(flat_keys)
    keep_blob = extras_mode == "blob"
    rows_with_extras = 0
    normalized: List[Dict[str, Any]] = []

    for record in records:
        row: Dict[str, Any] = {
            field: _coerce_cell(record[field])
            for field in passthrough_fields
            if field in record
        }

        metadata = record.get("metadata_json")
        leftovers: Dict[str, Any] = {}
        pairs = metadata.items() if isinstance(metadata, dict) else ()
        for raw_key, raw_value in pairs:
            column = "metadata__" + _safe_col(raw_key)
            if column in allowed_columns:
                row[column] = _coerce_cell(raw_value)
            else:
                leftovers[raw_key] = raw_value

        # Every row carries every flattened column, defaulting to None.
        for column in flat_keys:
            row.setdefault(column, None)

        if leftovers:
            rows_with_extras += 1
        if keep_blob:
            row["metadata_json_blob"] = _coerce_cell(leftovers) if leftovers else None

        normalized.append(row)

    return normalized, rows_with_extras
153
+
154
+
155
def main():
    """CLI entry point: ``normalize_schema_engine.py <input> <output> [options_json]``.

    Prints exactly one JSON object to stdout. On success it is
    ``{"ok": true, "output_path", "rows", "columns", "flattened_keys",
    "extras_mode", "extras_rows"}``; on failure it is ``{"ok": false, "error"}``
    and the process exits with status 1.

    Options (all optional, invalid values fall back to the default):
      * ``flatten_metadata_json`` - only a literal ``true`` enables flattening (default true).
      * ``max_keys`` - integer clamped to 0..2000 (default 200); 0 flattens nothing.
      * ``extras_mode`` - ``"blob"`` or ``"drop"`` (default ``"blob"``).
    """
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        options = json.loads(options_raw) if options_raw else {}
    except Exception:
        options = {}
    # Valid JSON that is not an object (e.g. "[]") has no .get(); treat it as "no options".
    if not isinstance(options, dict):
        options = {}

    flatten_metadata_json = options.get("flatten_metadata_json", True) is True

    # Only a missing/None value means "use the default": an explicit 0 must stay 0.
    # Non-numeric, NaN or infinite values also fall back instead of crashing with a traceback.
    raw_max_keys = options.get("max_keys")
    try:
        max_keys = 200 if raw_max_keys is None else int(raw_max_keys)
    except (TypeError, ValueError, OverflowError):
        max_keys = 200
    max_keys = max(0, min(2000, max_keys))

    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
    if extras_mode not in ("blob", "drop"):
        extras_mode = "blob"

    try:
        records = _load_records(input_path)
        if not records:
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            # A zero-byte file is valid (empty) JSONL but not valid JSON, so a
            # .json target gets an empty array instead.
            empty_payload = "[]" if os.path.splitext(output_path)[1].lower() == ".json" else ""
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(empty_payload)
            print(json.dumps({
                "ok": True,
                "output_path": output_path,
                "rows": 0,
                "columns": 0,
                "flattened_keys": 0,
                "extras_mode": extras_mode,
                "extras_rows": 0,
            }))
            return

        flat_keys: List[str] = []
        if flatten_metadata_json and max_keys > 0:
            flat_keys = _gather_flat_keys(records, max_keys)

        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)

        ext = os.path.splitext(output_path)[1].lower()
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        if ext in (".jsonl", ".ndjson"):
            with open(output_path, "w", encoding="utf-8") as f:
                for r in normalized:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        elif ext == ".json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(normalized, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")

        # Top-level fields are copied only when present, so rows can differ in
        # width; report the number of distinct columns across all rows.
        column_names = set()
        for r in normalized:
            column_names.update(r.keys())
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": len(normalized),
            "columns": len(column_names),
            "flattened_keys": len(flat_keys),
            "extras_mode": extras_mode,
            "extras_rows": extras_rows,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
220
+
221
+
222
+ if __name__ == "__main__":
223
+ main()
224
+