npm - @vespermcp/mcp-server - Versions diffs - 1.2.25 → 1.2.27 - Mend

@vespermcp/mcp-server 1.2.25 → 1.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/build/index.js +92 -0
package/build/metadata/semantic-scholar-source.js +64 -11
package/build/python/normalize_schema_engine.py +224 -0
package/build/web/fusion-engine.js +35 -1
package/build/web/web-core.js +2 -0
package/package.json +1 -1
package/src/python/normalize_schema_engine.py +224 -0

package/build/index.js CHANGED Viewed

@@ -1540,6 +1540,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                     required: ["file_path", "target_format"],
                 },
             },
+            {
+                name: "vesper_normalize_schema",
+                description: "Normalize ragged JSON/JSONL rows into a schema-uniform JSONL (or JSON) by flattening metadata_json into stable columns (fills missing values with null). Useful before converting fused WebCore JSON to Parquet.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        file_path: {
+                            type: "string",
+                            description: "Absolute path to the input file (.json or .jsonl). If it's a fused Vesper output JSON, tool will extract results[].",
+                        },
+                        output_format: {
+                            type: "string",
+                            enum: ["jsonl", "json"],
+                            description: "Output format for normalized rows. Default: jsonl.",
+                        },
+                        output_dir: {
+                            type: "string",
+                            description: "Directory to write normalized output. Default: ~/.vesper/data/normalized_schema",
+                        },
+                        flatten_metadata_json: {
+                            type: "boolean",
+                            description: "Flatten metadata_json into metadata__* columns. Default: true.",
+                        },
+                        max_keys: {
+                            type: "number",
+                            description: "Max number of metadata_json keys to materialize as columns. Extra keys go into metadata_json_blob (if extras_mode='blob'). Default: 200.",
+                        },
+                        extras_mode: {
+                            type: "string",
+                            enum: ["blob", "drop"],
+                            description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
+                        },
+                    },
+                    required: ["file_path"],
+                },
+            },
             {
                 name: "fuse_datasets",
                 description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -2637,6 +2673,62 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     };
                 }
             }
+            // Tool handler for vesper_normalize_schema: validates the arguments, runs
+            // normalize_schema_engine.py through runPythonJson, registers the output file,
+            // and returns a markdown summary of the run.
+            case "vesper_normalize_schema": {
+                const filePath = String(request.params.arguments?.file_path || "").trim();
+                const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
+                const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
+                // Anything other than a literal `false` (including an omitted value) enables flattening.
+                const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
+                const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
+                const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
+                // Bad arguments are protocol errors (thrown); a missing input file is reported
+                // as a tool result with isError set.
+                if (!filePath) {
+                    throw new McpError(ErrorCode.InvalidParams, "file_path is required");
+                }
+                if (!["jsonl", "json"].includes(outputFormat)) {
+                    throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
+                }
+                if (!fs.existsSync(filePath)) {
+                    return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
+                }
+                const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
+                if (!fs.existsSync(outDir))
+                    fs.mkdirSync(outDir, { recursive: true });
+                // The output name is derived from the input's base name only, so two inputs with
+                // the same base name from different directories map to the same output path.
+                const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
+                const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
+                try {
+                    const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
+                    // Non-finite max_keys and unrecognized extras_mode values fall back to the
+                    // defaults (200 / "blob") rather than being rejected.
+                    const options = {
+                        flatten_metadata_json: !!flattenMetadataJson,
+                        max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
+                        extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
+                    };
+                    const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
+                    if (!result.ok) {
+                        return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
+                    }
+                    // Register normalized file to make follow-up conversion easier.
+                    // A registry failure is only logged; the normalization result is still returned.
+                    try {
+                        const datasetId = path.basename(outputPath, path.extname(outputPath));
+                        upsertRegistry(datasetId, outputPath, "completed");
+                    }
+                    catch (e) {
+                        console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
+                    }
+                    let msg = `**Schema normalization complete**\n`;
+                    msg += `- **Input**: ${filePath}\n`;
+                    msg += `- **Output**: ${result.output_path}\n`;
+                    msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
+                    msg += `- **Columns**: ${result.columns}\n`;
+                    msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
+                    msg += `- **Extras mode**: ${result.extras_mode}\n`;
+                    // extras_rows is only reported when the script included it in its result.
+                    if (result.extras_rows !== undefined)
+                        msg += `- **Rows with extras**: ${result.extras_rows}\n`;
+                    msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
+                    return { content: [{ type: "text", text: msg }] };
+                }
+                catch (error) {
+                    return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
+                }
+            }
             case "fuse_datasets": {
                 const rawSources = request.params.arguments?.sources;
                 if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {

package/build/metadata/semantic-scholar-source.js CHANGED Viewed

@@ -1,6 +1,19 @@
-import { rateLimitedFetch } from "./rate-limiter.js";
 import { CircuitBreaker } from "./circuit-breaker.js";
 import { estimateQualityScore } from "./quality.js";
+function sleep(ms) {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+}
+function parseRetryAfterMs(value) {
+    if (!value)
+        return null;
+    const asNum = Number(value);
+    if (Number.isFinite(asNum) && asNum >= 0)
+        return asNum * 1000;
+    const ts = Date.parse(value);
+    if (!Number.isFinite(ts))
+        return null;
+    return Math.max(0, ts - Date.now());
+}
 export class SemanticScholarSource {
     cache;
     breaker = new CircuitBreaker("semantic_scholar", {
@@ -19,12 +32,12 @@ export class SemanticScholarSource {
         const start = Date.now();
         const cleanQuery = String(query || "").trim();
         if (!cleanQuery)
-            return { results: [], cacheHit: false, latencyMs: Date.now() - start };
+            return { results: [], cacheHit: false, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
         const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
         const cacheKey = `webcore:semantic_scholar:discover:${cleanQuery.toLowerCase()}:limit=${perPage}`;
         const cached = await this.cache?.getJson(cacheKey);
         if (cached)
-            return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
+            return { results: cached, cacheHit: true, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
         if (!this.breaker.canAttempt()) {
             throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
         }
@@ -43,16 +56,32 @@ export class SemanticScholarSource {
             "publicationTypes",
             "openAccessPdf",
         ].join(","));
-        const response = await rateLimitedFetch(url.toString(), {
-            headers: {
-                "Accept": "application/json",
-                "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
-            },
-        }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 }).catch((e) => {
+        const headers = {
+            "Accept": "application/json",
+            "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
+        };
+        // Handle S2 429s gracefully: respect Retry-After and avoid failing whole fusion runs.
+        const fetched = await this.fetchWith429Retry(url.toString(), headers).catch((e) => {
+            const msg = String(e?.message || e || "");
+            if (msg.includes("429")) {
+                this.breaker.onFailure();
+                return { response: null, rateLimited: true, attempts: 6 };
+            }
             this.breaker.onFailure();
             throw e;
         });
-        const data = await response.json().catch((e) => {
+        if (!fetched?.response) {
+            // Cache short empty result to avoid immediate retry storms on repeated identical queries.
+            await this.cache?.setJson(cacheKey, [], 120);
+            return {
+                results: [],
+                cacheHit: false,
+                latencyMs: Date.now() - start,
+                rateLimited: !!fetched?.rateLimited,
+                rateLimitAttempts: fetched?.attempts ?? 0,
+            };
+        }
+        const data = await fetched.response.json().catch((e) => {
             this.breaker.onFailure();
             throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
         });
@@ -60,7 +89,31 @@ export class SemanticScholarSource {
         const result = papers.map((p) => this.toDatasetMetadata(p)).filter(Boolean);
         this.breaker.onSuccess();
         await this.cache?.setJson(cacheKey, result, 86400); // 24h
-        return { results: result, cacheHit: false, latencyMs: Date.now() - start };
+        return {
+            results: result,
+            cacheHit: false,
+            latencyMs: Date.now() - start,
+            rateLimited: false,
+            rateLimitAttempts: fetched.attempts,
+        };
+    }
+    async fetchWith429Retry(url, headers) {
+        const maxAttempts = 6;
+        for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+            const response = await fetch(url, { headers });
+            if (response.ok)
+                return { response, rateLimited: false, attempts: attempt };
+            if (response.status !== 429) {
+                const error = new Error(`HTTP error: ${response.status}`);
+                error.status = response.status;
+                throw error;
+            }
+            const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
+            const backoffMs = Math.min(30000, 1500 * Math.pow(2, attempt - 1));
+            const jitterMs = Math.floor(Math.random() * 400);
+            await sleep((retryAfterMs ?? backoffMs) + jitterMs);
+        }
+        return { response: null, rateLimited: true, attempts: maxAttempts };
     }
     toDatasetMetadata(paper) {
         const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();

package/build/python/normalize_schema_engine.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
+Usage:
+  normalize_schema_engine.py <input_path> <output_path> [options_json]
+options_json (optional):
+  {
+    "flatten_metadata_json": true,
+    "max_keys": 200,
+    "extras_mode": "blob" | "drop"
+  }
+Outputs JSON:
+  {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
+or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+import re
+from typing import Any, Dict, List, Tuple
+def _safe_col(name: str) -> str:
+    s = str(name or "").strip()
+    s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
+    s = re.sub(r"_+", "_", s).strip("_")
+    if not s:
+        return "unknown"
+    if len(s) > 64:
+        s = s[:64]
+    return s
+def _coerce_cell(v: Any) -> Any:
+    if v is None:
+        return None
+    if isinstance(v, (str, int, float, bool)):
+        return v
+    # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
+    try:
+        return json.dumps(v, ensure_ascii=False)
+    except Exception:
+        return str(v)
+def _load_records(src: str) -> List[Dict[str, Any]]:
+    ext = os.path.splitext(src)[1].lower()
+    if ext in (".jsonl", ".ndjson"):
+        rows: List[Dict[str, Any]] = []
+        with open(src, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if isinstance(obj, dict):
+                    rows.append(obj)
+        return rows
+    raw = open(src, "r", encoding="utf-8").read().strip()
+    if not raw:
+        return []
+    obj = json.loads(raw)
+    if isinstance(obj, list):
+        return [r for r in obj if isinstance(r, dict)]
+    if isinstance(obj, dict):
+        for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
+            v = obj.get(key)
+            if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
+                return [r for r in v if isinstance(r, dict)]
+        # Sometimes the dict itself is the record
+        return [obj]
+    return []
+def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
+    keys: List[str] = []
+    seen = set()
+    for r in records:
+        mj = r.get("metadata_json")
+        if not isinstance(mj, dict):
+            continue
+        for k in mj.keys():
+            col = f"metadata__{_safe_col(k)}"
+            if col in seen:
+                continue
+            seen.add(col)
+            keys.append(col)
+            if len(keys) >= max_keys:
+                return keys
+    return keys
+def _normalize_records(
+    records: List[Dict[str, Any]],
+    flat_keys: List[str],
+    extras_mode: str,
+) -> Tuple[List[Dict[str, Any]], int]:
+    flat_set = set(flat_keys)
+    extras_count = 0
+    out: List[Dict[str, Any]] = []
+    for r in records:
+        base: Dict[str, Any] = {}
+        # Keep top-level stable fields as-is.
+        for k in (
+            "source_type",
+            "source_url",
+            "content",
+            "quality_score",
+            "collected_at",
+            "content_type",
+        ):
+            if k in r:
+                base[k] = _coerce_cell(r.get(k))
+        # Preserve source_chain as a JSON string (it is nested).
+        if "source_chain" in r:
+            base["source_chain"] = _coerce_cell(r.get("source_chain"))
+        mj = r.get("metadata_json")
+        extras: Dict[str, Any] = {}
+        if isinstance(mj, dict):
+            for k, v in mj.items():
+                col = f"metadata__{_safe_col(k)}"
+                if col in flat_set:
+                    base[col] = _coerce_cell(v)
+                else:
+                    extras[k] = v
+        # Fill missing flattened keys with nulls for uniform schema.
+        for col in flat_keys:
+            if col not in base:
+                base[col] = None
+        if extras:
+            extras_count += 1
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = _coerce_cell(extras)
+            # extras_mode == "drop" => ignore
+        else:
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = None
+        out.append(base)
+    return out, extras_count
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
+        sys.exit(1)
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"
+    if not os.path.exists(input_path):
+        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
+        sys.exit(1)
+    try:
+        options = json.loads(options_raw) if options_raw else {}
+    except Exception:
+        options = {}
+    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
+    max_keys = int(options.get("max_keys", 200) or 200)
+    max_keys = max(0, min(2000, max_keys))
+    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
+    if extras_mode not in ("blob", "drop"):
+        extras_mode = "blob"
+    try:
+        records = _load_records(input_path)
+        if not records:
+            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("")
+            print(json.dumps({"ok": True, "output_path": output_path, "rows": 0, "columns": 0, "flattened_keys": 0, "extras_mode": extras_mode}))
+            return
+        flat_keys: List[str] = []
+        if flatten_metadata_json and max_keys > 0:
+            flat_keys = _gather_flat_keys(records, max_keys)
+        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)
+        ext = os.path.splitext(output_path)[1].lower()
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        if ext in (".jsonl", ".ndjson"):
+            with open(output_path, "w", encoding="utf-8") as f:
+                for r in normalized:
+                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
+        elif ext == ".json":
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(normalized, f, ensure_ascii=False)
+        else:
+            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")
+        columns = len(normalized[0].keys()) if normalized else 0
+        print(json.dumps({
+            "ok": True,
+            "output_path": output_path,
+            "rows": len(normalized),
+            "columns": columns,
+            "flattened_keys": len(flat_keys),
+            "extras_mode": extras_mode,
+            "extras_rows": extras_rows,
+        }))
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": str(e)}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

package/build/web/fusion-engine.js CHANGED Viewed

@@ -48,14 +48,45 @@ function titleTokens(doc) {
     const raw = typeof mj.title === "string" ? mj.title : "";
     return tokenize(raw);
 }
+function semanticHintTokens(doc) {
+    const mj = doc.metadata_json || {};
+    const fields = [];
+    if (typeof mj.title === "string")
+        fields.push(mj.title);
+    if (typeof mj.name === "string")
+        fields.push(mj.name);
+    if (typeof mj.description === "string")
+        fields.push(mj.description);
+    if (typeof mj.abstract === "string")
+        fields.push(mj.abstract);
+    if (Array.isArray(mj.tags))
+        fields.push(mj.tags.join(" "));
+    if (Array.isArray(mj.topics))
+        fields.push(mj.topics.join(" "));
+    fields.push(doc.source_url || "");
+    return tokenize(fields.join(" "));
+}
 function isSuspiciousPair(a, b) {
     // semantic fallback should be selective; do cheap prefilter first
+    // Metadata/topic overlap can indicate same object even with very different body lengths.
+    const aHints = semanticHintTokens(a);
+    const bHints = semanticHintTokens(b);
+    if (aHints.size > 0 && bHints.size > 0) {
+        let hInter = 0;
+        for (const t of aHints)
+            if (bHints.has(t))
+                hInter++;
+        const hUnion = aHints.size + bHints.size - hInter;
+        const hJaccard = hUnion > 0 ? hInter / hUnion : 0;
+        if (hJaccard >= 0.2)
+            return true;
+    }
     const aLen = a.content.length;
     const bLen = b.content.length;
     const maxLen = Math.max(aLen, bLen, 1);
     const lenRatio = Math.abs(aLen - bLen) / maxLen;
     // Loosened again to allow abstract-vs-summary style comparisons.
-    if (lenRatio > 0.8)
+    if (lenRatio > 0.9)
         return false;
     // Fast path: same normalized title-like prefix often indicates same research object.
     const aPrefix = a.content.slice(0, 140).toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
@@ -145,6 +176,7 @@ export class WebFusionEngine {
                 let docs = [];
                 let cacheHit = false;
                 let latencyMs = 0;
+                let rateLimited = false;
                 if (spec.type === "s3") {
                     const out = await this.collectFromS3(spec);
                     docs = out.docs;
@@ -163,6 +195,7 @@ export class WebFusionEngine {
                     const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
                     cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
                     latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
+                    rateLimited = perSrcTel ? !!perSrcTel.rate_limited : false;
                 }
                 const filtered = spec.min_stars !== undefined
                     ? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
@@ -173,6 +206,7 @@ export class WebFusionEngine {
                     cache_hit: cacheHit,
                     latency_ms: latencyMs || (Date.now() - start),
                     result_count: filtered.length,
+                    ...(spec.type === "s3" ? {} : { rate_limited: rateLimited }),
                 });
             }
             catch (e) {

package/build/web/web-core.js CHANGED Viewed

@@ -87,6 +87,7 @@ export class WebCoreEngine {
                         cache_hit: out.cacheHit,
                         latency_ms: out.latencyMs || (Date.now() - t0),
                         result_count: docs.length,
+                        rate_limited: !!out.rateLimited,
                     });
                 }
                 catch (e) {
@@ -96,6 +97,7 @@ export class WebCoreEngine {
                         latency_ms: Date.now() - t0,
                         result_count: 0,
                         error: e?.message || String(e),
+                        rate_limited: String(e?.message || "").includes("429"),
                     });
                 }
             }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.25",
+  "version": "1.2.27",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/src/python/normalize_schema_engine.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
+Usage:
+  normalize_schema_engine.py <input_path> <output_path> [options_json]
+options_json (optional):
+  {
+    "flatten_metadata_json": true,
+    "max_keys": 200,
+    "extras_mode": "blob" | "drop"
+  }
+Outputs JSON:
+  {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
+or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+import re
+from typing import Any, Dict, List, Tuple
+def _safe_col(name: str) -> str:
+    s = str(name or "").strip()
+    s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
+    s = re.sub(r"_+", "_", s).strip("_")
+    if not s:
+        return "unknown"
+    if len(s) > 64:
+        s = s[:64]
+    return s
+def _coerce_cell(v: Any) -> Any:
+    if v is None:
+        return None
+    if isinstance(v, (str, int, float, bool)):
+        return v
+    # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
+    try:
+        return json.dumps(v, ensure_ascii=False)
+    except Exception:
+        return str(v)
+def _load_records(src: str) -> List[Dict[str, Any]]:
+    ext = os.path.splitext(src)[1].lower()
+    if ext in (".jsonl", ".ndjson"):
+        rows: List[Dict[str, Any]] = []
+        with open(src, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if isinstance(obj, dict):
+                    rows.append(obj)
+        return rows
+    raw = open(src, "r", encoding="utf-8").read().strip()
+    if not raw:
+        return []
+    obj = json.loads(raw)
+    if isinstance(obj, list):
+        return [r for r in obj if isinstance(r, dict)]
+    if isinstance(obj, dict):
+        for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
+            v = obj.get(key)
+            if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
+                return [r for r in v if isinstance(r, dict)]
+        # Sometimes the dict itself is the record
+        return [obj]
+    return []
+def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
+    keys: List[str] = []
+    seen = set()
+    for r in records:
+        mj = r.get("metadata_json")
+        if not isinstance(mj, dict):
+            continue
+        for k in mj.keys():
+            col = f"metadata__{_safe_col(k)}"
+            if col in seen:
+                continue
+            seen.add(col)
+            keys.append(col)
+            if len(keys) >= max_keys:
+                return keys
+    return keys
+def _normalize_records(
+    records: List[Dict[str, Any]],
+    flat_keys: List[str],
+    extras_mode: str,
+) -> Tuple[List[Dict[str, Any]], int]:
+    flat_set = set(flat_keys)
+    extras_count = 0
+    out: List[Dict[str, Any]] = []
+    for r in records:
+        base: Dict[str, Any] = {}
+        # Keep top-level stable fields as-is.
+        for k in (
+            "source_type",
+            "source_url",
+            "content",
+            "quality_score",
+            "collected_at",
+            "content_type",
+        ):
+            if k in r:
+                base[k] = _coerce_cell(r.get(k))
+        # Preserve source_chain as a JSON string (it is nested).
+        if "source_chain" in r:
+            base["source_chain"] = _coerce_cell(r.get("source_chain"))
+        mj = r.get("metadata_json")
+        extras: Dict[str, Any] = {}
+        if isinstance(mj, dict):
+            for k, v in mj.items():
+                col = f"metadata__{_safe_col(k)}"
+                if col in flat_set:
+                    base[col] = _coerce_cell(v)
+                else:
+                    extras[k] = v
+        # Fill missing flattened keys with nulls for uniform schema.
+        for col in flat_keys:
+            if col not in base:
+                base[col] = None
+        if extras:
+            extras_count += 1
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = _coerce_cell(extras)
+            # extras_mode == "drop" => ignore
+        else:
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = None
+        out.append(base)
+    return out, extras_count
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
+        sys.exit(1)
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"
+    if not os.path.exists(input_path):
+        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
+        sys.exit(1)
+    try:
+        options = json.loads(options_raw) if options_raw else {}
+    except Exception:
+        options = {}
+    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
+    max_keys = int(options.get("max_keys", 200) or 200)
+    max_keys = max(0, min(2000, max_keys))
+    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
+    if extras_mode not in ("blob", "drop"):
+        extras_mode = "blob"
+    try:
+        records = _load_records(input_path)
+        if not records:
+            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("")
+            print(json.dumps({"ok": True, "output_path": output_path, "rows": 0, "columns": 0, "flattened_keys": 0, "extras_mode": extras_mode}))
+            return
+        flat_keys: List[str] = []
+        if flatten_metadata_json and max_keys > 0:
+            flat_keys = _gather_flat_keys(records, max_keys)
+        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)
+        ext = os.path.splitext(output_path)[1].lower()
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        if ext in (".jsonl", ".ndjson"):
+            with open(output_path, "w", encoding="utf-8") as f:
+                for r in normalized:
+                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
+        elif ext == ".json":
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(normalized, f, ensure_ascii=False)
+        else:
+            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")
+        columns = len(normalized[0].keys()) if normalized else 0
+        print(json.dumps({
+            "ok": True,
+            "output_path": output_path,
+            "rows": len(normalized),
+            "columns": columns,
+            "flattened_keys": len(flat_keys),
+            "extras_mode": extras_mode,
+            "extras_rows": extras_rows,
+        }))
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": str(e)}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()