vesper-wizard 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +82 -3
- package/build/install/install-service.js +4 -3
- package/build/metadata/scraper.js +13 -4
- package/build/python/convert_engine.py +92 -0
- package/build/search/engine.js +7 -1
- package/build/search/query-intent.js +45 -0
- package/package.json +1 -1
- package/src/python/convert_engine.py +92 -0
package/build/index.js
CHANGED
|
@@ -1352,6 +1352,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1352
1352
|
properties: {},
|
|
1353
1353
|
},
|
|
1354
1354
|
},
|
|
1355
|
+
{
|
|
1356
|
+
name: "vesper_convert_format",
|
|
1357
|
+
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
1358
|
+
inputSchema: {
|
|
1359
|
+
type: "object",
|
|
1360
|
+
properties: {
|
|
1361
|
+
file_path: {
|
|
1362
|
+
type: "string",
|
|
1363
|
+
description: "Absolute path to the input dataset file.",
|
|
1364
|
+
},
|
|
1365
|
+
target_format: {
|
|
1366
|
+
type: "string",
|
|
1367
|
+
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1368
|
+
description: "The desired output format.",
|
|
1369
|
+
},
|
|
1370
|
+
},
|
|
1371
|
+
required: ["file_path", "target_format"],
|
|
1372
|
+
},
|
|
1373
|
+
},
|
|
1355
1374
|
{
|
|
1356
1375
|
name: "fuse_datasets",
|
|
1357
1376
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -1784,7 +1803,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1784
1803
|
max_items: maxItems,
|
|
1785
1804
|
workers,
|
|
1786
1805
|
image_column: imageColumn,
|
|
1787
|
-
output_root:
|
|
1806
|
+
output_root: requestedOutputDir || process.cwd(),
|
|
1788
1807
|
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1789
1808
|
};
|
|
1790
1809
|
try {
|
|
@@ -2272,6 +2291,63 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2272
2291
|
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2273
2292
|
};
|
|
2274
2293
|
}
|
|
2294
|
+
case "vesper_convert_format": {
|
|
2295
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2296
|
+
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
2297
|
+
if (!filePath) {
|
|
2298
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
2299
|
+
}
|
|
2300
|
+
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
2301
|
+
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
2302
|
+
}
|
|
2303
|
+
if (!fs.existsSync(filePath)) {
|
|
2304
|
+
return {
|
|
2305
|
+
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
2306
|
+
isError: true,
|
|
2307
|
+
};
|
|
2308
|
+
}
|
|
2309
|
+
const inputExt = path.extname(filePath).toLowerCase();
|
|
2310
|
+
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
2311
|
+
const outputExt = extMap[targetFormat];
|
|
2312
|
+
if (inputExt === outputExt) {
|
|
2313
|
+
return {
|
|
2314
|
+
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
2315
|
+
};
|
|
2316
|
+
}
|
|
2317
|
+
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
2318
|
+
try {
|
|
2319
|
+
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2320
|
+
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
2321
|
+
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
2322
|
+
if (!result.ok) {
|
|
2323
|
+
return {
|
|
2324
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
2325
|
+
isError: true,
|
|
2326
|
+
};
|
|
2327
|
+
}
|
|
2328
|
+
// Register converted file in the registry
|
|
2329
|
+
const datasetId = path.basename(outputPath, outputExt);
|
|
2330
|
+
try {
|
|
2331
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
2332
|
+
}
|
|
2333
|
+
catch (e) {
|
|
2334
|
+
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2335
|
+
}
|
|
2336
|
+
let msg = `**Conversion complete**\n`;
|
|
2337
|
+
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2338
|
+
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
2339
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2340
|
+
if (result.size_mb !== undefined)
|
|
2341
|
+
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
2342
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2343
|
+
}
|
|
2344
|
+
catch (error) {
|
|
2345
|
+
return {
|
|
2346
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
2347
|
+
isError: true,
|
|
2348
|
+
};
|
|
2349
|
+
}
|
|
2350
|
+
}
|
|
2275
2351
|
case "fuse_datasets": {
|
|
2276
2352
|
const rawSources = request.params.arguments?.sources;
|
|
2277
2353
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
@@ -2312,10 +2388,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2312
2388
|
try {
|
|
2313
2389
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
2314
2390
|
const ext = extMap[outputFormat] || ".feather";
|
|
2315
|
-
const outDir =
|
|
2391
|
+
const outDir = process.cwd();
|
|
2316
2392
|
if (!fs.existsSync(outDir))
|
|
2317
2393
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2318
2394
|
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
2395
|
+
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
2319
2396
|
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
2320
2397
|
strategy,
|
|
2321
2398
|
join_on: joinOn,
|
|
@@ -2874,10 +2951,12 @@ async function runExportCli(args) {
|
|
|
2874
2951
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2875
2952
|
const ext = extMap[requestedFormat] || ".parquet";
|
|
2876
2953
|
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2877
|
-
const outDir = targetDir ||
|
|
2954
|
+
const outDir = targetDir || process.cwd();
|
|
2878
2955
|
if (!fs.existsSync(outDir))
|
|
2879
2956
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2880
2957
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2958
|
+
console.error(`[Export] Resolved output directory: ${outDir}`);
|
|
2959
|
+
console.error(`[Export] Output file: ${outputFile}`);
|
|
2881
2960
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2882
2961
|
console.log(`Export complete: ${result.output_path}`);
|
|
2883
2962
|
console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
|
|
@@ -21,11 +21,12 @@ export class InstallService {
|
|
|
21
21
|
// Create target directory
|
|
22
22
|
const installLabel = dataset?.name || datasetId;
|
|
23
23
|
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
-
// If caller specified a target dir, use it directly
|
|
25
|
-
// Otherwise
|
|
24
|
+
// If caller specified a target dir, use it directly
|
|
25
|
+
// Otherwise use the current working directory
|
|
26
26
|
const installDir = targetDir
|
|
27
27
|
? path.resolve(targetDir)
|
|
28
|
-
: path.
|
|
28
|
+
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
+
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
29
30
|
if (!fs.existsSync(installDir)) {
|
|
30
31
|
fs.mkdirSync(installDir, { recursive: true });
|
|
31
32
|
}
|
|
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
-
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
|
|
7
7
|
export class HuggingFaceScraper {
|
|
8
8
|
/**
|
|
9
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
@@ -85,6 +85,9 @@ export class HuggingFaceScraper {
|
|
|
85
85
|
has_readme: false,
|
|
86
86
|
is_incomplete: true // Flag for Phase 2
|
|
87
87
|
};
|
|
88
|
+
// Hard language exclusion
|
|
89
|
+
if (intent && shouldExcludeByLanguage(metadata, intent))
|
|
90
|
+
continue;
|
|
88
91
|
results.push(metadata);
|
|
89
92
|
}
|
|
90
93
|
}
|
|
@@ -302,10 +305,16 @@ export class HuggingFaceScraper {
|
|
|
302
305
|
description_length: description.length,
|
|
303
306
|
has_readme: !!(cardData.readme || cardData.readme_content)
|
|
304
307
|
};
|
|
305
|
-
|
|
306
|
-
|
|
308
|
+
// Hard language exclusion — drop bilingual/multilingual for single-language queries
|
|
309
|
+
if (intent && shouldExcludeByLanguage(metadata, intent)) {
|
|
310
|
+
// skip — do not push
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
if (intent) {
|
|
314
|
+
metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
|
|
315
|
+
}
|
|
316
|
+
results.push(metadata);
|
|
307
317
|
}
|
|
308
|
-
results.push(metadata);
|
|
309
318
|
}
|
|
310
319
|
catch (e) {
|
|
311
320
|
// Track all errors for user feedback
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N, "size_mb": F} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read the dataset at ``src`` into a Polars DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    CSV, TSV/TAB, Parquet, Feather/Arrow IPC, JSONL/NDJSON and JSON.
    Any other extension is attempted as CSV.

    For ``.json`` files the content is inspected:
      * a top-level array is handed to ``pl.read_json``;
      * a JSON object is searched for a list of records, first under the
        well-known keys (``data``, ``rows``, ...) and then under any key
        whose value is a non-empty list of dicts;
      * content that is not one valid JSON document but spans several lines
        and starts with ``{`` is treated as NDJSON.

    Args:
        src: Path of the input file.

    Returns:
        The loaded DataFrame.

    Raises:
        json.JSONDecodeError: If a ``.json`` file is neither valid JSON nor
            NDJSON-shaped.
        Exception: Whatever the underlying Polars reader raises.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Use a context manager so the handle is closed deterministically.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            # Only fall back to NDJSON when the document failed to parse as a
            # whole. Checking the first line alone would misclassify a
            # pretty-printed object (whose first line is just "{") as NDJSON.
            if "\n" in raw and raw.startswith("{"):
                return pl.read_ndjson(src)
            raise
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if isinstance(obj.get(key), list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and v and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.

    The parent directory of ``dst`` is created if it does not exist.
    Supported extensions: ``.parquet``/``.pq``, ``.csv``, ``.json`` and
    ``.jsonl``/``.ndjson``.

    Args:
        df: DataFrame to serialise.
        dst: Destination file path.

    Raises:
        ValueError: If the extension of ``dst`` is not a supported format.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        try:
            df.write_json(dst, row_oriented=True)
        except TypeError:
            # NOTE(review): Polars 1.x dropped the `row_oriented` keyword and
            # always writes row-oriented JSON, so passing it raises TypeError
            # there. Retry without it - confirm against the installed version.
            df.write_json(dst)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Prints exactly one JSON object to stdout. On success it carries
    ``ok``, ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any
    failure it carries ``ok: false`` and an ``error`` message, and the
    process exits with status 1.
    """
    def fail(message):
        # Report the failure as JSON on stdout, then stop with a non-zero status.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        fail("Usage: convert_engine.py <input> <output>")

    input_path, output_path = cli_args[0], cli_args[1]

    if not os.path.exists(input_path):
        fail(f"File not found: {input_path}")

    try:
        frame = _load(input_path)
        _write(frame, output_path)
        bytes_written = os.path.getsize(output_path)
        report = {
            "ok": True,
            "output_path": output_path,
            "rows": frame.height,
            "columns": frame.width,
            "size_mb": round(bytes_written / (1024 * 1024), 2),
        }
        print(json.dumps(report))
    except Exception as e:
        fail(str(e))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|
package/build/search/engine.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
-
import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
|
|
3
3
|
import fs from "fs";
|
|
4
4
|
function log(msg) {
|
|
5
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -68,6 +68,12 @@ export class SearchEngine {
|
|
|
68
68
|
// Filter: Safe only
|
|
69
69
|
if (options.safeOnly && metadata.license.category === "restricted")
|
|
70
70
|
continue;
|
|
71
|
+
// Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
|
|
72
|
+
// when user explicitly requested a single language
|
|
73
|
+
if (shouldExcludeByLanguage(metadata, intent)) {
|
|
74
|
+
log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
71
77
|
const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
|
|
72
78
|
// Filter: Explicit Negative Keywords
|
|
73
79
|
if (negativeKeywords.some(neg => text.includes(neg))) {
|
|
@@ -172,6 +172,51 @@ const LANGUAGE_TO_CODE = {
|
|
|
172
172
|
russian: "ru",
|
|
173
173
|
hindi: "hi",
|
|
174
174
|
};
|
|
175
|
+
const BILINGUAL_INDICATORS = [
    "translation", "parallel", "bilingual", "multilingual",
    "cross-lingual", "crosslingual", "machine-translation",
    "aligned", "comparable corpus",
];
/**
 * Hard-exclude a dataset when the user requests a single specific language
 * and the dataset is bilingual, multilingual, or tagged with other languages.
 *
 * Checks, in order:
 *  1. No language in the intent, or "multilingual" requested: never exclude.
 *  2. Any known language tag on the dataset that is not an alias of the
 *     requested language: exclude.
 *  3. Name/description/tags contain a bilingual/translation indicator: exclude.
 *  4. Name/description/tags mention another language by its primary name
 *     (first entry in LANGUAGE_ALIASES): exclude.
 *
 * Missing `languages` / `tags` arrays are treated as empty, so partially
 * populated records do not throw.
 *
 * @param {{name?: string, description?: string, tags?: string[], languages?: string[]}} dataset
 *   Dataset metadata record.
 * @param {{language?: string}|null|undefined} intent Parsed query intent.
 * @returns {boolean} true if the dataset should be EXCLUDED from results.
 */
export function shouldExcludeByLanguage(dataset, intent) {
    if (!intent?.language || intent.language === "multilingual")
        return false;
    const aliases = getLanguageAliases(intent.language);
    const rawLanguages = Array.isArray(dataset?.languages) ? dataset.languages : [];
    const datasetLanguages = rawLanguages
        .map((lang) => normalizeToken(lang))
        .filter((lang) => lang && lang !== "unknown");
    // Any tag outside the requested language's aliases means the dataset is
    // bilingual/multilingual or in the wrong language entirely. If no such
    // tag exists, every tag is an alias, so no separate "requested language
    // missing" check is needed.
    if (datasetLanguages.some((lang) => !aliases.includes(lang)))
        return true;
    // Check name, description, and tags for bilingual indicators or other language names
    const tags = Array.isArray(dataset?.tags) ? dataset.tags : [];
    const text = [
        dataset?.name ?? "",
        dataset?.description ?? "",
        tags.join(" "),
    ].join(" ").toLowerCase();
    if (BILINGUAL_INDICATORS.some((indicator) => text.includes(indicator)))
        return true;
    // Only the primary language name is checked (not 2-letter codes, which
    // could appear in regular text).
    return Object.keys(LANGUAGE_ALIASES).some((lang) => {
        if (lang === intent.language || lang === "multilingual")
            return false;
        const primaryName = LANGUAGE_ALIASES[lang]?.[0];
        return Boolean(primaryName) && text.includes(primaryName);
    });
}
|
|
175
220
|
function buildHeuristicIntent(query, requirements) {
|
|
176
221
|
const originalQuery = `${query || ""} ${requirements || ""}`.trim();
|
|
177
222
|
const normalizedQuery = originalQuery.toLowerCase();
|
package/package.json
CHANGED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N, "size_mb": F} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read the dataset at ``src`` into a Polars DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    CSV, TSV/TAB, Parquet, Feather/Arrow IPC, JSONL/NDJSON and JSON.
    Any other extension is attempted as CSV.

    For ``.json`` files the content is inspected:
      * a top-level array is handed to ``pl.read_json``;
      * a JSON object is searched for a list of records, first under the
        well-known keys (``data``, ``rows``, ...) and then under any key
        whose value is a non-empty list of dicts;
      * content that is not one valid JSON document but spans several lines
        and starts with ``{`` is treated as NDJSON.

    Args:
        src: Path of the input file.

    Returns:
        The loaded DataFrame.

    Raises:
        json.JSONDecodeError: If a ``.json`` file is neither valid JSON nor
            NDJSON-shaped.
        Exception: Whatever the underlying Polars reader raises.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Use a context manager so the handle is closed deterministically.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            # Only fall back to NDJSON when the document failed to parse as a
            # whole. Checking the first line alone would misclassify a
            # pretty-printed object (whose first line is just "{") as NDJSON.
            if "\n" in raw and raw.startswith("{"):
                return pl.read_ndjson(src)
            raise
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if isinstance(obj.get(key), list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and v and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.

    The parent directory of ``dst`` is created if it does not exist.
    Supported extensions: ``.parquet``/``.pq``, ``.csv``, ``.json`` and
    ``.jsonl``/``.ndjson``.

    Args:
        df: DataFrame to serialise.
        dst: Destination file path.

    Raises:
        ValueError: If the extension of ``dst`` is not a supported format.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        try:
            df.write_json(dst, row_oriented=True)
        except TypeError:
            # NOTE(review): Polars 1.x dropped the `row_oriented` keyword and
            # always writes row-oriented JSON, so passing it raises TypeError
            # there. Retry without it - confirm against the installed version.
            df.write_json(dst)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Prints exactly one JSON object to stdout. On success it carries
    ``ok``, ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any
    failure it carries ``ok: false`` and an ``error`` message, and the
    process exits with status 1.
    """
    def fail(message):
        # Report the failure as JSON on stdout, then stop with a non-zero status.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        fail("Usage: convert_engine.py <input> <output>")

    input_path, output_path = cli_args[0], cli_args[1]

    if not os.path.exists(input_path):
        fail(f"File not found: {input_path}")

    try:
        frame = _load(input_path)
        _write(frame, output_path)
        bytes_written = os.path.getsize(output_path)
        report = {
            "ok": True,
            "output_path": output_path,
            "rows": frame.height,
            "columns": frame.width,
            "size_mb": round(bytes_written / (1024 * 1024), 2),
        }
        print(json.dumps(report))
    except Exception as e:
        fail(str(e))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|