@vespermcp/mcp-server 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +900 -50
- package/build/ingestion/hf-downloader.js +12 -3
- package/build/ingestion/ingestor.js +33 -9
- package/build/ingestion/kaggle-downloader.js +2 -2
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +72 -12
- package/build/tools/formatter.js +14 -14
- package/package.json +9 -3
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const { spawnSync } = require("child_process");
|
|
4
|
+
const fs = require("fs");
|
|
5
|
+
const path = require("path");
|
|
6
|
+
const os = require("os");
|
|
7
|
+
const Database = require("better-sqlite3");
|
|
8
|
+
|
|
9
|
+
/**
 * Run a command synchronously with inherited stdio and throw on any failure.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Argument list.
 * @param {object} [options] - Extra options forwarded to spawnSync (e.g. env).
 * @throws {Error} If the process could not be spawned or exited non-zero.
 */
function runCommand(command, args, options = {}) {
  const result = spawnSync(command, args, {
    stdio: "inherit",
    // npm and friends are .cmd shims on Windows and need a shell to resolve.
    shell: process.platform === "win32",
    ...options,
  });

  // A spawn failure (e.g. ENOENT) leaves status null and sets result.error;
  // surface the underlying cause instead of a misleading "exit null".
  if (result.error) {
    throw new Error(`Command failed to start: ${command} ${args.join(" ")}`, {
      cause: result.error,
    });
  }
  if (result.status !== 0) {
    throw new Error(`Command failed: ${command} ${args.join(" ")} (exit ${result.status})`);
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Count the rows of the `datasets` table in a SQLite index file.
 *
 * @param {string} dbPath - Path to the SQLite database file.
 * @returns {number|string} Row count, or "N/A" when the file does not exist.
 */
function countDatasets(dbPath) {
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Release the handle even if the table is missing or the file is corrupt;
    // the original leaked the connection on a query error.
    db.close();
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Read the vector count from a vectors.json artifact.
 *
 * @param {string} jsonPath - Path to the vectors JSON file.
 * @returns {number|string} `count` if present, else `ids.length`, else "N/A".
 */
function countVectors(jsonPath) {
  if (!fs.existsSync(jsonPath)) return "N/A";

  let data;
  try {
    data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
  } catch {
    // Corrupt or partially-written file: report as unavailable instead of
    // crashing the whole refresh summary (siblings return "N/A" too).
    return "N/A";
  }

  if (data && typeof data.count === "number") return data.count;
  if (data && Array.isArray(data.ids)) return data.ids.length;
  return "N/A";
}
|
|
36
|
+
|
|
37
|
+
/**
 * Copy the freshly built index artifacts from the workspace into the
 * per-user runtime directory (~/.vesper/data), creating it if needed.
 *
 * @param {string} workspaceRoot - Workspace root containing a ./data folder.
 * @returns {string} Absolute path of the runtime data directory.
 * @throws {Error} If any expected artifact is missing from the workspace.
 */
function syncRuntime(workspaceRoot) {
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  fs.mkdirSync(runtimeDir, { recursive: true });

  ["metadata.db", "vectors.json", "vectors.bin"].forEach((artifact) => {
    const source = path.join(workspaceRoot, "data", artifact);
    if (!fs.existsSync(source)) {
      throw new Error(`Missing source file: ${source}`);
    }
    fs.copyFileSync(source, path.join(runtimeDir, artifact));
  });

  return runtimeDir;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Orchestrate a full index refresh:
 *   1. run the "massive-scrape" npm script,
 *   2. run the "index" npm script with an enlarged V8 heap,
 *   3. copy the built artifacts into the ~/.vesper/data runtime directory,
 * then log dataset/vector counts for both copies as a sanity check.
 */
function main() {
  const workspaceRoot = process.cwd();
  const runtimeDbPath = path.join(os.homedir(), ".vesper", "data", "metadata.db");
  const runtimeVecPath = path.join(os.homedir(), ".vesper", "data", "vectors.json");
  const workspaceDbPath = path.join(workspaceRoot, "data", "metadata.db");
  const workspaceVecPath = path.join(workspaceRoot, "data", "vectors.json");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // Indexing is memory-hungry; raise the old-space limit to 8 GB for this step.
  const env = { ...process.env, NODE_OPTIONS: "--max-old-space-size=8192" };
  runCommand("npm", ["run", "index"], { env });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Report workspace vs runtime counts so a failed copy is visible in the log.
  const wsDb = countDatasets(workspaceDbPath);
  const wsVec = countVectors(workspaceVecPath);
  const rtDb = countDatasets(runtimeDbPath);
  const rtVec = countVectors(runtimeVecPath);

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
  console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

// Top-level driver: any step failure aborts with a non-zero exit code.
try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import json
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
import secrets
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, Optional
|
|
9
|
+
|
|
10
|
+
# Keyring service name under which all Vesper secrets are stored.
SERVICE_NAME = "vesper"

# Maps each logical key name to the environment variables that may override it
# (checked in order; the first non-empty value wins).
KEY_ALIASES = {
    "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
    "kaggle_username": ["KAGGLE_USERNAME"],
    "kaggle_key": ["KAGGLE_KEY"],
}
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import keyring # type: ignore
|
|
20
|
+
HAS_KEYRING = True
|
|
21
|
+
except Exception:
|
|
22
|
+
HAS_KEYRING = False
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from cryptography.fernet import Fernet, InvalidToken # type: ignore
|
|
26
|
+
HAS_FERNET = True
|
|
27
|
+
except Exception:
|
|
28
|
+
HAS_FERNET = False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _config_path() -> Path:
|
|
32
|
+
return Path.home() / ".vesper" / "config.toml"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _secret_path() -> Path:
|
|
36
|
+
return Path.home() / ".vesper" / ".config_key"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _ensure_parent(path: Path) -> None:
|
|
40
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _read_fallback_toml() -> Dict[str, str]:
    """Parse the fallback config.toml into a flat dict.

    Keys from the [keys] section are returned as-is; the meta ``method``
    value is exposed under the ``__method__`` key.  Returns an empty dict
    when the file does not exist.
    """
    config_file = _config_path()
    if not config_file.exists():
        return {}

    parsed: Dict[str, str] = {}
    inside_keys = False
    method_name = ""

    for raw_line in config_file.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if stripped.startswith("[") and stripped.endswith("]"):
            inside_keys = stripped == "[keys]"
            continue
        # NOTE: a "method = ..." assignment is treated as metadata no matter
        # which section it appears in (preserves the original behaviour).
        if stripped.startswith("method") and "=" in stripped:
            method_name = stripped.split("=", 1)[1].strip().strip('"').strip("'")
            continue
        if not inside_keys or "=" not in stripped:
            continue

        name, _, value = stripped.partition("=")
        parsed[name.strip()] = value.strip().strip('"').strip("'")

    if method_name:
        parsed["__method__"] = method_name

    return parsed
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _get_or_create_local_secret() -> str:
    """Return the machine-local secret, generating and persisting it on first use.

    The secret is the urlsafe-base64 encoding of 32 random bytes (the exact
    shape of a Fernet key), stored at ~/.vesper/.config_key with permissions
    tightened to 0600 where the platform allows it.
    """
    secret_file = _secret_path()
    _ensure_parent(secret_file)

    if not secret_file.exists():
        new_secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
        secret_file.write_text(new_secret, encoding="utf-8")
        try:
            # Best effort on POSIX; harmless failure elsewhere (e.g. Windows).
            os.chmod(secret_file, 0o600)
        except Exception:
            pass
        return new_secret

    return secret_file.read_text(encoding="utf-8").strip()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _xor_encrypt(plain: str, secret: str) -> str:
|
|
93
|
+
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
94
|
+
data = plain.encode("utf-8")
|
|
95
|
+
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
96
|
+
return base64.urlsafe_b64encode(out).decode("utf-8")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _xor_decrypt(cipher_text: str, secret: str) -> str:
|
|
100
|
+
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
101
|
+
data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
|
|
102
|
+
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
103
|
+
return out.decode("utf-8")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
    """Encrypt *value*, preferring Fernet and falling back to XOR obfuscation.

    Returns {"method": ..., "value": ...} so the caller can record which
    scheme was used alongside the ciphertext.
    """
    if not HAS_FERNET:
        # Weaker than Fernet, but still keeps the value out of plaintext.
        return {"method": "xor", "value": _xor_encrypt(value, secret)}

    token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8")).decode("utf-8")
    return {"method": "fernet", "value": token}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
    """Decrypt *value* that was stored with *method* ("fernet" or "xor").

    Returns None for an unknown method or any decryption failure.
    """
    try:
        if method == "fernet" and HAS_FERNET:
            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
        if method == "xor":
            return _xor_decrypt(value, secret)
        return None
    except Exception:
        # Bug fix: the previous code had an `except InvalidToken:` clause,
        # which raised NameError whenever the cryptography package was missing
        # (InvalidToken is then undefined) -- exactly the environment the xor
        # fallback serves.  InvalidToken is an Exception subclass, so this
        # single handler covers it without the name dependency.
        return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _write_fallback_toml(values: Dict[str, str]) -> None:
    """Serialise *values* to ~/.vesper/config.toml.

    Names starting with "__" are metadata and are not written under [keys];
    the encryption method is recorded in the [meta] section instead.
    """
    config_file = _config_path()
    _ensure_parent(config_file)

    method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
    lines = [
        "# Vesper optional API keys fallback storage",
        "# Encrypted fallback (keyring is preferred)",
        "[meta]",
        f'method = "{method}"',
        "[keys]",
    ]

    for name in sorted(values):
        if name.startswith("__"):
            continue
        # Escape embedded double quotes so the value stays a valid TOML string.
        escaped = str(values[name]).replace('"', '\\"')
        lines.append(f'{name} = "{escaped}"')

    config_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _get_from_env(name: str) -> Optional[str]:
    """Return the first non-empty environment variable aliased to *name*."""
    aliases = KEY_ALIASES.get(name, [])
    return next((os.getenv(alias) for alias in aliases if os.getenv(alias)), None)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_key(name: str) -> Optional[str]:
    """Resolve *name* from, in priority order: environment variables, the OS
    keyring, then the encrypted fallback file.

    Returns None when no source yields a value.
    """
    from_env = _get_from_env(name)
    if from_env:
        return from_env

    if HAS_KEYRING:
        try:
            stored = keyring.get_password(SERVICE_NAME, name)
        except Exception:
            # Broken/unavailable keyring backend: fall through to the file.
            stored = None
        if stored:
            return stored

    fallback = _read_fallback_toml()
    encrypted = fallback.get(name)
    if not encrypted:
        return None

    secret = _get_or_create_local_secret()
    method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
    return _decrypt_value(encrypted, method, secret)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def set_key(name: str, value: str) -> Dict[str, str]:
    """Store *value* under *name*, preferring the OS keyring.

    Falls back to the encrypted ~/.vesper/config.toml when keyring is
    unavailable.  The "ok" field is a string ("true"/"false") because the
    result is bridged to the Node side as flat JSON strings.
    """
    if not value:
        return {"ok": "false", "method": "none", "error": "Empty value"}

    if HAS_KEYRING:
        try:
            keyring.set_password(SERVICE_NAME, name, value)
        except Exception:
            # Broken keyring backend: use the encrypted file instead.
            pass
        else:
            return {"ok": "true", "method": "keyring"}

    stored = _read_fallback_toml()
    encrypted = _encrypt_value(value, _get_or_create_local_secret())
    stored["__method__"] = encrypted["method"]
    stored[name] = encrypted["value"]
    _write_fallback_toml(stored)
    return {"ok": "true", "method": f'toml:{encrypted["method"]}'}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def has_key(name: str) -> bool:
    """Return True if *name* resolves to a non-empty value from any source."""
    return bool(get_key(name))
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def get_all() -> Dict[str, Optional[str]]:
    """Resolve every known key via get_key (env > keyring > fallback file)."""
    return {name: get_key(name) for name in ("hf_token", "kaggle_username", "kaggle_key")}
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _print_json(data):
|
|
214
|
+
print(json.dumps(data))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def main() -> None:
    """CLI bridge: config.py <get|set|has|all> [name] [value].

    Always prints exactly one JSON object on stdout so the Node caller can
    parse the result; exits with status 1 on usage errors or unknown commands.
    """
    if len(sys.argv) < 2:
        _print_json({
            "ok": False,
            "error": "Usage: config.py <get|set|has|all> [name] [value]",
        })
        sys.exit(1)

    cmd = sys.argv[1].lower()

    # "all" takes no key name, so handle it before the name check below.
    if cmd == "all":
        _print_json({"ok": True, "data": get_all()})
        return

    if len(sys.argv) < 3:
        _print_json({"ok": False, "error": "Missing key name"})
        sys.exit(1)

    name = sys.argv[2]

    if cmd == "get":
        _print_json({"ok": True, "name": name, "value": get_key(name)})
        return

    if cmd == "has":
        _print_json({"ok": True, "name": name, "value": has_key(name)})
        return

    if cmd == "set":
        if len(sys.argv) < 4:
            _print_json({"ok": False, "error": "Missing value for set"})
            sys.exit(1)
        value = sys.argv[3]
        result = set_key(name, value)
        # set_key reports ok as the string "true"/"false"; convert to a real bool.
        _print_json({"ok": result.get("ok") == "true", "name": name, "method": result.get("method"), "error": result.get("error")})
        return

    _print_json({"ok": False, "error": f"Unknown command: {cmd}"})
    sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -2,25 +2,102 @@ import sys
|
|
|
2
2
|
import json
|
|
3
3
|
import polars as pl
|
|
4
4
|
import os
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
# Optional imports for extra formats
|
|
8
|
+
try:
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.feather as pf
|
|
11
|
+
HAS_PYARROW = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_PYARROW = False
|
|
5
14
|
|
|
6
|
-
# Optional TensorFlow import for TFRecord support
|
|
7
15
|
try:
|
|
8
16
|
import tensorflow as tf
|
|
9
17
|
HAS_TENSORFLOW = True
|
|
10
18
|
except ImportError:
|
|
11
19
|
HAS_TENSORFLOW = False
|
|
12
20
|
|
|
13
|
-
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Supported extensions: .csv, .parquet/.pq, .feather/.ftr/.arrow/.ipc,
    and .jsonl/.ndjson.  Honors two options: "columns" (projection to the
    listed column names, silently skipping unknown ones) and "sample_rows"
    with "random_seed" (deterministic down-sampling).

    Raises ValueError for unsupported extensions.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext in (".jsonl", ".ndjson"):
        # .ndjson is the same newline-delimited JSON format as .jsonl;
        # accepting it generalizes the loader at no cost.
        df = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    # Column selection (before sampling for speed)
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional deterministic sampling
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-scalar columns JSON-stringified so CSV export works."""
    simple_names = {"string", "utf8", "boolean", "bool"}

    for column in df.columns:
        dtype = df.schema[column]
        if dtype.is_numeric() or dtype.is_temporal() or str(dtype).lower() in simple_names:
            continue

        def _to_text(cell):
            # Nested values (lists/structs) expose to_list/to_dict hooks;
            # anything JSON cannot handle falls back to str().
            try:
                if hasattr(cell, "to_list"):
                    return json.dumps(cell.to_list())
                if hasattr(cell, "to_dict"):
                    return json.dumps(cell.to_dict())
                return json.dumps(cell)
            except Exception:
                return str(cell)

        df = df.with_columns(pl.col(column).map_elements(_to_text, return_dtype=pl.Utf8))

    return df
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write an up-to-*n*-row CSV preview alongside *output_path*; return its path."""
    base, _ = os.path.splitext(output_path)
    preview_path = base + "_preview.csv"
    rows = min(n, len(df))
    _safe_csv_df(df.head(rows)).write_csv(preview_path)
    return preview_path
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Main export function
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
|
|
14
95
|
options = options or {}
|
|
15
|
-
|
|
16
|
-
|
|
96
|
+
t0 = time.perf_counter()
|
|
97
|
+
|
|
98
|
+
# ---- Load ----
|
|
17
99
|
try:
|
|
18
|
-
|
|
19
|
-
df = pl.read_csv(file_path, ignore_errors=True)
|
|
20
|
-
elif file_path.endswith(".parquet"):
|
|
21
|
-
df = pl.read_parquet(file_path)
|
|
22
|
-
else:
|
|
23
|
-
return {"error": f"Unsupported input format: {file_path}"}
|
|
100
|
+
df = _load(file_path, options)
|
|
24
101
|
except Exception as e:
|
|
25
102
|
return {"error": f"Failed to load input file: {str(e)}"}
|
|
26
103
|
|
|
@@ -28,104 +105,123 @@ def export_data(file_path, output_path, format, options=None):
|
|
|
28
105
|
if output_dir and not os.path.exists(output_dir):
|
|
29
106
|
os.makedirs(output_dir, exist_ok=True)
|
|
30
107
|
|
|
108
|
+
preview_path = None
|
|
109
|
+
generate_preview = options.get("preview", False)
|
|
110
|
+
|
|
31
111
|
try:
|
|
32
|
-
#
|
|
33
|
-
if format == "
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return json.dumps(val.to_list())
|
|
47
|
-
if hasattr(val, "to_dict"):
|
|
48
|
-
return json.dumps(val.to_dict())
|
|
49
|
-
return json.dumps(val)
|
|
50
|
-
except:
|
|
51
|
-
return str(val)
|
|
52
|
-
df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
|
|
53
|
-
df.write_csv(output_path)
|
|
54
|
-
|
|
112
|
+
# ---- Feather (Arrow IPC) – fastest binary format ----
|
|
113
|
+
if format == "feather":
|
|
114
|
+
if not HAS_PYARROW:
|
|
115
|
+
return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
|
|
116
|
+
compression = options.get("compression", "lz4")
|
|
117
|
+
if compression in ("uncompressed", "none", "None", None):
|
|
118
|
+
compression = "uncompressed"
|
|
119
|
+
# Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
|
|
120
|
+
arrow_table = df.to_arrow()
|
|
121
|
+
pf.write_feather(arrow_table, output_path, compression=compression)
|
|
122
|
+
if generate_preview:
|
|
123
|
+
preview_path = _write_preview(df, output_path)
|
|
124
|
+
|
|
125
|
+
# ---- Parquet – best compression, big-data friendly ----
|
|
55
126
|
elif format == "parquet":
|
|
56
127
|
compression = options.get("compression", "snappy")
|
|
128
|
+
if compression in ("uncompressed", "none", "None", None):
|
|
129
|
+
compression = "uncompressed"
|
|
57
130
|
df.write_parquet(output_path, compression=compression)
|
|
58
|
-
|
|
131
|
+
if generate_preview:
|
|
132
|
+
preview_path = _write_preview(df, output_path)
|
|
133
|
+
|
|
134
|
+
# ---- CSV – human-readable fallback ----
|
|
135
|
+
elif format == "csv":
|
|
136
|
+
df = _safe_csv_df(df)
|
|
137
|
+
df.write_csv(output_path)
|
|
138
|
+
|
|
139
|
+
# ---- JSONL ----
|
|
59
140
|
elif format == "jsonl":
|
|
60
141
|
df.write_ndjson(output_path)
|
|
61
|
-
|
|
62
|
-
|
|
142
|
+
if generate_preview:
|
|
143
|
+
preview_path = _write_preview(df, output_path)
|
|
144
|
+
|
|
145
|
+
# ---- Arrow IPC (legacy name kept for compat) ----
|
|
146
|
+
elif format in ("arrow", "ipc"):
|
|
63
147
|
compression = options.get("compression", "uncompressed")
|
|
64
|
-
if compression == "uncompressed":
|
|
148
|
+
if compression == "uncompressed":
|
|
149
|
+
compression = None
|
|
65
150
|
df.write_ipc(output_path, compression=compression)
|
|
151
|
+
if generate_preview:
|
|
152
|
+
preview_path = _write_preview(df, output_path)
|
|
66
153
|
|
|
154
|
+
# ---- TFRecord ----
|
|
67
155
|
elif format == "tfrecord":
|
|
68
156
|
if not HAS_TENSORFLOW:
|
|
69
157
|
return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
|
|
70
|
-
|
|
71
|
-
# TFRecord Export Logic (using TensorFlow)
|
|
72
158
|
with tf.io.TFRecordWriter(output_path) as writer:
|
|
73
|
-
# Convert Polars -> Pandas for iteration (simpler for now)
|
|
74
|
-
# TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
|
|
75
159
|
pdf = df.to_pandas()
|
|
76
160
|
for _, row in pdf.iterrows():
|
|
77
161
|
feature = {}
|
|
78
162
|
for col, value in row.items():
|
|
79
163
|
if value is None:
|
|
80
164
|
continue
|
|
81
|
-
|
|
82
|
-
# Type inference for TFRecord features
|
|
83
165
|
if isinstance(value, int):
|
|
84
166
|
feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
|
|
85
167
|
elif isinstance(value, float):
|
|
86
168
|
feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
|
|
87
169
|
elif isinstance(value, str):
|
|
88
|
-
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode(
|
|
170
|
+
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
|
|
89
171
|
elif isinstance(value, bytes):
|
|
90
172
|
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
|
|
91
173
|
else:
|
|
92
|
-
|
|
93
|
-
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
|
|
94
|
-
|
|
174
|
+
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
|
|
95
175
|
example = tf.train.Example(features=tf.train.Features(feature=feature))
|
|
96
176
|
writer.write(example.SerializeToString())
|
|
97
177
|
|
|
98
178
|
else:
|
|
99
179
|
return {"error": f"Unknown export format: {format}"}
|
|
100
180
|
|
|
101
|
-
|
|
181
|
+
elapsed = round(time.perf_counter() - t0, 3)
|
|
182
|
+
file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
|
|
183
|
+
|
|
184
|
+
result = {
|
|
102
185
|
"success": True,
|
|
103
186
|
"output_path": output_path,
|
|
104
187
|
"rows": len(df),
|
|
105
|
-
"
|
|
188
|
+
"columns": len(df.columns),
|
|
189
|
+
"format": format,
|
|
190
|
+
"compression": options.get("compression", "default"),
|
|
191
|
+
"file_size_mb": file_size_mb,
|
|
192
|
+
"elapsed_seconds": elapsed,
|
|
106
193
|
}
|
|
194
|
+
if preview_path:
|
|
195
|
+
result["preview_path"] = preview_path
|
|
196
|
+
|
|
197
|
+
return result
|
|
107
198
|
|
|
108
199
|
except Exception as e:
|
|
109
200
|
return {"error": f"Export failed: {str(e)}"}
|
|
110
201
|
|
|
202
|
+
|
|
111
203
|
def main():
    """CLI entry point: export_engine.py <input> <output> <format> [options_json]."""
    if len(sys.argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    input_file, output_file, fmt = sys.argv[1:4]

    options = {}
    if len(sys.argv) > 4:
        try:
            options = json.loads(sys.argv[4])
        except Exception:
            # Malformed options JSON is ignored; export runs with defaults.
            pass

    print(json.dumps(export_data(input_file, output_file, fmt, options)))


if __name__ == "__main__":
    main()
|