@vespermcp/mcp-server 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import base64
5
+ import hashlib
6
+ import secrets
7
+ from pathlib import Path
8
+ from typing import Dict, Optional
9
+
10
# Logical service name under which secrets are stored in the OS keyring.
SERVICE_NAME = "vesper"

# Maps a canonical credential name to the environment variables that may
# supply it (checked in order; first non-empty value wins).
KEY_ALIASES = {
    "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
    "kaggle_username": ["KAGGLE_USERNAME"],
    "kaggle_key": ["KAGGLE_KEY"],
}

# Optional backends — the module degrades gracefully when either is missing.
try:
    import keyring  # type: ignore
    HAS_KEYRING = True
except Exception:
    HAS_KEYRING = False

try:
    from cryptography.fernet import Fernet, InvalidToken  # type: ignore
    HAS_FERNET = True
except Exception:
    HAS_FERNET = False
29
+
30
+
31
+ def _config_path() -> Path:
32
+ return Path.home() / ".vesper" / "config.toml"
33
+
34
+
35
+ def _secret_path() -> Path:
36
+ return Path.home() / ".vesper" / ".config_key"
37
+
38
+
39
+ def _ensure_parent(path: Path) -> None:
40
+ path.parent.mkdir(parents=True, exist_ok=True)
41
+
42
+
43
def _read_fallback_toml() -> Dict[str, str]:
    """Parse the minimal fallback TOML written by ``_write_fallback_toml``.

    Returns the (still encrypted) key/value pairs from the ``[keys]`` section,
    plus the cipher method from ``[meta]`` under the reserved ``"__method__"``
    key. Returns ``{}`` when the file does not exist.
    """
    path = _config_path()
    if not path.exists():
        return {}

    values: Dict[str, str] = {}
    in_keys = False
    method = ""

    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("[") and line.endswith("]"):
            in_keys = (line == "[keys]")
            continue
        # Bug fix: only treat "method = ..." as metadata OUTSIDE [keys].
        # Previously this check ran before the section check, so a stored key
        # whose name starts with "method" would have been swallowed as the
        # cipher method instead of being kept as a value.
        if not in_keys and line.startswith("method") and "=" in line:
            method = line.split("=", 1)[1].strip().strip('"').strip("'")
            continue
        if not in_keys or "=" not in line:
            continue

        key, val = line.split("=", 1)
        values[key.strip()] = val.strip().strip('"').strip("'")

    if method:
        values["__method__"] = method

    return values
74
+
75
+
76
def _get_or_create_local_secret() -> str:
    """Return the local encryption secret, creating it on first use.

    The secret doubles as a Fernet key (32 random bytes, urlsafe base64),
    so it is generated with the ``secrets`` CSPRNG.

    Security fix: the file is created with owner-only permissions (0o600)
    atomically via ``os.open``, instead of writing first and chmod-ing
    afterwards — the old order left a window where the secret was readable
    by other local users.
    """
    secret_file = _secret_path()
    _ensure_parent(secret_file)

    if secret_file.exists():
        return secret_file.read_text(encoding="utf-8").strip()

    secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
    try:
        fd = os.open(str(secret_file), os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
    except FileExistsError:
        # Another process won the race; trust its secret.
        return secret_file.read_text(encoding="utf-8").strip()
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        fh.write(secret)
    return secret
90
+
91
+
92
+ def _xor_encrypt(plain: str, secret: str) -> str:
93
+ key = hashlib.sha256(secret.encode("utf-8")).digest()
94
+ data = plain.encode("utf-8")
95
+ out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
96
+ return base64.urlsafe_b64encode(out).decode("utf-8")
97
+
98
+
99
+ def _xor_decrypt(cipher_text: str, secret: str) -> str:
100
+ key = hashlib.sha256(secret.encode("utf-8")).digest()
101
+ data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
102
+ out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
103
+ return out.decode("utf-8")
104
+
105
+
106
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
    """Encrypt *value*, preferring Fernet when cryptography is available.

    Returns ``{"method": ..., "value": ...}`` so the caller can record which
    cipher was used.
    """
    if not HAS_FERNET:
        # fallback encryption (weaker than fernet, but still not plaintext)
        return {"method": "xor", "value": _xor_encrypt(value, secret)}
    token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8"))
    return {"method": "fernet", "value": token.decode("utf-8")}
112
+
113
+
114
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
    """Decrypt *value* using *method* ("fernet" or "xor"); None on failure.

    Bug fix: the original had a dedicated ``except InvalidToken:`` clause.
    When cryptography is not installed, ``InvalidToken`` is never imported,
    so reaching that clause raised ``NameError`` (the exception-class name is
    evaluated at handling time) instead of returning None. A single broad
    handler covers InvalidToken, base64 errors, and bad UTF-8 alike — this
    function's contract is "None on any failure".
    """
    try:
        if method == "fernet" and HAS_FERNET:
            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
        if method == "xor":
            return _xor_decrypt(value, secret)
        return None
    except Exception:
        return None
125
+
126
+
127
def _write_fallback_toml(values: Dict[str, str]) -> None:
    """Serialize encrypted key/value pairs to the fallback TOML file.

    Keys beginning with ``__`` are metadata (e.g. ``__method__``) and are
    written to ``[meta]`` rather than ``[keys]``.
    """
    path = _config_path()
    _ensure_parent(path)

    cipher_method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
    out = [
        "# Vesper optional API keys fallback storage",
        "# Encrypted fallback (keyring is preferred)",
        "[meta]",
        f'method = "{cipher_method}"',
        "[keys]",
    ]
    # NOTE(review): quotes are escaped here but the reader never unescapes
    # them. Harmless for base64/fernet tokens (no quotes possible) — confirm
    # before storing arbitrary strings.
    for name in sorted(values):
        if name.startswith("__"):
            continue
        escaped = str(values[name]).replace('"', '\\"')
        out.append(f'{name} = "{escaped}"')

    path.write_text("\n".join(out) + "\n", encoding="utf-8")
146
+
147
+
148
def _get_from_env(name: str) -> Optional[str]:
    """Return the first non-empty environment-variable alias for *name*."""
    for alias in KEY_ALIASES.get(name, []):
        value = os.environ.get(alias)
        if value:
            return value
    return None
154
+
155
+
156
def get_key(name: str) -> Optional[str]:
    """Resolve *name* from env vars, then the OS keyring, then the encrypted
    TOML fallback. Returns None when no source has a value.
    """
    # Environment variables win so users can override stored credentials.
    from_env = _get_from_env(name)
    if from_env:
        return from_env

    # OS keyring is the preferred persistent store.
    if HAS_KEYRING:
        try:
            stored = keyring.get_password(SERVICE_NAME, name)
        except Exception:
            stored = None  # keyring backend unavailable/broken — fall through
        if stored:
            return stored

    # Last resort: the encrypted fallback file.
    config = _read_fallback_toml()
    cipher_text = config.get(name)
    if not cipher_text:
        return None
    cipher_method = config.get("__method__", "fernet" if HAS_FERNET else "xor")
    return _decrypt_value(cipher_text, cipher_method, _get_or_create_local_secret())
179
+
180
+
181
def set_key(name: str, value: str) -> Dict[str, str]:
    """Persist *value* for *name*: keyring first, encrypted TOML as fallback.

    Returns a status dict whose "ok" field is the string "true"/"false"
    (the CLI layer converts it back to a boolean).
    """
    if not value:
        return {"ok": "false", "method": "none", "error": "Empty value"}

    # Prefer the OS keyring when it is importable and functional.
    if HAS_KEYRING:
        try:
            keyring.set_password(SERVICE_NAME, name, value)
            return {"ok": "true", "method": "keyring"}
        except Exception:
            pass  # fall through to the encrypted file store

    # NOTE(review): __method__ is file-wide; writing a fernet entry into a
    # file whose older entries were xor-encrypted would leave those entries
    # undecodable. Verify whether mixed-method files can occur in practice.
    stored = _read_fallback_toml()
    local_secret = _get_or_create_local_secret()
    encrypted = _encrypt_value(value, local_secret)
    stored["__method__"] = encrypted["method"]
    stored[name] = encrypted["value"]
    _write_fallback_toml(stored)
    return {"ok": "true", "method": f'toml:{encrypted["method"]}'}
199
+
200
+
201
def has_key(name: str) -> bool:
    """Return True when a non-empty value is resolvable for *name*."""
    stored = get_key(name)
    return stored is not None and stored != ""
203
+
204
+
205
def get_all() -> Dict[str, Optional[str]]:
    """Resolve every known credential (value is None when not configured)."""
    names = ("hf_token", "kaggle_username", "kaggle_key")
    return {name: get_key(name) for name in names}
211
+
212
+
213
+ def _print_json(data):
214
+ print(json.dumps(data))
215
+
216
+
217
def main() -> None:
    """CLI dispatcher: prints exactly one JSON object; exits 1 on misuse."""
    args = sys.argv
    if len(args) < 2:
        _print_json({
            "ok": False,
            "error": "Usage: config.py <get|set|has|all> [name] [value]",
        })
        sys.exit(1)

    command = args[1].lower()

    if command == "all":
        _print_json({"ok": True, "data": get_all()})
        return

    # Every remaining command requires a key name.
    if len(args) < 3:
        _print_json({"ok": False, "error": "Missing key name"})
        sys.exit(1)

    key_name = args[2]

    if command == "get":
        _print_json({"ok": True, "name": key_name, "value": get_key(key_name)})
        return
    if command == "has":
        _print_json({"ok": True, "name": key_name, "value": has_key(key_name)})
        return
    if command == "set":
        if len(args) < 4:
            _print_json({"ok": False, "error": "Missing value for set"})
            sys.exit(1)
        outcome = set_key(key_name, args[3])
        _print_json({
            "ok": outcome.get("ok") == "true",
            "name": key_name,
            "method": outcome.get("method"),
            "error": outcome.get("error"),
        })
        return

    _print_json({"ok": False, "error": f"Unknown command: {command}"})
    sys.exit(1)
256
+
257
+
258
if __name__ == "__main__":  # script entry point
    main()
@@ -2,25 +2,102 @@ import sys
2
2
  import json
3
3
  import polars as pl
4
4
  import os
5
+ import time
6
+
7
+ # Optional imports for extra formats
8
+ try:
9
+ import pyarrow as pa
10
+ import pyarrow.feather as pf
11
+ HAS_PYARROW = True
12
+ except ImportError:
13
+ HAS_PYARROW = False
5
14
 
6
- # Optional TensorFlow import for TFRecord support
7
15
  try:
8
16
  import tensorflow as tf
9
17
  HAS_TENSORFLOW = True
10
18
  except ImportError:
11
19
  HAS_TENSORFLOW = False
12
20
 
13
- def export_data(file_path, output_path, format, options=None):
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Helpers
24
+ # ---------------------------------------------------------------------------
25
+
26
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Honors two optional keys in *options*: "columns" (select a subset before
    sampling, unknown names silently ignored) and "sample_rows" (random
    sample with "random_seed", default 42). Raises ValueError on an
    unsupported extension.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext in (".jsonl", ".ndjson"):
        # Generalization: accept ".ndjson" too — it is the same
        # newline-delimited-JSON format pl.read_ndjson already parses.
        df = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    # Column selection (before sampling for speed)
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional sampling
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
55
+
56
+
57
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Stringify complex columns so CSV doesn't choke.

    Numeric, temporal, string and boolean columns pass through untouched;
    everything else (List, Struct, Object, ...) is JSON-serialized per cell.
    """
    simple_names = ("string", "utf8", "boolean", "bool")

    # One serializer shared by every complex column (hoisted out of the loop).
    def _to_json_text(cell):
        try:
            if hasattr(cell, "to_list"):
                return json.dumps(cell.to_list())
            if hasattr(cell, "to_dict"):
                return json.dumps(cell.to_dict())
            return json.dumps(cell)
        except Exception:
            return str(cell)

    for name in df.columns:
        dtype = df.schema[name]
        if dtype.is_numeric() or dtype.is_temporal() or str(dtype).lower() in simple_names:
            continue
        df = df.with_columns(
            pl.col(name).map_elements(_to_json_text, return_dtype=pl.Utf8)
        )
    return df
80
+
81
+
82
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write a small CSV preview next to the exported file; return its path."""
    stem, _ = os.path.splitext(output_path)
    preview_path = stem + "_preview.csv"
    head = df.head(min(n, len(df)))
    _safe_csv_df(head).write_csv(preview_path)
    return preview_path
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Main export function
92
+ # ---------------------------------------------------------------------------
93
+
94
+ def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
14
95
  options = options or {}
15
-
16
- # Load Data
96
+ t0 = time.perf_counter()
97
+
98
+ # ---- Load ----
17
99
  try:
18
- if file_path.endswith(".csv"):
19
- df = pl.read_csv(file_path, ignore_errors=True)
20
- elif file_path.endswith(".parquet"):
21
- df = pl.read_parquet(file_path)
22
- else:
23
- return {"error": f"Unsupported input format: {file_path}"}
100
+ df = _load(file_path, options)
24
101
  except Exception as e:
25
102
  return {"error": f"Failed to load input file: {str(e)}"}
26
103
 
@@ -28,104 +105,123 @@ def export_data(file_path, output_path, format, options=None):
28
105
  if output_dir and not os.path.exists(output_dir):
29
106
  os.makedirs(output_dir, exist_ok=True)
30
107
 
108
+ preview_path = None
109
+ generate_preview = options.get("preview", False)
110
+
31
111
  try:
32
- # Export Logic
33
- if format == "csv":
34
- # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
35
- for col in df.columns:
36
- dtype = df.schema[col]
37
- is_simple = (
38
- dtype.is_numeric() or
39
- dtype.is_temporal() or
40
- str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
41
- )
42
- if not is_simple:
43
- def safe_serialize(val):
44
- try:
45
- if hasattr(val, "to_list"):
46
- return json.dumps(val.to_list())
47
- if hasattr(val, "to_dict"):
48
- return json.dumps(val.to_dict())
49
- return json.dumps(val)
50
- except:
51
- return str(val)
52
- df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
53
- df.write_csv(output_path)
54
-
112
+ # ---- Feather (Arrow IPC) – fastest binary format ----
113
+ if format == "feather":
114
+ if not HAS_PYARROW:
115
+ return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
116
+ compression = options.get("compression", "lz4")
117
+ if compression in ("uncompressed", "none", "None", None):
118
+ compression = "uncompressed"
119
+ # Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
120
+ arrow_table = df.to_arrow()
121
+ pf.write_feather(arrow_table, output_path, compression=compression)
122
+ if generate_preview:
123
+ preview_path = _write_preview(df, output_path)
124
+
125
+ # ---- Parquet – best compression, big-data friendly ----
55
126
  elif format == "parquet":
56
127
  compression = options.get("compression", "snappy")
128
+ if compression in ("uncompressed", "none", "None", None):
129
+ compression = "uncompressed"
57
130
  df.write_parquet(output_path, compression=compression)
58
-
131
+ if generate_preview:
132
+ preview_path = _write_preview(df, output_path)
133
+
134
+ # ---- CSV – human-readable fallback ----
135
+ elif format == "csv":
136
+ df = _safe_csv_df(df)
137
+ df.write_csv(output_path)
138
+
139
+ # ---- JSONL ----
59
140
  elif format == "jsonl":
60
141
  df.write_ndjson(output_path)
61
-
62
- elif format == "arrow" or format == "ipc":
142
+ if generate_preview:
143
+ preview_path = _write_preview(df, output_path)
144
+
145
+ # ---- Arrow IPC (legacy name kept for compat) ----
146
+ elif format in ("arrow", "ipc"):
63
147
  compression = options.get("compression", "uncompressed")
64
- if compression == "uncompressed": compression = None
148
+ if compression == "uncompressed":
149
+ compression = None
65
150
  df.write_ipc(output_path, compression=compression)
151
+ if generate_preview:
152
+ preview_path = _write_preview(df, output_path)
66
153
 
154
+ # ---- TFRecord ----
67
155
  elif format == "tfrecord":
68
156
  if not HAS_TENSORFLOW:
69
157
  return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
70
-
71
- # TFRecord Export Logic (using TensorFlow)
72
158
  with tf.io.TFRecordWriter(output_path) as writer:
73
- # Convert Polars -> Pandas for iteration (simpler for now)
74
- # TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
75
159
  pdf = df.to_pandas()
76
160
  for _, row in pdf.iterrows():
77
161
  feature = {}
78
162
  for col, value in row.items():
79
163
  if value is None:
80
164
  continue
81
-
82
- # Type inference for TFRecord features
83
165
  if isinstance(value, int):
84
166
  feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
85
167
  elif isinstance(value, float):
86
168
  feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
87
169
  elif isinstance(value, str):
88
- feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
170
+ feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
89
171
  elif isinstance(value, bytes):
90
172
  feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
91
173
  else:
92
- # Fallback to string for unknown types
93
- feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
94
-
174
+ feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
95
175
  example = tf.train.Example(features=tf.train.Features(feature=feature))
96
176
  writer.write(example.SerializeToString())
97
177
 
98
178
  else:
99
179
  return {"error": f"Unknown export format: {format}"}
100
180
 
101
- return {
181
+ elapsed = round(time.perf_counter() - t0, 3)
182
+ file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
183
+
184
+ result = {
102
185
  "success": True,
103
186
  "output_path": output_path,
104
187
  "rows": len(df),
105
- "format": format
188
+ "columns": len(df.columns),
189
+ "format": format,
190
+ "compression": options.get("compression", "default"),
191
+ "file_size_mb": file_size_mb,
192
+ "elapsed_seconds": elapsed,
106
193
  }
194
+ if preview_path:
195
+ result["preview_path"] = preview_path
196
+
197
+ return result
107
198
 
108
199
  except Exception as e:
109
200
  return {"error": f"Export failed: {str(e)}"}
110
201
 
202
+
111
203
def main():
    """CLI wrapper around export_data: argv in, one JSON object on stdout."""
    argv = sys.argv
    if len(argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    input_file, output_file, fmt = argv[1], argv[2], argv[3]

    # Optional 5th argument: a JSON blob of export options (ignored if invalid).
    options = {}
    if len(argv) > 4:
        try:
            options = json.loads(argv[4])
        except Exception:
            pass

    print(json.dumps(export_data(input_file, output_file, fmt, options)))
129
224
 
225
+
130
226
if __name__ == "__main__":  # script entry point
    main()