@remnic/cli 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -3
- package/dist/assets/download-datasets.sh +468 -3
- package/dist/index.js +3958 -568
- package/package.json +47 -4
- package/dist/chunk-GAZ3DFWX.js +0 -12027
- package/dist/dist-7DCVQLUB.js +0 -292
package/README.md
CHANGED
|
@@ -55,6 +55,7 @@ kept as a compatibility alias.
|
|
|
55
55
|
|
|
56
56
|
```bash
|
|
57
57
|
remnic bench list
|
|
58
|
+
remnic bench run --quick longmemeval --runtime-profile baseline
|
|
58
59
|
remnic bench datasets status
|
|
59
60
|
remnic bench datasets download longmemeval
|
|
60
61
|
remnic bench datasets download --all
|
|
@@ -63,6 +64,10 @@ remnic bench runs show candidate-run --detail
|
|
|
63
64
|
remnic bench runs delete candidate-run
|
|
64
65
|
remnic bench run --quick longmemeval
|
|
65
66
|
remnic bench run longmemeval --dataset-dir ~/datasets/longmemeval
|
|
67
|
+
remnic bench run longmemeval --runtime-profile real --remnic-config ~/.config/remnic/config.json
|
|
68
|
+
remnic bench run longmemeval --runtime-profile real --system-provider openai --system-model gpt-5.4-mini
|
|
69
|
+
remnic bench run longmemeval --runtime-profile openclaw-chain --openclaw-config ~/.openclaw/openclaw.json --gateway-agent-id memory-primary
|
|
70
|
+
remnic bench run longmemeval --matrix baseline,real,openclaw-chain
|
|
66
71
|
remnic bench compare base-run candidate-run
|
|
67
72
|
remnic bench baseline save main candidate-run
|
|
68
73
|
remnic bench baseline list
|
|
@@ -80,9 +85,16 @@ full runs need a real benchmark dataset. In a repo checkout the CLI will use
|
|
|
80
85
|
`evals/datasets/<benchmark>` automatically; in packaged installs pass
|
|
81
86
|
`--dataset-dir <path>` explicitly.
|
|
82
87
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
88
|
+
Package-backed benchmark runs also write `MANIFEST.json` in the results
|
|
89
|
+
directory. The manifest records result artifact hashes, dataset file hashes,
|
|
90
|
+
fixed seeds, runtime profile/model configuration, git state, QMD collection
|
|
91
|
+
names, selected benchmark environment keys, and config-file hashes. Secret
|
|
92
|
+
argument values are redacted.
|
|
93
|
+
|
|
94
|
+
`remnic bench datasets download` currently manages the published benchmark
|
|
95
|
+
datasets for `ama-bench`, `memory-arena`, `amemgym`, `longmemeval`, `locomo`,
|
|
96
|
+
`beam`, `personamem`, `membench`, and `memoryagentbench`. Internal Remnic
|
|
97
|
+
benchmarks keep their bundled or repo-managed fixtures.
|
|
86
98
|
|
|
87
99
|
## Connecting agents
|
|
88
100
|
|
|
@@ -10,9 +10,9 @@ DATASETS_DIR="${DATASETS_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)/datasets}"
|
|
|
10
10
|
usage() {
|
|
11
11
|
echo "Usage: $0 [--benchmark <name>]"
|
|
12
12
|
echo ""
|
|
13
|
-
echo "Downloads benchmark datasets for the
|
|
13
|
+
echo "Downloads benchmark datasets for the Remnic bench suite."
|
|
14
14
|
echo ""
|
|
15
|
-
echo "Benchmarks: ama-bench, longmemeval, amemgym, locomo, memory-arena, all"
|
|
15
|
+
echo "Benchmarks: ama-bench, longmemeval, amemgym, locomo, memory-arena, beam, personamem, membench, memoryagentbench, all"
|
|
16
16
|
echo ""
|
|
17
17
|
echo "Options:"
|
|
18
18
|
echo " --benchmark <name> Download only the specified benchmark (default: all)"
|
|
@@ -38,6 +38,79 @@ check_deps() {
|
|
|
38
38
|
done
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
PYTHON_BIN=""
|
|
42
|
+
|
|
43
|
+
#######################################
# Check that a Python interpreter can locate every named module.
# Arguments: $1 - Python interpreter to run
#            $2.. - module names to look up (none means nothing to check)
# Outputs:   on failure, one ERROR line naming the missing modules on stderr
# Returns:   0 when every module is found, 1 when at least one is missing
#######################################
python_has_modules() {
  local python_bin="$1"
  shift

  "$python_bin" - "$@" <<'PY'
import sys

try:
    import importlib.util as importlib_util
except Exception:  # pragma: no cover - Python 2 fallback
    importlib_util = None
    import pkgutil


def has_module(name):
    # find_spec()/find_loader() raise instead of returning None in some cases
    # (e.g. a dotted name whose parent package is absent, or a module whose
    # __spec__ is unset). Report those as "missing" rather than crashing with
    # a traceback.
    try:
        if importlib_util is not None:
            return importlib_util.find_spec(name) is not None
        return pkgutil.find_loader(name) is not None
    except (ImportError, ValueError):
        return False


missing = [name for name in sys.argv[1:] if not has_module(name)]
if missing:
    names = ", ".join(missing)
    sys.stderr.write(
        "ERROR: missing required Python module(s): {}. Install them before downloading this dataset.\n".format(
            names
        )
    )
    sys.exit(1)
PY
}
|
|
74
|
+
|
|
75
|
+
#######################################
# Pick a Python interpreter, optionally one that can import given modules.
# Globals:   PYTHON_BIN (read as a cache; written when a candidate is chosen)
# Arguments: $@ - module names the interpreter must provide (may be empty)
# Outputs:   the chosen interpreter on stdout; diagnostics on stderr
# Returns:   0 on success. On failure it calls `exit 1`, which ends the
#            script, or only the subshell when invoked through $(...).
#######################################
resolve_python_bin() {
  # Reuse the cached interpreter when it still satisfies the request.
  if [[ -n "$PYTHON_BIN" ]]; then
    if [[ $# -eq 0 ]] || python_has_modules "$PYTHON_BIN" "$@" >/dev/null 2>&1; then
      printf '%s\n' "$PYTHON_BIN"
      return 0
    fi
  fi

  local candidate
  local found_any=0
  for candidate in python3 python; do
    if ! command -v "$candidate" &>/dev/null; then
      continue
    fi
    found_any=1
    if [[ $# -gt 0 ]] && ! python_has_modules "$candidate" "$@" >/dev/null 2>&1; then
      continue
    fi
    PYTHON_BIN="$candidate"
    printf '%s\n' "$PYTHON_BIN"
    return 0
  done

  # Diagnostics go to stderr: callers capture or discard this function's
  # stdout, so anything written there would never reach the user.
  if [[ $found_any -eq 1 && $# -gt 0 ]]; then
    local names
    names=$(printf '%s, ' "$@")
    names=${names%, }
    echo "ERROR: missing required Python module(s): $names. Install them before downloading this dataset." >&2
    exit 1
  fi

  echo "ERROR: python or python3 is required but not found" >&2
  exit 1
}
|
|
109
|
+
|
|
110
|
+
#######################################
# Ensure an interpreter providing the given modules exists, and cache it.
# Globals:   PYTHON_BIN (written with the interpreter that was resolved)
# Arguments: $@ - required Python module names
# Outputs:   nothing on stdout; resolver diagnostics on stderr
# Returns:   0 on success; exits the script with status 1 otherwise
#######################################
require_python_modules() {
  local resolved
  # The resolver runs in a command substitution so its output can be
  # inspected: on success it is the interpreter, on failure it may hold an
  # error message that would otherwise be thrown away.
  if ! resolved="$(resolve_python_bin "$@")"; then
    if [[ -n "$resolved" ]]; then
      printf '%s\n' "$resolved" >&2
    fi
    exit 1
  fi
  # A command substitution cannot update the cache itself, so store it here.
  PYTHON_BIN="$resolved"
}
|
|
113
|
+
|
|
41
114
|
download_ama_bench() {
|
|
42
115
|
local dir="$DATASETS_DIR/ama-bench"
|
|
43
116
|
if [[ -f "$dir/open_end_qa_set.jsonl" ]]; then
|
|
@@ -153,6 +226,390 @@ download_memory_arena() {
|
|
|
153
226
|
echo "[memory-arena] Downloaded to $dir ($count domains)"
|
|
154
227
|
}
|
|
155
228
|
|
|
229
|
+
#######################################
# Download the BEAM parquet shards from Hugging Face and convert them to JSON.
# Globals:   DATASETS_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 when the four JSON files exist or were written; non-zero when
#            the interpreter lookup or the conversion fails
#######################################
download_beam() {
  local dir="$DATASETS_DIR/beam"
  if [[ -f "$dir/beam_100k.json" && -f "$dir/beam_500k.json" && -f "$dir/beam_1m.json" && -f "$dir/beam_10m.json" ]]; then
    echo "[beam] Already downloaded at $dir"
    return
  fi
  echo "[beam] Downloading from Hugging Face parquet sources (Mohammadta/BEAM, Mohammadta/BEAM-10M)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub pyarrow || return 1
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1

  # Capture the status explicitly so success is never reported after a failed
  # conversion, whether or not the script runs under `set -e`.
  local status=0
  "$python_bin" - "$dir" <<'PY' || status=$?
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)

targets = [
    ("Mohammadta/BEAM", ["data/100K-00000-of-00001.parquet"], "beam_100k.json"),
    ("Mohammadta/BEAM", ["data/500K-00000-of-00001.parquet"], "beam_500k.json"),
    ("Mohammadta/BEAM", ["data/1M-00000-of-00001.parquet"], "beam_1m.json"),
    (
        "Mohammadta/BEAM-10M",
        ["data/10M-00000-of-00002.parquet", "data/10M-00001-of-00002.parquet"],
        "beam_10m.json",
    ),
]


def write_json_atomically(path: Path, payload) -> None:
    # Write next to the target and rename into place, so an interrupted run
    # never leaves a truncated file that later passes the "reuse" check.
    partial = path.with_name(path.name + ".partial")
    try:
        with partial.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False)
        os.replace(partial, path)
    finally:
        if partial.exists():
            partial.unlink()


for repo_id, parquet_files, output_name in targets:
    output_path = out_dir / output_name
    if output_path.exists() and output_path.stat().st_size > 0:
        print(f"[beam] Reusing {output_name}")
        continue

    rows: list[dict] = []
    for parquet_file in parquet_files:
        parquet_path = hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=parquet_file,
        )
        rows.extend(pq.read_table(parquet_path).to_pylist())

    write_json_atomically(output_path, rows)
    print(f"[beam] Wrote {output_name} ({len(rows)} conversations)")
PY
  if [[ $status -ne 0 ]]; then
    echo "[beam] ERROR: download failed (exit $status)" >&2
    return "$status"
  fi
  echo "[beam] Downloaded to $dir"
}
|
|
285
|
+
|
|
286
|
+
#######################################
# Mirror the PersonaMem-v2 benchmark CSV and the chat histories it references.
# Globals:   DATASETS_DIR (read); HF_TOKEN / HUGGINGFACE_HUB_TOKEN (read by
#            the embedded Python and passed to hf_hub_download)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 when the dataset is present or was mirrored; non-zero when the
#            interpreter lookup or the download fails
#######################################
download_personamem() {
  local dir="$DATASETS_DIR/personamem"
  if [[ -f "$dir/benchmark/text/benchmark.csv" ]] \
    && [[ -f "$dir/data/chat_history_32k/.download-complete" ]]; then
    echo "[personamem] Already downloaded at $dir"
    return
  fi
  echo "[personamem] Downloading from Hugging Face (bowen-upenn/PersonaMem-v2)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub || return 1
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1

  # The completion marker must only be written after a successful mirror, so
  # the status is captured explicitly rather than relying on `set -e`.
  local status=0
  "$python_bin" - "$dir" <<'PY' || status=$?
from __future__ import annotations

import csv
import os
import shutil
import sys
import time
from pathlib import Path, PurePosixPath

from huggingface_hub import hf_hub_download

REPO_ID = "bowen-upenn/PersonaMem-v2"
BENCHMARK_PATH = "benchmark/text/benchmark.csv"

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)
out_dir_root = out_dir.resolve()
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")


def resolve_dataset_destination(relative_path: str) -> tuple[str, Path]:
    """Validate a repo-relative path and map it to a file under out_dir.

    Returns (sanitized POSIX path, absolute destination). Raises ValueError
    for empty, absolute, or parent-escaping paths.
    """
    normalized = relative_path.strip().replace("\\", "/")
    if not normalized:
        raise ValueError("dataset path cannot be empty")

    posix_path = PurePosixPath(normalized)
    if posix_path.is_absolute():
        raise ValueError(
            f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
        )

    safe_parts = []
    for part in posix_path.parts:
        if part in ("", "."):
            continue
        if part == "..":
            raise ValueError(
                f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
            )
        safe_parts.append(part)

    if not safe_parts:
        raise ValueError("dataset path cannot resolve to the dataset root")

    # Resolving catches escapes through symlinks that the textual check misses.
    destination = (out_dir / Path(*safe_parts)).resolve()
    try:
        destination.relative_to(out_dir_root)
    except ValueError as exc:
        raise ValueError(
            f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
        ) from exc

    return PurePosixPath(*safe_parts).as_posix(), destination


def copy_dataset_file(relative_path: str) -> Path:
    """Fetch one repo file and place it under out_dir; returns the destination."""
    safe_relative_path, destination = resolve_dataset_destination(relative_path)
    source = Path(
        hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            filename=safe_relative_path,
            token=token,
        )
    )
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Copy to a sibling temp file and rename: the resume logic treats any
    # existing destination as complete, so it must never be a partial copy.
    partial = destination.with_name(destination.name + ".partial")
    try:
        shutil.copy2(source, partial)
        os.replace(partial, destination)
    finally:
        if partial.exists():
            partial.unlink()
    return destination


benchmark_destination = copy_dataset_file(BENCHMARK_PATH)

with benchmark_destination.open("r", encoding="utf8", newline="") as handle:
    reader = csv.DictReader(handle)
    history_paths = sorted(
        {
            (row.get("chat_history_32k_link") or "").strip()
            for row in reader
            if (row.get("chat_history_32k_link") or "").strip()
        }
    )

if not history_paths:
    raise SystemExit("PersonaMem benchmark.csv did not contain any chat_history_32k_link values")

completed = 0
for index, relative_path in enumerate(history_paths, start=1):
    _, destination = resolve_dataset_destination(relative_path)
    if destination.is_file():
        completed += 1
        continue

    # Up to five attempts, sleeping 2, 4, 8, 16 seconds between them.
    for attempt in range(1, 6):
        try:
            copy_dataset_file(relative_path)
            completed += 1
            break
        except Exception as exc:  # noqa: BLE001
            if attempt == 5:
                raise SystemExit(
                    f"failed to download PersonaMem asset {relative_path}: {exc}"
                ) from exc
            delay_seconds = min(30, 2 ** attempt)
            print(
                f"[personamem] Retry {attempt}/5 for {relative_path} after error: {exc}. "
                f"Sleeping {delay_seconds}s..."
            )
            time.sleep(delay_seconds)

    if index % 100 == 0 or index == len(history_paths):
        print(f"[personamem] Downloaded {completed}/{len(history_paths)} chat histories")

print(
    f"[personamem] Mirrored benchmark.csv and {completed} chat histories into {out_dir}"
)
PY
  if [[ $status -ne 0 ]]; then
    echo "[personamem] ERROR: download failed (exit $status)" >&2
    return "$status"
  fi
  # The marker directory is not guaranteed to exist: the history paths come
  # from the CSV, not from this script.
  mkdir -p "$dir/data/chat_history_32k"
  touch "$dir/data/chat_history_32k/.download-complete"
  echo "[personamem] Downloaded to $dir"
}
|
|
418
|
+
|
|
419
|
+
#######################################
# Clone the MemBench repository and normalize its data into membench.json.
# Globals:   DATASETS_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout; new failure diagnostics on stderr
# Returns:   0 when membench.json exists or was written; non-zero when the
#            interpreter lookup, the clone, or the normalization fails
#######################################
download_membench() {
  local dir="$DATASETS_DIR/membench"
  if [[ -f "$dir/membench.json" ]]; then
    echo "[membench] Already downloaded at $dir"
    return
  fi
  echo "[membench] Downloading and normalizing from GitHub (import-myself/Membench)..."
  mkdir -p "$dir"

  # Resolve the interpreter before cloning so a missing Python fails fast and
  # leaves no temporary directory behind. Anything the resolver printed on
  # stdout was captured by the substitution, so replay it on stderr.
  local python_bin
  if ! python_bin="$(resolve_python_bin)"; then
    if [[ -n "$python_bin" ]]; then
      printf '%s\n' "$python_bin" >&2
    fi
    return 1
  fi

  local tmpdir
  tmpdir=$(mktemp -d) || return 1
  if ! git clone --depth 1 https://github.com/import-myself/Membench.git "$tmpdir/repo" 2>/dev/null; then
    echo "[membench] ERROR: Could not clone. Try manually:"
    echo " git clone --depth 1 https://github.com/import-myself/Membench.git /tmp/membench"
    rm -rf -- "$tmpdir"
    return 1
  fi

  # Capture the status so the clone is removed on every path.
  local status=0
  "$python_bin" - "$tmpdir/repo" "$dir/membench.json" <<'PY' || status=$?
from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

repo_root = Path(sys.argv[1])
output_path = Path(sys.argv[2])


def normalize_text(value):
    """Return a stripped string; for lists, the first non-empty element."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        for item in value:
            text = normalize_text(item)
            if text:
                return text
        return ""
    if value is None:
        return ""
    return str(value).strip()


def sanitize_case_id(value: str) -> str:
    """Lower-case and collapse runs of non-alphanumerics into single dashes."""
    return re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")


def iter_qa_entries(value):
    """Accept a single QA dict or a list of them; anything else yields []."""
    if isinstance(value, dict):
        return [value]
    if isinstance(value, list):
        return [entry for entry in value if isinstance(entry, dict)]
    return []


def build_turns(message_list):
    """Flatten sessions of {"user", "assistant"} steps into role/content turns."""
    turns = []
    if not isinstance(message_list, list):
        return turns
    for session in message_list:
        if isinstance(session, dict):
            session = [session]
        if not isinstance(session, list):
            continue
        for step in session:
            if not isinstance(step, dict):
                continue
            user = normalize_text(step.get("user"))
            assistant = normalize_text(step.get("assistant"))
            if user:
                turns.append({"role": "user", "content": user})
            if assistant:
                turns.append({"role": "assistant", "content": assistant})
    return turns


cases = []
source_roots = [
    ("FirstAgent", "participant"),
    ("ThirdAgent", "observation"),
]

for source_root, scenario in source_roots:
    for dataset_path in sorted((repo_root / "MemData" / source_root).glob("*.json")):
        label = dataset_path.stem.lower()
        memory_type = "reflective" if "highlevel" in label else "factual"
        level = "high_level" if memory_type == "reflective" else "low_level"
        document = json.loads(dataset_path.read_text(encoding="utf-8"))

        if not isinstance(document, dict):
            continue

        for group_name, entries in document.items():
            if not isinstance(entries, list):
                continue
            for entry_index, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue

                turns = build_turns(entry.get("message_list") or entry.get("messages"))
                if not turns:
                    continue

                qa_entries = iter_qa_entries(
                    entry.get("QA")
                    or entry.get("qa")
                    or entry.get("qas")
                    or entry.get("question_answers")
                )
                for qa_index, qa in enumerate(qa_entries):
                    question = normalize_text(qa.get("question") or qa.get("query"))
                    answer = normalize_text(qa.get("answer"))
                    if not question or not answer:
                        continue

                    qid = normalize_text(
                        qa.get("qid") or qa.get("id") or qa.get("question_id") or qa_index
                    )
                    raw_id = (
                        f"{source_root}-{dataset_path.stem}-{group_name}-"
                        f"{entry_index}-{qid}"
                    )
                    case_id = sanitize_case_id(raw_id)
                    cases.append(
                        {
                            "id": case_id,
                            "memoryType": memory_type,
                            "scenario": scenario,
                            "level": level,
                            "turns": turns,
                            "question": question,
                            "answer": answer,
                        }
                    )

if not cases:
    raise SystemExit("MemBench normalization produced no runnable cases.")

output_path.parent.mkdir(parents=True, exist_ok=True)
# Write to a sibling file and rename: the shell wrapper treats any existing
# membench.json as a finished download, so it must never be a partial write.
partial_path = output_path.with_name(output_path.name + ".partial")
try:
    with partial_path.open("w", encoding="utf-8") as handle:
        json.dump(cases, handle, ensure_ascii=False)
    os.replace(partial_path, output_path)
finally:
    if partial_path.exists():
        partial_path.unlink()

print(f"[membench] Wrote {output_path.name} ({len(cases)} cases)")
PY
  rm -rf -- "$tmpdir"
  if [[ $status -ne 0 ]]; then
    echo "[membench] ERROR: normalization failed (exit $status)" >&2
    return "$status"
  fi
  echo "[membench] Downloaded to $dir"
}
|
|
562
|
+
|
|
563
|
+
#######################################
# Download the MemoryAgentBench parquet files and convert each to JSON.
# Globals:   DATASETS_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 when the four JSON files exist or were written; non-zero when
#            the interpreter lookup or the conversion fails
#######################################
download_memoryagentbench() {
  local dir="$DATASETS_DIR/memoryagentbench"
  if [[ -f "$dir/Accurate_Retrieval.json" && -f "$dir/Test_Time_Learning.json" && -f "$dir/Long_Range_Understanding.json" && -f "$dir/Conflict_Resolution.json" ]]; then
    echo "[memoryagentbench] Already downloaded at $dir"
    return
  fi
  echo "[memoryagentbench] Downloading from Hugging Face parquet sources (ai-hyz/MemoryAgentBench)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub pyarrow || return 1
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1

  # Capture the status explicitly so success is never reported after a failed
  # conversion, whether or not the script runs under `set -e`.
  local status=0
  "$python_bin" - "$dir" <<'PY' || status=$?
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)

targets = [
    ("data/Accurate_Retrieval-00000-of-00001.parquet", "Accurate_Retrieval.json"),
    ("data/Test_Time_Learning-00000-of-00001.parquet", "Test_Time_Learning.json"),
    ("data/Long_Range_Understanding-00000-of-00001.parquet", "Long_Range_Understanding.json"),
    ("data/Conflict_Resolution-00000-of-00001.parquet", "Conflict_Resolution.json"),
]

for parquet_file, output_name in targets:
    output_path = out_dir / output_name
    if output_path.exists() and output_path.stat().st_size > 0:
        print(f"[memoryagentbench] Reusing {output_name}")
        continue

    parquet_path = hf_hub_download(
        repo_id="ai-hyz/MemoryAgentBench",
        repo_type="dataset",
        filename=parquet_file,
    )
    rows = pq.read_table(parquet_path).to_pylist()

    # Write next to the target and rename into place, so an interrupted run
    # never leaves a truncated file that later passes the "reuse" check.
    partial_path = output_path.with_name(output_name + ".partial")
    try:
        with partial_path.open("w", encoding="utf-8") as handle:
            json.dump(rows, handle, ensure_ascii=False)
        os.replace(partial_path, output_path)
    finally:
        if partial_path.exists():
            partial_path.unlink()
    print(f"[memoryagentbench] Wrote {output_name} ({len(rows)} samples)")
PY
  if [[ $status -ne 0 ]]; then
    echo "[memoryagentbench] ERROR: download failed (exit $status)" >&2
    return "$status"
  fi
  echo "[memoryagentbench] Downloaded to $dir"
}
|
|
612
|
+
|
|
156
613
|
# ── Main ──
|
|
157
614
|
|
|
158
615
|
check_deps
|
|
@@ -164,16 +621,24 @@ case "$BENCHMARK" in
|
|
|
164
621
|
amemgym) download_amemgym ;;
|
|
165
622
|
locomo) download_locomo ;;
|
|
166
623
|
memory-arena) download_memory_arena ;;
|
|
624
|
+
beam) download_beam ;;
|
|
625
|
+
personamem) download_personamem ;;
|
|
626
|
+
membench) download_membench ;;
|
|
627
|
+
memoryagentbench) download_memoryagentbench ;;
|
|
167
628
|
all)
|
|
168
629
|
download_ama_bench
|
|
169
630
|
download_longmemeval
|
|
170
631
|
download_amemgym
|
|
171
632
|
download_locomo
|
|
172
633
|
download_memory_arena
|
|
634
|
+
download_beam
|
|
635
|
+
download_personamem
|
|
636
|
+
download_membench
|
|
637
|
+
download_memoryagentbench
|
|
173
638
|
;;
|
|
174
639
|
*)
|
|
175
640
|
echo "Unknown benchmark: $BENCHMARK"
|
|
176
|
-
echo "Available: ama-bench, longmemeval, amemgym, locomo, memory-arena, all"
|
|
641
|
+
echo "Available: ama-bench, longmemeval, amemgym, locomo, memory-arena, beam, personamem, membench, memoryagentbench, all"
|
|
177
642
|
exit 1
|
|
178
643
|
;;
|
|
179
644
|
esac
|