@remnic/cli 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -55,6 +55,7 @@ kept as a compatibility alias.
55
55
 
56
56
  ```bash
57
57
  remnic bench list
58
+ remnic bench run --quick longmemeval --runtime-profile baseline
58
59
  remnic bench datasets status
59
60
  remnic bench datasets download longmemeval
60
61
  remnic bench datasets download --all
@@ -63,6 +64,10 @@ remnic bench runs show candidate-run --detail
63
64
  remnic bench runs delete candidate-run
64
65
  remnic bench run --quick longmemeval
65
66
  remnic bench run longmemeval --dataset-dir ~/datasets/longmemeval
67
+ remnic bench run longmemeval --runtime-profile real --remnic-config ~/.config/remnic/config.json
68
+ remnic bench run longmemeval --runtime-profile real --system-provider openai --system-model gpt-5.4-mini
69
+ remnic bench run longmemeval --runtime-profile openclaw-chain --openclaw-config ~/.openclaw/openclaw.json --gateway-agent-id memory-primary
70
+ remnic bench run longmemeval --matrix baseline,real,openclaw-chain
66
71
  remnic bench compare base-run candidate-run
67
72
  remnic bench baseline save main candidate-run
68
73
  remnic bench baseline list
@@ -80,9 +85,16 @@ full runs need a real benchmark dataset. In a repo checkout the CLI will use
80
85
  `evals/datasets/<benchmark>` automatically; in packaged installs pass
81
86
  `--dataset-dir <path>` explicitly.
82
87
 
83
- `remnic bench datasets download` currently manages the script-backed published
84
- datasets for `ama-bench`, `memory-arena`, `amemgym`, `longmemeval`, and `locomo`.
85
- Other benchmark fixtures remain repo-managed or need manual dataset wiring.
88
+ Package-backed benchmark runs also write `MANIFEST.json` in the results
89
+ directory. The manifest records result artifact hashes, dataset file hashes,
90
+ fixed seeds, runtime profile/model configuration, git state, QMD collection
91
+ names, selected benchmark environment keys, and config-file hashes. Secret
92
+ argument values are redacted.
93
+
94
+ `remnic bench datasets download` currently manages the published benchmark
95
+ datasets for `ama-bench`, `memory-arena`, `amemgym`, `longmemeval`, `locomo`,
96
+ `beam`, `personamem`, `membench`, and `memoryagentbench`. Internal Remnic
97
+ benchmarks keep their bundled or repo-managed fixtures.
86
98
 
87
99
  ## Connecting agents
88
100
 
@@ -10,9 +10,9 @@ DATASETS_DIR="${DATASETS_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)/datasets}"
10
10
  usage() {
11
11
  echo "Usage: $0 [--benchmark <name>]"
12
12
  echo ""
13
- echo "Downloads benchmark datasets for the Engram eval suite."
13
+ echo "Downloads benchmark datasets for the Remnic bench suite."
14
14
  echo ""
15
- echo "Benchmarks: ama-bench, longmemeval, amemgym, locomo, memory-arena, all"
15
+ echo "Benchmarks: ama-bench, longmemeval, amemgym, locomo, memory-arena, beam, personamem, membench, memoryagentbench, all"
16
16
  echo ""
17
17
  echo "Options:"
18
18
  echo " --benchmark <name> Download only the specified benchmark (default: all)"
@@ -38,6 +38,79 @@ check_deps() {
38
38
  done
39
39
  }
40
40
 
41
+ PYTHON_BIN=""
42
+
43
#######################################
# Check that a Python interpreter can locate every named module.
# Arguments:
#   $1   - interpreter command to run (for example python3)
#   $2.. - module names to look up
# Outputs:
#   On failure, a single "ERROR: missing required Python module(s)" line
#   on stderr listing the modules that were not found.
# Returns:
#   0 when every module is found, non-zero otherwise.
#######################################
python_has_modules() {
  local python_bin="$1"
  shift

  "$python_bin" - "$@" <<'PY'
import sys

try:
    import importlib.util as importlib_util
except Exception:  # pragma: no cover - Python 2 fallback
    importlib_util = None
    import pkgutil


def has_module(name):
    # Looking up a dotted name imports the parent package first and raises
    # (instead of returning None) when that parent is absent. Any lookup
    # error means the module is unusable, so report it as missing rather
    # than letting a traceback replace the friendly message below.
    try:
        if importlib_util is not None:
            return importlib_util.find_spec(name) is not None
        return pkgutil.find_loader(name) is not None
    except Exception:
        return False


missing = [name for name in sys.argv[1:] if not has_module(name)]
if missing:
    names = ", ".join(missing)
    sys.stderr.write(
        "ERROR: missing required Python module(s): {}. Install them before downloading this dataset.\n".format(
            names
        )
    )
    sys.exit(1)
PY
}
74
+
75
#######################################
# Pick a Python interpreter, optionally one that provides given modules.
# Globals:
#   PYTHON_BIN (read; written with the chosen interpreter when the function
#   runs in the current shell - an assignment made inside $(...) is lost)
# Arguments:
#   $@ - optional module names the interpreter must be able to locate
# Outputs:
#   stdout: the interpreter command name, and nothing else
#   stderr: an ERROR line when no suitable interpreter exists
# Returns:
#   0 on success. On failure it calls `exit 1`, which ends the whole script
#   when called directly, but only the subshell when called through $(...).
#######################################
resolve_python_bin() {
  # Fast path: reuse the cached interpreter if it still satisfies the request.
  if [[ -n "${PYTHON_BIN:-}" ]]; then
    if [[ $# -eq 0 ]] || python_has_modules "$PYTHON_BIN" "$@" >/dev/null 2>&1; then
      printf '%s\n' "$PYTHON_BIN"
      return 0
    fi
  fi

  local candidate
  local found_any=0
  for candidate in python3 python; do
    if ! command -v "$candidate" &>/dev/null; then
      continue
    fi
    found_any=1
    if [[ $# -gt 0 ]] && ! python_has_modules "$candidate" "$@" >/dev/null 2>&1; then
      continue
    fi
    PYTHON_BIN="$candidate"
    printf '%s\n' "$PYTHON_BIN"
    return 0
  done

  # Diagnostics go to stderr: callers either capture stdout as the
  # interpreter name or send it to /dev/null, so an error written to stdout
  # would never reach the user.
  if [[ $found_any -eq 1 && $# -gt 0 ]]; then
    local names
    names=$(printf '%s, ' "$@")
    names=${names%, }
    echo "ERROR: missing required Python module(s): $names. Install them before downloading this dataset." >&2
    exit 1
  fi

  echo "ERROR: python or python3 is required but not found" >&2
  exit 1
}
109
+
110
#######################################
# Ensure an interpreter providing the given Python modules is available.
# Runs the resolver in the current shell (no command substitution), so the
# interpreter it selects stays cached in PYTHON_BIN for later lookups.
# Arguments:
#   $@ - module names that must be locatable
# Returns:
#   The resolver's status; the resolver exits the script on failure.
#######################################
require_python_modules() {
  # Only the exit status and the PYTHON_BIN cache matter here; the
  # interpreter name printed on stdout is dropped.
  resolve_python_bin "$@" 1>/dev/null
}
113
+
41
114
  download_ama_bench() {
42
115
  local dir="$DATASETS_DIR/ama-bench"
43
116
  if [[ -f "$dir/open_end_qa_set.jsonl" ]]; then
@@ -153,6 +226,390 @@ download_memory_arena() {
153
226
  echo "[memory-arena] Downloaded to $dir ($count domains)"
154
227
  }
155
228
 
229
#######################################
# Download the BEAM benchmark parquet files from Hugging Face and convert
# each split into a JSON array under $DATASETS_DIR/beam.
# Globals:
#   DATASETS_DIR (read)
# Outputs:
#   Progress on stdout, failures on stderr.
# Returns:
#   0 when all four splits are present, non-zero when the download fails.
#######################################
download_beam() {
  local dir="$DATASETS_DIR/beam"
  # -s (exists and non-empty) matches the per-file reuse check in the Python
  # step, so a zero-byte leftover is not mistaken for a finished download.
  if [[ -s "$dir/beam_100k.json" && -s "$dir/beam_500k.json" && -s "$dir/beam_1m.json" && -s "$dir/beam_10m.json" ]]; then
    echo "[beam] Already downloaded at $dir"
    return
  fi
  echo "[beam] Downloading from Hugging Face parquet sources (Mohammadta/BEAM, Mohammadta/BEAM-10M)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub pyarrow
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1
  # Check the status explicitly so a failed conversion is never reported as
  # "Downloaded", whether or not errexit is enabled in the enclosing script.
  if ! "$python_bin" - "$dir" <<'PY'
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)

targets = [
    ("Mohammadta/BEAM", ["data/100K-00000-of-00001.parquet"], "beam_100k.json"),
    ("Mohammadta/BEAM", ["data/500K-00000-of-00001.parquet"], "beam_500k.json"),
    ("Mohammadta/BEAM", ["data/1M-00000-of-00001.parquet"], "beam_1m.json"),
    (
        "Mohammadta/BEAM-10M",
        ["data/10M-00000-of-00002.parquet", "data/10M-00001-of-00002.parquet"],
        "beam_10m.json",
    ),
]

for repo_id, parquet_files, output_name in targets:
    output_path = out_dir / output_name
    if output_path.exists() and output_path.stat().st_size > 0:
        print(f"[beam] Reusing {output_name}")
        continue

    rows: list[dict] = []
    for parquet_file in parquet_files:
        parquet_path = hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=parquet_file,
        )
        rows.extend(pq.read_table(parquet_path).to_pylist())

    # Write next to the target and rename into place: an interrupted run
    # then leaves only a ".partial" file, never a truncated JSON that the
    # reuse checks would accept on the next invocation.
    partial_path = output_path.with_name(output_name + ".partial")
    with partial_path.open("w", encoding="utf-8") as handle:
        json.dump(rows, handle, ensure_ascii=False)
    os.replace(partial_path, output_path)
    print(f"[beam] Wrote {output_name} ({len(rows)} conversations)")
PY
  then
    echo "[beam] ERROR: download or conversion failed" >&2
    return 1
  fi
  echo "[beam] Downloaded to $dir"
}
285
+
286
#######################################
# Mirror the PersonaMem-v2 benchmark CSV and every chat history it
# references from Hugging Face into $DATASETS_DIR/personamem.
# Globals:
#   DATASETS_DIR (read)
#   HF_TOKEN / HUGGINGFACE_HUB_TOKEN (read by the Python step, optional)
# Outputs:
#   Progress on stdout, failures on stderr.
# Returns:
#   0 once the completion marker is written, non-zero on failure.
#######################################
download_personamem() {
  local dir="$DATASETS_DIR/personamem"
  local marker_dir="$dir/data/chat_history_32k"
  if [[ -f "$dir/benchmark/text/benchmark.csv" ]] \
    && [[ -f "$marker_dir/.download-complete" ]]; then
    echo "[personamem] Already downloaded at $dir"
    return
  fi
  echo "[personamem] Downloading from Hugging Face (bowen-upenn/PersonaMem-v2)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1
  # The completion marker must only be written after a fully successful
  # mirror, so test the status here instead of relying on errexit.
  if ! "$python_bin" - "$dir" <<'PY'
from __future__ import annotations

import csv
import os
import shutil
import sys
import time
from pathlib import Path, PurePosixPath

from huggingface_hub import hf_hub_download

REPO_ID = "bowen-upenn/PersonaMem-v2"
BENCHMARK_PATH = "benchmark/text/benchmark.csv"

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)
out_dir_root = out_dir.resolve()
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")


def resolve_dataset_destination(relative_path: str) -> tuple[str, Path]:
    """Validate a repo-relative path and map it to a file under out_dir.

    Returns the cleaned POSIX path (used as the Hub filename) and the local
    destination. Raises ValueError for empty, absolute, or parent-escaping
    paths so a CSV value cannot direct a write outside the dataset root.
    """
    normalized = relative_path.strip().replace("\\", "/")
    if not normalized:
        raise ValueError("dataset path cannot be empty")

    posix_path = PurePosixPath(normalized)
    if posix_path.is_absolute():
        raise ValueError(
            f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
        )

    safe_parts = []
    for part in posix_path.parts:
        if part in ("", "."):
            continue
        if part == "..":
            raise ValueError(
                f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
            )
        safe_parts.append(part)

    if not safe_parts:
        raise ValueError("dataset path cannot resolve to the dataset root")

    # Second line of defence: after resolving symlinks the destination must
    # still be inside the dataset root.
    destination = (out_dir / Path(*safe_parts)).resolve()
    try:
        destination.relative_to(out_dir_root)
    except ValueError as exc:
        raise ValueError(
            f'PersonaMem dataset file reference "{relative_path}" must stay within dataset root.'
        ) from exc

    return PurePosixPath(*safe_parts).as_posix(), destination


def copy_dataset_file(relative_path: str) -> Path:
    """Fetch one repo file via the Hub cache and copy it under out_dir."""
    safe_relative_path, destination = resolve_dataset_destination(relative_path)
    source = Path(
        hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            filename=safe_relative_path,
            token=token,
        )
    )
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Copy to a sibling temp name and rename: the resume logic below skips
    # any destination that already exists, so a half-copied file must never
    # appear under its final name.
    partial = destination.with_name(destination.name + ".partial")
    shutil.copy2(source, partial)
    os.replace(partial, destination)
    return destination


benchmark_destination = copy_dataset_file(BENCHMARK_PATH)

with benchmark_destination.open("r", encoding="utf8", newline="") as handle:
    reader = csv.DictReader(handle)
    history_paths = sorted(
        {
            (row.get("chat_history_32k_link") or "").strip()
            for row in reader
            if (row.get("chat_history_32k_link") or "").strip()
        }
    )

if not history_paths:
    raise SystemExit("PersonaMem benchmark.csv did not contain any chat_history_32k_link values")

completed = 0
for index, relative_path in enumerate(history_paths, start=1):
    _, destination = resolve_dataset_destination(relative_path)
    if destination.is_file():
        # Already mirrored by an earlier (possibly interrupted) run.
        completed += 1
        continue

    # Up to five attempts with capped exponential backoff (2, 4, 8, 16 s).
    for attempt in range(1, 6):
        try:
            copy_dataset_file(relative_path)
            completed += 1
            break
        except Exception as exc:  # noqa: BLE001
            if attempt == 5:
                raise SystemExit(
                    f"failed to download PersonaMem asset {relative_path}: {exc}"
                ) from exc
            delay_seconds = min(30, 2 ** attempt)
            print(
                f"[personamem] Retry {attempt}/5 for {relative_path} after error: {exc}. "
                f"Sleeping {delay_seconds}s..."
            )
            time.sleep(delay_seconds)

    if index % 100 == 0 or index == len(history_paths):
        print(f"[personamem] Downloaded {completed}/{len(history_paths)} chat histories")

print(
    f"[personamem] Mirrored benchmark.csv and {completed} chat histories into {out_dir}"
)
PY
  then
    echo "[personamem] ERROR: download failed; completion marker not written" >&2
    return 1
  fi
  # The chat-history paths come from the CSV, so make sure the marker's
  # directory exists before touching the marker.
  mkdir -p "$marker_dir"
  touch "$marker_dir/.download-complete"
  echo "[personamem] Downloaded to $dir"
}
418
+
419
#######################################
# Clone the MemBench repository and normalize its MemData JSON files into a
# single $DATASETS_DIR/membench/membench.json case list.
# Globals:
#   DATASETS_DIR (read)
# Outputs:
#   Progress on stdout, failures on stderr.
# Returns:
#   0 on success, non-zero when the clone or the normalization fails.
#######################################
download_membench() {
  local dir="$DATASETS_DIR/membench"
  # -s: a zero-byte file is not a finished download.
  if [[ -s "$dir/membench.json" ]]; then
    echo "[membench] Already downloaded at $dir"
    return
  fi
  echo "[membench] Downloading and normalizing from GitHub (import-myself/Membench)..."
  mkdir -p "$dir"

  # Resolve the interpreter before cloning so a missing Python neither
  # wastes the clone nor leaves the temporary directory behind.
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1

  local tmpdir
  tmpdir=$(mktemp -d) || {
    echo "[membench] ERROR: could not create a temporary directory" >&2
    return 1
  }
  git clone --depth 1 https://github.com/import-myself/Membench.git "$tmpdir/repo" 2>/dev/null || {
    echo "[membench] ERROR: Could not clone. Try manually:" >&2
    echo "  git clone --depth 1 https://github.com/import-myself/Membench.git /tmp/membench" >&2
    rm -rf -- "$tmpdir"
    return 1
  }

  # Capture the status instead of letting errexit abort here, so the
  # temporary clone is removed on failure as well as on success.
  local status=0
  "$python_bin" - "$tmpdir/repo" "$dir/membench.json" <<'PY' || status=$?
from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

repo_root = Path(sys.argv[1])
output_path = Path(sys.argv[2])


def normalize_text(value):
    """Return stripped text; for a list, the first non-empty item's text."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        for item in value:
            text = normalize_text(item)
            if text:
                return text
        return ""
    if value is None:
        return ""
    return str(value).strip()


def sanitize_case_id(value: str) -> str:
    """Lower-case and collapse every non-alphanumeric run into one dash."""
    return re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")


def iter_qa_entries(value):
    """Accept a single QA dict or a list of them; ignore anything else."""
    if isinstance(value, dict):
        return [value]
    if isinstance(value, list):
        return [entry for entry in value if isinstance(entry, dict)]
    return []


def build_turns(message_list):
    """Flatten sessions of {user, assistant} steps into role/content turns."""
    turns = []
    if not isinstance(message_list, list):
        return turns
    for session in message_list:
        # A bare step dict is treated as a one-step session.
        if isinstance(session, dict):
            session = [session]
        if not isinstance(session, list):
            continue
        for step in session:
            if not isinstance(step, dict):
                continue
            user = normalize_text(step.get("user"))
            assistant = normalize_text(step.get("assistant"))
            if user:
                turns.append({"role": "user", "content": user})
            if assistant:
                turns.append({"role": "assistant", "content": assistant})
    return turns


cases = []
source_roots = [
    ("FirstAgent", "participant"),
    ("ThirdAgent", "observation"),
]

for source_root, scenario in source_roots:
    for dataset_path in sorted((repo_root / "MemData" / source_root).glob("*.json")):
        label = dataset_path.stem.lower()
        memory_type = "reflective" if "highlevel" in label else "factual"
        level = "high_level" if memory_type == "reflective" else "low_level"
        document = json.loads(dataset_path.read_text(encoding="utf-8"))

        if not isinstance(document, dict):
            continue

        for group_name, entries in document.items():
            if not isinstance(entries, list):
                continue
            for entry_index, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue

                turns = build_turns(entry.get("message_list") or entry.get("messages"))
                if not turns:
                    continue

                qa_entries = iter_qa_entries(
                    entry.get("QA")
                    or entry.get("qa")
                    or entry.get("qas")
                    or entry.get("question_answers")
                )
                for qa_index, qa in enumerate(qa_entries):
                    question = normalize_text(qa.get("question") or qa.get("query"))
                    answer = normalize_text(qa.get("answer"))
                    if not question or not answer:
                        continue

                    qid = normalize_text(
                        qa.get("qid") or qa.get("id") or qa.get("question_id") or qa_index
                    )
                    raw_id = (
                        f"{source_root}-{dataset_path.stem}-{group_name}-"
                        f"{entry_index}-{qid}"
                    )
                    case_id = sanitize_case_id(raw_id)
                    cases.append(
                        {
                            "id": case_id,
                            "memoryType": memory_type,
                            "scenario": scenario,
                            "level": level,
                            "turns": turns,
                            "question": question,
                            "answer": answer,
                        }
                    )

if not cases:
    raise SystemExit("MemBench normalization produced no runnable cases.")

output_path.parent.mkdir(parents=True, exist_ok=True)
# Write to a sibling temp file and rename: the shell-side "already
# downloaded" check only looks at the final name, so a truncated file must
# never appear there.
partial_path = output_path.with_name(output_path.name + ".partial")
with partial_path.open("w", encoding="utf-8") as handle:
    json.dump(cases, handle, ensure_ascii=False)
os.replace(partial_path, output_path)

print(f"[membench] Wrote {output_path.name} ({len(cases)} cases)")
PY
  rm -rf -- "$tmpdir"
  if (( status != 0 )); then
    echo "[membench] ERROR: normalization failed" >&2
    return "$status"
  fi
  echo "[membench] Downloaded to $dir"
}
562
+
563
#######################################
# Download the MemoryAgentBench parquet splits from Hugging Face and convert
# each one into a JSON array under $DATASETS_DIR/memoryagentbench.
# Globals:
#   DATASETS_DIR (read)
# Outputs:
#   Progress on stdout, failures on stderr.
# Returns:
#   0 when all four splits are present, non-zero when the download fails.
#######################################
download_memoryagentbench() {
  local dir="$DATASETS_DIR/memoryagentbench"
  # -s (exists and non-empty) matches the per-file reuse check in the Python
  # step, so a zero-byte leftover is not mistaken for a finished download.
  if [[ -s "$dir/Accurate_Retrieval.json" && -s "$dir/Test_Time_Learning.json" && -s "$dir/Long_Range_Understanding.json" && -s "$dir/Conflict_Resolution.json" ]]; then
    echo "[memoryagentbench] Already downloaded at $dir"
    return
  fi
  echo "[memoryagentbench] Downloading from Hugging Face parquet sources (ai-hyz/MemoryAgentBench)..."
  mkdir -p "$dir"
  require_python_modules huggingface_hub pyarrow
  local python_bin
  python_bin="$(resolve_python_bin)" || return 1
  # Check the status explicitly so a failed conversion is never reported as
  # "Downloaded", whether or not errexit is enabled in the enclosing script.
  if ! "$python_bin" - "$dir" <<'PY'
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download

out_dir = Path(sys.argv[1])
out_dir.mkdir(parents=True, exist_ok=True)

targets = [
    ("data/Accurate_Retrieval-00000-of-00001.parquet", "Accurate_Retrieval.json"),
    ("data/Test_Time_Learning-00000-of-00001.parquet", "Test_Time_Learning.json"),
    ("data/Long_Range_Understanding-00000-of-00001.parquet", "Long_Range_Understanding.json"),
    ("data/Conflict_Resolution-00000-of-00001.parquet", "Conflict_Resolution.json"),
]

for parquet_file, output_name in targets:
    output_path = out_dir / output_name
    if output_path.exists() and output_path.stat().st_size > 0:
        print(f"[memoryagentbench] Reusing {output_name}")
        continue

    parquet_path = hf_hub_download(
        repo_id="ai-hyz/MemoryAgentBench",
        repo_type="dataset",
        filename=parquet_file,
    )
    rows = pq.read_table(parquet_path).to_pylist()
    # Write next to the target and rename into place: an interrupted run
    # then leaves only a ".partial" file, never a truncated JSON that the
    # reuse checks would accept on the next invocation.
    partial_path = output_path.with_name(output_name + ".partial")
    with partial_path.open("w", encoding="utf-8") as handle:
        json.dump(rows, handle, ensure_ascii=False)
    os.replace(partial_path, output_path)
    print(f"[memoryagentbench] Wrote {output_name} ({len(rows)} samples)")
PY
  then
    echo "[memoryagentbench] ERROR: download or conversion failed" >&2
    return 1
  fi
  echo "[memoryagentbench] Downloaded to $dir"
}
612
+
156
613
  # ── Main ──
157
614
 
158
615
  check_deps
@@ -164,16 +621,24 @@ case "$BENCHMARK" in
164
621
  amemgym) download_amemgym ;;
165
622
  locomo) download_locomo ;;
166
623
  memory-arena) download_memory_arena ;;
624
+ beam) download_beam ;;
625
+ personamem) download_personamem ;;
626
+ membench) download_membench ;;
627
+ memoryagentbench) download_memoryagentbench ;;
167
628
  all)
168
629
  download_ama_bench
169
630
  download_longmemeval
170
631
  download_amemgym
171
632
  download_locomo
172
633
  download_memory_arena
634
+ download_beam
635
+ download_personamem
636
+ download_membench
637
+ download_memoryagentbench
173
638
  ;;
174
639
  *)
175
640
  echo "Unknown benchmark: $BENCHMARK"
176
- echo "Available: ama-bench, longmemeval, amemgym, locomo, memory-arena, all"
641
+ echo "Available: ama-bench, longmemeval, amemgym, locomo, memory-arena, beam, personamem, membench, memoryagentbench, all"
177
642
  exit 1
178
643
  ;;
179
644
  esac